universal_document_processor 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +115 -1
- data/lib/universal_document_processor/processors/archive_processor.rb +26 -0
- data/lib/universal_document_processor/version.rb +1 -1
- metadata +1 -6
- data/debug_test.rb +0 -35
- data/test_ai_dependency.rb +0 -80
- data/test_core_functionality.rb +0 -280
- data/test_performance_memory.rb +0 -271
- data/test_published_gem.rb +0 -349
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8444e9dc03cd125a0a9e62df6370b7dbba4adf4777d89478d1d51c60f5c83d70
|
4
|
+
data.tar.gz: 3a5fc000774c34683c7b0d95c0ca9a034838cd5e69be51c43c98792070b278aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be66ce6b411fcfa52eaf6353ef1f37785cbe996c8c977bb671c971988c43a500108004741b157ca7636d9e2f20991c43b44884134c3ffd4fde2f6b4a90d27380
|
7
|
+
data.tar.gz: 94e90e17615093e529db4100d674944150d0cbbcd527df3341b8c025ada3fe389dc8dbc64099b1cfa87473105c57038ca1fb5106281242b8658e583042cc52c9
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [1.1.0] - 2025-07-06
|
11
|
+
### Added
|
12
|
+
- **ZIP File Creation**: New functionality to create ZIP archives programmatically
|
13
|
+
- `ArchiveProcessor.create_zip()` class method for creating ZIP files
|
14
|
+
- Support for creating archives from individual files or entire directories
|
15
|
+
- Recursive directory archiving with proper path structure preservation
|
16
|
+
- Comprehensive test coverage with error handling
|
17
|
+
- Integration with existing archive processing capabilities
|
18
|
+
|
19
|
+
### Enhanced
|
20
|
+
- **ArchiveProcessor**: Extended with ZIP creation capabilities alongside existing extraction features
|
21
|
+
- **Archive Support**: Now supports both reading/extracting and creating ZIP archives
|
22
|
+
|
10
23
|
## [1.2.0] - 2024-01-15
|
11
24
|
### Added
|
12
25
|
- **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
|
data/README.md
CHANGED
@@ -29,6 +29,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
|
|
29
29
|
- **Table Detection**: Structured data extraction
|
30
30
|
- **Character Validation**: Invalid character detection and cleaning
|
31
31
|
- **Multi-language Support**: Full Unicode support including Japanese (日本語)
|
32
|
+
- **Archive Creation**: Create ZIP files from individual files or directories
|
32
33
|
|
33
34
|
### **Character & Encoding Support**
|
34
35
|
- **Smart encoding detection** (UTF-8, Shift_JIS, EUC-JP, ISO-8859-1)
|
@@ -110,7 +111,7 @@ result = UniversalDocumentProcessor.process('document.pdf')
|
|
110
111
|
},
|
111
112
|
images: [...],
|
112
113
|
tables: [...],
|
113
|
-
processed_at:
|
114
|
+
processed_at: 2025-07-06 10:30:00 UTC
|
114
115
|
}
|
115
116
|
```
|
116
117
|
|
@@ -421,6 +422,119 @@ agent = UniversalDocumentProcessor.create_ai_agent(
|
|
421
422
|
)
|
422
423
|
```
|
423
424
|
|
425
|
+
## 📦 Archive Processing (ZIP Creation & Extraction)
|
426
|
+
|
427
|
+
The gem provides comprehensive archive processing capabilities, including both extracting from existing archives and creating new ZIP files.
|
428
|
+
|
429
|
+
### Extracting from Archives
|
430
|
+
|
431
|
+
```ruby
|
432
|
+
# Extract text and metadata from ZIP archives
|
433
|
+
result = UniversalDocumentProcessor.process('archive.zip')
|
434
|
+
|
435
|
+
# Access archive-specific metadata
|
436
|
+
metadata = result[:metadata]
|
437
|
+
puts "Archive type: #{metadata[:archive_type]}" # => "zip"
|
438
|
+
puts "Total files: #{metadata[:total_files]}" # => 15
|
439
|
+
puts "Uncompressed size: #{metadata[:total_uncompressed_size]} bytes"
|
440
|
+
puts "Compression ratio: #{metadata[:compression_ratio]}%" # => 75%
|
441
|
+
puts "Directory structure: #{metadata[:directory_structure]}"
|
442
|
+
|
443
|
+
# Check for specific file types
|
444
|
+
puts "File types: #{metadata[:file_types]}" # => {"txt"=>5, "pdf"=>3, "jpg"=>7}
|
445
|
+
puts "Has executables: #{metadata[:has_executable_files]}" # => false
|
446
|
+
puts "Largest file: #{metadata[:largest_file][:path]} (#{metadata[:largest_file][:size]} bytes)"
|
447
|
+
|
448
|
+
# Extract text from text files within the archive
|
449
|
+
text_content = result[:text_content]
|
450
|
+
puts "Combined text from archive: #{text_content.length} characters"
|
451
|
+
```
|
452
|
+
|
453
|
+
### Creating ZIP Archives
|
454
|
+
|
455
|
+
```ruby
|
456
|
+
# Create ZIP from individual files
|
457
|
+
files_to_zip = ['document1.pdf', 'document2.txt', 'image.jpg']
|
458
|
+
output_zip = 'my_archive.zip'
|
459
|
+
|
460
|
+
zip_path = UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
461
|
+
output_zip,
|
462
|
+
files_to_zip
|
463
|
+
)
|
464
|
+
puts "ZIP created: #{zip_path}"
|
465
|
+
|
466
|
+
# Create ZIP from entire directory (preserves folder structure)
|
467
|
+
directory_to_zip = '/path/to/documents'
|
468
|
+
archive_path = UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
469
|
+
'directory_backup.zip',
|
470
|
+
directory_to_zip
|
471
|
+
)
|
472
|
+
puts "Directory archived: #{archive_path}"
|
473
|
+
|
474
|
+
# Working with temporary directories
|
475
|
+
require 'tmpdir'
|
476
|
+
|
477
|
+
Dir.mktmpdir do |tmpdir|
|
478
|
+
# Create some test files
|
479
|
+
File.write(File.join(tmpdir, 'file1.txt'), 'Hello from file 1')
|
480
|
+
File.write(File.join(tmpdir, 'file2.txt'), 'Hello from file 2')
|
481
|
+
|
482
|
+
# Create subdirectory with files
|
483
|
+
subdir = File.join(tmpdir, 'subfolder')
|
484
|
+
Dir.mkdir(subdir)
|
485
|
+
File.write(File.join(subdir, 'file3.txt'), 'Hello from subfolder')
|
486
|
+
|
487
|
+
# Archive the entire directory structure
|
488
|
+
zip_file = File.join(tmpdir, 'complete_backup.zip')
|
489
|
+
UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(zip_file, tmpdir)
|
490
|
+
|
491
|
+
puts "Archive size: #{File.size(zip_file)} bytes"
|
492
|
+
|
493
|
+
# Verify archive contents by processing it
|
494
|
+
archive_result = UniversalDocumentProcessor.process(zip_file)
|
495
|
+
puts "Files in archive: #{archive_result[:metadata][:total_files]}"
|
496
|
+
end
|
497
|
+
|
498
|
+
# Error handling for ZIP creation
|
499
|
+
begin
|
500
|
+
UniversalDocumentProcessor::Processors::ArchiveProcessor.create_zip(
|
501
|
+
'/invalid/path/archive.zip',
|
502
|
+
['file1.txt', 'file2.txt']
|
503
|
+
)
|
504
|
+
rescue => e
|
505
|
+
puts "Error creating ZIP: #{e.message}"
|
506
|
+
end
|
507
|
+
|
508
|
+
# Validate input before creating ZIP
|
509
|
+
files = ['doc1.pdf', 'doc2.txt']
|
510
|
+
files.each do |file|
|
511
|
+
unless File.exist?(file)
|
512
|
+
puts "Warning: #{file} does not exist"
|
513
|
+
end
|
514
|
+
end
|
515
|
+
```
|
516
|
+
|
517
|
+
### Archive Analysis
|
518
|
+
|
519
|
+
```ruby
|
520
|
+
# Analyze archive security and structure
|
521
|
+
result = UniversalDocumentProcessor.process('suspicious_archive.zip')
|
522
|
+
metadata = result[:metadata]
|
523
|
+
|
524
|
+
# Security analysis
|
525
|
+
if metadata[:has_executable_files]
|
526
|
+
puts "⚠️ Archive contains executable files"
|
527
|
+
end
|
528
|
+
|
529
|
+
# Directory structure analysis
|
530
|
+
structure = metadata[:directory_structure]
|
531
|
+
puts "Top-level directories: #{structure.keys.join(', ')}"
|
532
|
+
|
533
|
+
# File type distribution
|
534
|
+
file_types = metadata[:file_types]
|
535
|
+
puts "Most common file type: #{file_types.max_by{|k,v| v}}"
|
536
|
+
```
|
537
|
+
|
424
538
|
## 🎌 Japanese Filename Support
|
425
539
|
|
426
540
|
The gem provides comprehensive support for Japanese filenames across all operating systems:
|
@@ -91,6 +91,32 @@ module UniversalDocumentProcessor
|
|
91
91
|
super + [:list_files, :extract_file, :extract_all, :analyze_security]
|
92
92
|
end
|
93
93
|
|
94
|
+
# Create a ZIP archive at +output_zip_path+ from either a directory path
# (archived recursively, preserving the relative folder structure) or an
# array of individual file paths (stored flat, under their basenames).
#
# @param output_zip_path [String] destination path for the new .zip file
# @param files_or_directory [String, Array<String>] a directory to archive
#   recursively, or an explicit list of file paths
# @return [String] output_zip_path, returned for convenient chaining
# @raise [ArgumentError] when files_or_directory is neither a directory
#   path nor an Array
def self.create_zip(output_zip_path, files_or_directory)
  require 'zip' # rubyzip is only needed when archive creation is used

  if files_or_directory.is_a?(String) && File.directory?(files_or_directory)
    base_dir = files_or_directory
    # '**/*' walks the tree recursively; directory entries are skipped —
    # rubyzip derives parent folders implicitly from the entry names.
    files = Dir.glob(File.join(base_dir, '**', '*')).reject { |path| File.directory?(path) }
  elsif files_or_directory.is_a?(Array)
    base_dir = nil
    files = files_or_directory
  else
    raise ArgumentError, 'files_or_directory must be a directory path or an array of file paths'
  end

  Zip::File.open(output_zip_path, Zip::File::CREATE) do |zipfile|
    files.each do |file|
      # For directory input, store each entry relative to the base directory
      # (\A anchors at string start — ^ would match at any embedded newline);
      # for explicit file lists, store the bare basename.
      entry_name =
        if base_dir
          file.sub(/\A#{Regexp.escape(base_dir)}\/?/, '')
        else
          File.basename(file)
        end
      zipfile.add(entry_name, file)
    end
  end
  output_zip_path
end
|
119
|
+
|
94
120
|
private
|
95
121
|
|
96
122
|
def detect_archive_type
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: universal_document_processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vikas Patil
|
@@ -201,7 +201,6 @@ files:
|
|
201
201
|
- README.md
|
202
202
|
- Rakefile
|
203
203
|
- USER_GUIDE.md
|
204
|
-
- debug_test.rb
|
205
204
|
- lib/universal_document_processor.rb
|
206
205
|
- lib/universal_document_processor/ai_agent.rb
|
207
206
|
- lib/universal_document_processor/document.rb
|
@@ -217,10 +216,6 @@ files:
|
|
217
216
|
- lib/universal_document_processor/utils/file_detector.rb
|
218
217
|
- lib/universal_document_processor/utils/japanese_filename_handler.rb
|
219
218
|
- lib/universal_document_processor/version.rb
|
220
|
-
- test_ai_dependency.rb
|
221
|
-
- test_core_functionality.rb
|
222
|
-
- test_performance_memory.rb
|
223
|
-
- test_published_gem.rb
|
224
219
|
homepage: https://github.com/vpatil160/universal_document_processor
|
225
220
|
licenses:
|
226
221
|
- MIT
|
data/debug_test.rb
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Add lib directory to load path
|
4
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
5
|
-
|
6
|
-
# Load the gem
|
7
|
-
require 'universal_document_processor'
|
8
|
-
require 'tempfile'
|
9
|
-
|
10
|
-
# Create a simple text file
|
11
|
-
txt_file = Tempfile.new(['test', '.txt'])
|
12
|
-
txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
|
13
|
-
txt_file.close
|
14
|
-
|
15
|
-
puts "Testing text file: #{txt_file.path}"
|
16
|
-
|
17
|
-
begin
|
18
|
-
puts "Processing file..."
|
19
|
-
result = UniversalDocumentProcessor.process(txt_file.path)
|
20
|
-
|
21
|
-
puts "Result keys: #{result.keys}"
|
22
|
-
puts "Result type: #{result.class}"
|
23
|
-
|
24
|
-
if result.is_a?(Hash)
|
25
|
-
result.each do |key, value|
|
26
|
-
puts "#{key}: #{value.class} - #{value.to_s[0..100]}..."
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
rescue => e
|
31
|
-
puts "Error: #{e.class} - #{e.message}"
|
32
|
-
puts e.backtrace.first(5)
|
33
|
-
end
|
34
|
-
|
35
|
-
txt_file.unlink
|
data/test_ai_dependency.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Add lib directory to load path
|
4
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
5
|
-
|
6
|
-
# Load the gem
|
7
|
-
require 'universal_document_processor'
|
8
|
-
|
9
|
-
puts "Testing AI Dependency Handling"
|
10
|
-
puts "=" * 50
|
11
|
-
|
12
|
-
# Test 1: Check AI availability without API key
|
13
|
-
puts "\n1. Testing AI availability without API key:"
|
14
|
-
ai_available = UniversalDocumentProcessor.ai_available?
|
15
|
-
puts " AI Available: #{ai_available}"
|
16
|
-
|
17
|
-
# Test 2: Create AI agent without API key
|
18
|
-
puts "\n2. Creating AI agent without API key:"
|
19
|
-
agent = UniversalDocumentProcessor.create_ai_agent
|
20
|
-
puts " Agent created: #{agent.class}"
|
21
|
-
puts " AI enabled: #{agent.ai_enabled}"
|
22
|
-
puts " AI available: #{agent.ai_available?}"
|
23
|
-
|
24
|
-
# Test 3: Try to use AI methods without API key
|
25
|
-
puts "\n3. Testing AI methods without API key:"
|
26
|
-
|
27
|
-
# Create a sample text file
|
28
|
-
require 'tempfile'
|
29
|
-
sample_file = Tempfile.new(['test', '.txt'])
|
30
|
-
sample_file.write("This is a test document for AI processing.")
|
31
|
-
sample_file.close
|
32
|
-
|
33
|
-
begin
|
34
|
-
result = UniversalDocumentProcessor.ai_analyze(sample_file.path)
|
35
|
-
puts " ERROR: Should have raised an exception!"
|
36
|
-
rescue UniversalDocumentProcessor::DependencyMissingError => e
|
37
|
-
puts " ✓ Correctly raised DependencyMissingError: #{e.message}"
|
38
|
-
rescue => e
|
39
|
-
puts " ✗ Unexpected error: #{e.class} - #{e.message}"
|
40
|
-
end
|
41
|
-
|
42
|
-
# Test 4: Check available features
|
43
|
-
puts "\n4. Available features:"
|
44
|
-
features = UniversalDocumentProcessor.available_features
|
45
|
-
puts " Features: #{features.join(', ')}"
|
46
|
-
puts " AI processing included: #{features.include?(:ai_processing)}"
|
47
|
-
|
48
|
-
# Test 5: Check optional dependencies
|
49
|
-
puts "\n5. Optional dependencies:"
|
50
|
-
optional_deps = UniversalDocumentProcessor.optional_dependencies
|
51
|
-
puts " Optional dependencies: #{optional_deps.keys.join(', ')}"
|
52
|
-
|
53
|
-
missing_deps = UniversalDocumentProcessor.missing_dependencies
|
54
|
-
puts " Missing dependencies: #{missing_deps.join(', ')}"
|
55
|
-
|
56
|
-
# Test 6: Installation instructions
|
57
|
-
puts "\n6. Installation instructions:"
|
58
|
-
instructions = UniversalDocumentProcessor.installation_instructions
|
59
|
-
puts instructions
|
60
|
-
|
61
|
-
# Test 7: Test with API key if provided
|
62
|
-
if ENV['OPENAI_API_KEY'] && !ENV['OPENAI_API_KEY'].empty?
|
63
|
-
puts "\n7. Testing with API key:"
|
64
|
-
ai_available_with_key = UniversalDocumentProcessor.ai_available?
|
65
|
-
puts " AI Available with key: #{ai_available_with_key}"
|
66
|
-
|
67
|
-
agent_with_key = UniversalDocumentProcessor.create_ai_agent
|
68
|
-
puts " Agent AI enabled: #{agent_with_key.ai_enabled}"
|
69
|
-
else
|
70
|
-
puts "\n7. Skipping API key test (OPENAI_API_KEY not set)"
|
71
|
-
end
|
72
|
-
|
73
|
-
# Clean up
|
74
|
-
sample_file.unlink
|
75
|
-
|
76
|
-
puts "\n" + "=" * 50
|
77
|
-
puts "AI Dependency Test Complete!"
|
78
|
-
puts "✓ AI features are properly optional"
|
79
|
-
puts "✓ Clear error messages when dependencies missing"
|
80
|
-
puts "✓ Graceful degradation when features unavailable"
|
data/test_core_functionality.rb
DELETED
@@ -1,280 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Add lib directory to load path
|
4
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
5
|
-
|
6
|
-
# Load the gem
|
7
|
-
require 'universal_document_processor'
|
8
|
-
require 'tempfile'
|
9
|
-
|
10
|
-
puts "Testing Core Functionality"
|
11
|
-
puts "=" * 50
|
12
|
-
|
13
|
-
test_count = 0
|
14
|
-
passed_count = 0
|
15
|
-
|
16
|
-
def test(description)
|
17
|
-
global_test_count = caller_locations.first.lineno
|
18
|
-
print "#{global_test_count}. #{description}... "
|
19
|
-
|
20
|
-
begin
|
21
|
-
yield
|
22
|
-
puts "✓ PASS"
|
23
|
-
return true
|
24
|
-
rescue => e
|
25
|
-
puts "✗ FAIL: #{e.message}"
|
26
|
-
puts " #{e.backtrace.first}" if ENV['DEBUG']
|
27
|
-
return false
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
# Create sample files for testing
|
32
|
-
puts "\nCreating sample files..."
|
33
|
-
|
34
|
-
# Text file
|
35
|
-
txt_file = Tempfile.new(['test', '.txt'])
|
36
|
-
txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
|
37
|
-
txt_file.close
|
38
|
-
|
39
|
-
# CSV file
|
40
|
-
csv_file = Tempfile.new(['test', '.csv'])
|
41
|
-
csv_file.write("Name,Age,City\nJohn,25,New York\nJane,30,Los Angeles\nBob,35,Chicago")
|
42
|
-
csv_file.close
|
43
|
-
|
44
|
-
# TSV file
|
45
|
-
tsv_file = Tempfile.new(['test', '.tsv'])
|
46
|
-
tsv_file.write("Name\tAge\tCity\nJohn\t25\tNew York\nJane\t30\tLos Angeles\nBob\t35\tChicago")
|
47
|
-
tsv_file.close
|
48
|
-
|
49
|
-
# JSON file
|
50
|
-
json_file = Tempfile.new(['test', '.json'])
|
51
|
-
json_file.write('{"name": "Test Document", "type": "sample", "data": [1, 2, 3, 4, 5]}')
|
52
|
-
json_file.close
|
53
|
-
|
54
|
-
# XML file
|
55
|
-
xml_file = Tempfile.new(['test', '.xml'])
|
56
|
-
xml_file.write(<<~XML)
|
57
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
58
|
-
<document>
|
59
|
-
<title>Sample XML Document</title>
|
60
|
-
<content>This is a sample XML file for testing.</content>
|
61
|
-
</document>
|
62
|
-
XML
|
63
|
-
xml_file.close
|
64
|
-
|
65
|
-
puts "Sample files created successfully!"
|
66
|
-
|
67
|
-
# Run tests
|
68
|
-
puts "\nRunning Core Tests:"
|
69
|
-
puts "-" * 30
|
70
|
-
|
71
|
-
# Test 1: Version number
|
72
|
-
test_count += 1
|
73
|
-
passed = test("Version number is defined") do
|
74
|
-
version = UniversalDocumentProcessor::VERSION
|
75
|
-
raise "Version is nil" if version.nil?
|
76
|
-
raise "Version format invalid" unless version.match?(/\d+\.\d+\.\d+/)
|
77
|
-
end
|
78
|
-
passed_count += 1 if passed
|
79
|
-
|
80
|
-
# Test 2: Text file processing
|
81
|
-
test_count += 1
|
82
|
-
passed = test("Text file processing") do
|
83
|
-
result = UniversalDocumentProcessor.process(txt_file.path)
|
84
|
-
raise "Result is not a hash" unless result.is_a?(Hash)
|
85
|
-
raise "Missing text key" unless result.has_key?(:text)
|
86
|
-
raise "Missing metadata key" unless result.has_key?(:metadata)
|
87
|
-
raise "Text content incorrect" unless result[:text].include?("sample text file")
|
88
|
-
raise "Format incorrect" unless result[:metadata][:format] == "txt"
|
89
|
-
end
|
90
|
-
passed_count += 1 if passed
|
91
|
-
|
92
|
-
# Test 3: Text extraction
|
93
|
-
test_count += 1
|
94
|
-
passed = test("Text extraction method") do
|
95
|
-
text = UniversalDocumentProcessor.extract_text(txt_file.path)
|
96
|
-
raise "Text is not a string" unless text.is_a?(String)
|
97
|
-
raise "Text content missing" unless text.include?("sample text file")
|
98
|
-
end
|
99
|
-
passed_count += 1 if passed
|
100
|
-
|
101
|
-
# Test 4: Metadata extraction
|
102
|
-
test_count += 1
|
103
|
-
passed = test("Metadata extraction") do
|
104
|
-
metadata = UniversalDocumentProcessor.get_metadata(txt_file.path)
|
105
|
-
raise "Metadata is not a hash" unless metadata.is_a?(Hash)
|
106
|
-
raise "Format missing" unless metadata[:format] == "txt"
|
107
|
-
raise "File size missing" unless metadata[:file_size] > 0
|
108
|
-
end
|
109
|
-
passed_count += 1 if passed
|
110
|
-
|
111
|
-
# Test 5: CSV processing
|
112
|
-
test_count += 1
|
113
|
-
passed = test("CSV file processing") do
|
114
|
-
result = UniversalDocumentProcessor.process(csv_file.path)
|
115
|
-
raise "Result is not a hash" unless result.is_a?(Hash)
|
116
|
-
raise "Missing tables key" unless result.has_key?(:tables)
|
117
|
-
raise "Format incorrect" unless result[:metadata][:format] == "csv"
|
118
|
-
raise "Delimiter incorrect" unless result[:metadata][:delimiter] == "comma"
|
119
|
-
raise "No tables found" unless result[:tables].length > 0
|
120
|
-
end
|
121
|
-
passed_count += 1 if passed
|
122
|
-
|
123
|
-
# Test 6: TSV processing
|
124
|
-
test_count += 1
|
125
|
-
passed = test("TSV file processing") do
|
126
|
-
result = UniversalDocumentProcessor.process(tsv_file.path)
|
127
|
-
raise "Result is not a hash" unless result.is_a?(Hash)
|
128
|
-
raise "Missing tables key" unless result.has_key?(:tables)
|
129
|
-
raise "Format incorrect" unless result[:metadata][:format] == "tsv"
|
130
|
-
raise "Delimiter incorrect" unless result[:metadata][:delimiter] == "tab"
|
131
|
-
raise "No tables found" unless result[:tables].length > 0
|
132
|
-
end
|
133
|
-
passed_count += 1 if passed
|
134
|
-
|
135
|
-
# Test 7: JSON processing
|
136
|
-
test_count += 1
|
137
|
-
passed = test("JSON file processing") do
|
138
|
-
result = UniversalDocumentProcessor.process(json_file.path)
|
139
|
-
raise "Result is not a hash" unless result.is_a?(Hash)
|
140
|
-
raise "Format incorrect" unless result[:metadata][:format] == "json"
|
141
|
-
raise "Text missing" unless result[:text].include?("Test Document")
|
142
|
-
end
|
143
|
-
passed_count += 1 if passed
|
144
|
-
|
145
|
-
# Test 8: XML processing
|
146
|
-
test_count += 1
|
147
|
-
passed = test("XML file processing") do
|
148
|
-
result = UniversalDocumentProcessor.process(xml_file.path)
|
149
|
-
raise "Result is not a hash" unless result.is_a?(Hash)
|
150
|
-
raise "Format incorrect" unless result[:metadata][:format] == "xml"
|
151
|
-
raise "Text missing" unless result[:text].include?("Sample XML Document")
|
152
|
-
end
|
153
|
-
passed_count += 1 if passed
|
154
|
-
|
155
|
-
# Test 9: Batch processing
|
156
|
-
test_count += 1
|
157
|
-
passed = test("Batch processing") do
|
158
|
-
files = [txt_file.path, csv_file.path, json_file.path]
|
159
|
-
results = UniversalDocumentProcessor.batch_process(files)
|
160
|
-
raise "Results not array" unless results.is_a?(Array)
|
161
|
-
raise "Wrong number of results" unless results.length == 3
|
162
|
-
results.each do |result|
|
163
|
-
raise "Missing text or error key" unless result.has_key?(:text) || result.has_key?(:error)
|
164
|
-
end
|
165
|
-
end
|
166
|
-
passed_count += 1 if passed
|
167
|
-
|
168
|
-
# Test 10: Available features
|
169
|
-
test_count += 1
|
170
|
-
passed = test("Available features check") do
|
171
|
-
features = UniversalDocumentProcessor.available_features
|
172
|
-
raise "Features not array" unless features.is_a?(Array)
|
173
|
-
raise "Missing text processing" unless features.include?(:text_processing)
|
174
|
-
raise "Missing CSV processing" unless features.include?(:csv_processing)
|
175
|
-
raise "Missing TSV processing" unless features.include?(:tsv_processing)
|
176
|
-
end
|
177
|
-
passed_count += 1 if passed
|
178
|
-
|
179
|
-
# Test 11: Dependency checking
|
180
|
-
test_count += 1
|
181
|
-
passed = test("Dependency availability check") do
|
182
|
-
# These may or may not be available, just test the method works
|
183
|
-
pdf_available = UniversalDocumentProcessor.dependency_available?(:pdf_reader)
|
184
|
-
raise "Dependency check failed" unless [true, false].include?(pdf_available)
|
185
|
-
end
|
186
|
-
passed_count += 1 if passed
|
187
|
-
|
188
|
-
# Test 12: Text quality analysis
|
189
|
-
test_count += 1
|
190
|
-
passed = test("Text quality analysis") do
|
191
|
-
analysis = UniversalDocumentProcessor.analyze_text_quality("Clean text")
|
192
|
-
raise "Analysis not hash" unless analysis.is_a?(Hash)
|
193
|
-
raise "Missing valid_characters" unless analysis.has_key?(:valid_characters)
|
194
|
-
raise "Missing invalid_characters" unless analysis.has_key?(:invalid_characters)
|
195
|
-
end
|
196
|
-
passed_count += 1 if passed
|
197
|
-
|
198
|
-
# Test 13: Text cleaning
|
199
|
-
test_count += 1
|
200
|
-
passed = test("Text cleaning") do
|
201
|
-
dirty_text = "Clean\x00text"
|
202
|
-
clean_text = UniversalDocumentProcessor.clean_text(dirty_text)
|
203
|
-
raise "Cleaning failed" if clean_text.include?("\x00")
|
204
|
-
end
|
205
|
-
passed_count += 1 if passed
|
206
|
-
|
207
|
-
# Test 14: Japanese text detection
|
208
|
-
test_count += 1
|
209
|
-
passed = test("Japanese text detection") do
|
210
|
-
english = "This is English"
|
211
|
-
japanese = "これは日本語"
|
212
|
-
raise "English detected as Japanese" if UniversalDocumentProcessor.japanese_text?(english)
|
213
|
-
raise "Japanese not detected" unless UniversalDocumentProcessor.japanese_text?(japanese)
|
214
|
-
end
|
215
|
-
passed_count += 1 if passed
|
216
|
-
|
217
|
-
# Test 15: Optional dependencies info
|
218
|
-
test_count += 1
|
219
|
-
passed = test("Optional dependencies information") do
|
220
|
-
optional_deps = UniversalDocumentProcessor.optional_dependencies
|
221
|
-
raise "Optional deps not hash" unless optional_deps.is_a?(Hash)
|
222
|
-
raise "Missing pdf-reader" unless optional_deps.has_key?('pdf-reader')
|
223
|
-
|
224
|
-
missing_deps = UniversalDocumentProcessor.missing_dependencies
|
225
|
-
raise "Missing deps not array" unless missing_deps.is_a?(Array)
|
226
|
-
|
227
|
-
instructions = UniversalDocumentProcessor.installation_instructions
|
228
|
-
raise "Instructions not string" unless instructions.is_a?(String)
|
229
|
-
end
|
230
|
-
passed_count += 1 if passed
|
231
|
-
|
232
|
-
# Test 16: AI availability check (should be false without API key)
|
233
|
-
test_count += 1
|
234
|
-
passed = test("AI availability check") do
|
235
|
-
ai_available = UniversalDocumentProcessor.ai_available?
|
236
|
-
raise "AI should not be available without key" if ai_available
|
237
|
-
end
|
238
|
-
passed_count += 1 if passed
|
239
|
-
|
240
|
-
# Test 17: Error handling for unsupported format
|
241
|
-
test_count += 1
|
242
|
-
passed = test("Error handling for unsupported format") do
|
243
|
-
unsupported_file = Tempfile.new(['test', '.unknown'])
|
244
|
-
unsupported_file.write("test content")
|
245
|
-
unsupported_file.close
|
246
|
-
|
247
|
-
begin
|
248
|
-
UniversalDocumentProcessor.process(unsupported_file.path)
|
249
|
-
raise "Should have raised UnsupportedFormatError"
|
250
|
-
rescue UniversalDocumentProcessor::UnsupportedFormatError
|
251
|
-
# Expected error
|
252
|
-
rescue => e
|
253
|
-
raise "Wrong error type: #{e.class}"
|
254
|
-
ensure
|
255
|
-
unsupported_file.unlink
|
256
|
-
end
|
257
|
-
end
|
258
|
-
passed_count += 1 if passed
|
259
|
-
|
260
|
-
# Clean up
|
261
|
-
puts "\nCleaning up temporary files..."
|
262
|
-
[txt_file, csv_file, tsv_file, json_file, xml_file].each do |file|
|
263
|
-
file.unlink if File.exist?(file.path)
|
264
|
-
end
|
265
|
-
|
266
|
-
# Results
|
267
|
-
puts "\n" + "=" * 50
|
268
|
-
puts "Test Results:"
|
269
|
-
puts " Total tests: #{test_count}"
|
270
|
-
puts " Passed: #{passed_count}"
|
271
|
-
puts " Failed: #{test_count - passed_count}"
|
272
|
-
puts " Success rate: #{((passed_count.to_f / test_count) * 100).round(1)}%"
|
273
|
-
|
274
|
-
if passed_count == test_count
|
275
|
-
puts "\n🎉 All tests passed! Core functionality is working correctly."
|
276
|
-
exit 0
|
277
|
-
else
|
278
|
-
puts "\n❌ Some tests failed. Please check the issues above."
|
279
|
-
exit 1
|
280
|
-
end
|
data/test_performance_memory.rb
DELETED
@@ -1,271 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Performance and Memory Usage Analysis for Universal Document Processor
|
4
|
-
# This test checks if we need to add performance guidelines and memory usage documentation
|
5
|
-
|
6
|
-
puts "🚀 Performance & Memory Analysis - Universal Document Processor"
|
7
|
-
puts "=" * 70
|
8
|
-
|
9
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
10
|
-
require 'universal_document_processor'
|
11
|
-
require 'tempfile'
|
12
|
-
require 'benchmark'
|
13
|
-
|
14
|
-
# Helper to get memory usage (Windows-specific)
|
15
|
-
def get_memory_usage
|
16
|
-
begin
|
17
|
-
result = `tasklist /FI "PID eq #{Process.pid}" /FO CSV 2>nul`
|
18
|
-
if result && !result.empty?
|
19
|
-
lines = result.split("\n")
|
20
|
-
if lines.length > 1
|
21
|
-
memory_str = lines[1].split(",")[4].gsub('"', '').gsub(',', '')
|
22
|
-
return memory_str.to_i # KB
|
23
|
-
end
|
24
|
-
end
|
25
|
-
rescue
|
26
|
-
# Fallback for non-Windows or error cases
|
27
|
-
end
|
28
|
-
return 0
|
29
|
-
end
|
30
|
-
|
31
|
-
def format_memory(kb)
|
32
|
-
if kb > 1024
|
33
|
-
"#{(kb / 1024.0).round(1)} MB"
|
34
|
-
else
|
35
|
-
"#{kb} KB"
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def create_test_file(size_description, content_generator)
|
40
|
-
file = Tempfile.new(['perf_test', '.txt'])
|
41
|
-
content = content_generator.call
|
42
|
-
file.write(content)
|
43
|
-
file.close
|
44
|
-
|
45
|
-
actual_size = File.size(file.path)
|
46
|
-
puts " 📁 Created #{size_description}: #{format_memory(actual_size / 1024)} (#{file.path})"
|
47
|
-
|
48
|
-
return file, actual_size
|
49
|
-
end
|
50
|
-
|
51
|
-
issues_found = []
|
52
|
-
performance_concerns = []
|
53
|
-
|
54
|
-
puts "\n📊 PERFORMANCE TESTING"
|
55
|
-
puts "-" * 50
|
56
|
-
|
57
|
-
# Test 1: Small file performance (baseline)
|
58
|
-
puts "\n1️⃣ Small File Performance (Baseline)"
|
59
|
-
small_file, small_size = create_test_file("small file", -> { "Hello World!\n" * 100 })
|
60
|
-
|
61
|
-
start_memory = get_memory_usage
|
62
|
-
time_taken = Benchmark.realtime do
|
63
|
-
result = UniversalDocumentProcessor.process(small_file.path)
|
64
|
-
end
|
65
|
-
end_memory = get_memory_usage
|
66
|
-
|
67
|
-
puts " ⏱️ Processing time: #{(time_taken * 1000).round(2)} ms"
|
68
|
-
puts " 🧠 Memory change: #{format_memory(end_memory - start_memory)}"
|
69
|
-
|
70
|
-
small_file.unlink
|
71
|
-
baseline_time = time_taken
|
72
|
-
|
73
|
-
# Test 2: Medium file performance
|
74
|
-
puts "\n2️⃣ Medium File Performance (1MB)"
|
75
|
-
medium_file, medium_size = create_test_file("medium file", -> { "This is a test line with some content.\n" * 25000 })
|
76
|
-
|
77
|
-
start_memory = get_memory_usage
|
78
|
-
time_taken = Benchmark.realtime do
|
79
|
-
result = UniversalDocumentProcessor.process(medium_file.path)
|
80
|
-
end
|
81
|
-
end_memory = get_memory_usage
|
82
|
-
|
83
|
-
puts " ⏱️ Processing time: #{(time_taken * 1000).round(2)} ms"
|
84
|
-
puts " 🧠 Memory change: #{format_memory(end_memory - start_memory)}"
|
85
|
-
puts " 📈 Speed ratio: #{(time_taken / baseline_time).round(1)}x slower than baseline"
|
86
|
-
|
87
|
-
if time_taken > 2.0
|
88
|
-
performance_concerns << "Medium files (1MB) take #{time_taken.round(2)} seconds to process"
|
89
|
-
end
|
90
|
-
|
91
|
-
medium_file.unlink
|
92
|
-
|
93
|
-
# Test 3: Large file performance
|
94
|
-
puts "\n3️⃣ Large File Performance (5MB)"
|
95
|
-
large_file, large_size = create_test_file("large file", -> { "This is a longer test line with more content to simulate real documents.\n" * 75000 })
|
96
|
-
|
97
|
-
start_memory = get_memory_usage
|
98
|
-
time_taken = Benchmark.realtime do
|
99
|
-
result = UniversalDocumentProcessor.process(large_file.path)
|
100
|
-
end
|
101
|
-
end_memory = get_memory_usage
|
102
|
-
|
103
|
-
puts " ⏱️ Processing time: #{(time_taken * 1000).round(2)} ms"
|
104
|
-
puts " 🧠 Memory change: #{format_memory(end_memory - start_memory)}"
|
105
|
-
puts " 📈 Speed ratio: #{(time_taken / baseline_time).round(1)}x slower than baseline"
|
106
|
-
|
107
|
-
if time_taken > 10.0
|
108
|
-
performance_concerns << "Large files (5MB) take #{time_taken.round(2)} seconds to process"
|
109
|
-
end
|
110
|
-
|
111
|
-
if (end_memory - start_memory) > 100000 # 100MB
|
112
|
-
performance_concerns << "Large files use #{format_memory(end_memory - start_memory)} of memory"
|
113
|
-
end
|
114
|
-
|
115
|
-
large_file.unlink
|
116
|
-
|
117
|
-
puts "\n💾 MEMORY USAGE TESTING"
|
118
|
-
puts "-" * 50
|
119
|
-
|
120
|
-
# Test 4: Memory usage with multiple files
|
121
|
-
puts "\n4️⃣ Batch Processing Memory Test"
|
122
|
-
files = []
|
123
|
-
file_sizes = []
|
124
|
-
|
125
|
-
5.times do |i|
|
126
|
-
file, size = create_test_file("batch file #{i+1}", -> { "Batch processing test content line #{i}.\n" * 5000 })
|
127
|
-
files << file.path
|
128
|
-
file_sizes << size
|
129
|
-
end
|
130
|
-
|
131
|
-
total_file_size = file_sizes.sum
|
132
|
-
puts " 📦 Total file size: #{format_memory(total_file_size / 1024)}"
|
133
|
-
|
134
|
-
start_memory = get_memory_usage
|
135
|
-
time_taken = Benchmark.realtime do
|
136
|
-
results = UniversalDocumentProcessor.batch_process(files)
|
137
|
-
end
|
138
|
-
end_memory = get_memory_usage
|
139
|
-
|
140
|
-
memory_used = end_memory - start_memory
|
141
|
-
puts " ⏱️ Batch processing time: #{(time_taken * 1000).round(2)} ms"
|
142
|
-
puts " 🧠 Memory used: #{format_memory(memory_used)}"
|
143
|
-
puts " 📊 Memory efficiency: #{(memory_used.to_f / (total_file_size / 1024)).round(2)}x file size"
|
144
|
-
|
145
|
-
if memory_used > (total_file_size / 1024) * 3 # More than 3x file size
|
146
|
-
performance_concerns << "Batch processing uses #{(memory_used.to_f / (total_file_size / 1024)).round(1)}x the file size in memory"
|
147
|
-
end
|
148
|
-
|
149
|
-
# Cleanup
|
150
|
-
files.each { |f| File.delete(f) if File.exist?(f) }
|
151
|
-
|
152
|
-
# Test 5: CSV/TSV processing performance
|
153
|
-
puts "\n5️⃣ Structured Data Processing Performance"
|
154
|
-
|
155
|
-
# Large CSV test
|
156
|
-
csv_content = "Name,Age,Email,Department,Salary,Location,Phone\n"
|
157
|
-
csv_content += 10000.times.map { |i| "User#{i},#{20+i%50},user#{i}@example.com,Dept#{i%10},#{30000+i*10},City#{i%100},555-#{i.to_s.rjust(4, '0')}" }.join("\n")
|
158
|
-
|
159
|
-
csv_file = Tempfile.new(['large', '.csv'])
|
160
|
-
csv_file.write(csv_content)
|
161
|
-
csv_file.close
|
162
|
-
|
163
|
-
csv_size = File.size(csv_file.path)
|
164
|
-
puts " 📊 Large CSV size: #{format_memory(csv_size / 1024)}"
|
165
|
-
|
166
|
-
start_memory = get_memory_usage
|
167
|
-
time_taken = Benchmark.realtime do
|
168
|
-
result = UniversalDocumentProcessor.process(csv_file.path)
|
169
|
-
end
|
170
|
-
end_memory = get_memory_usage
|
171
|
-
|
172
|
-
puts " ⏱️ CSV processing time: #{(time_taken * 1000).round(2)} ms"
|
173
|
-
puts " 🧠 Memory change: #{format_memory(end_memory - start_memory)}"
|
174
|
-
|
175
|
-
if time_taken > 5.0
|
176
|
-
performance_concerns << "Large CSV files (#{format_memory(csv_size / 1024)}) take #{time_taken.round(2)} seconds"
|
177
|
-
end
|
178
|
-
|
179
|
-
csv_file.unlink
|
180
|
-
|
181
|
-
# Test 6: Unicode content performance
|
182
|
-
puts "\n6️⃣ Unicode Content Performance"
|
183
|
-
unicode_content = "これは日本語のテストです。🌟 This includes emoji and special characters: áéíóú, ñ, ç, ü\n" * 5000
|
184
|
-
|
185
|
-
unicode_file = Tempfile.new(['unicode', '.txt'])
|
186
|
-
unicode_file.write(unicode_content)
|
187
|
-
unicode_file.close
|
188
|
-
|
189
|
-
start_memory = get_memory_usage
|
190
|
-
time_taken = Benchmark.realtime do
|
191
|
-
result = UniversalDocumentProcessor.process(unicode_file.path)
|
192
|
-
end
|
193
|
-
end_memory = get_memory_usage
|
194
|
-
|
195
|
-
puts " ⏱️ Unicode processing time: #{(time_taken * 1000).round(2)} ms"
|
196
|
-
puts " 🧠 Memory change: #{format_memory(end_memory - start_memory)}"
|
197
|
-
|
198
|
-
unicode_file.unlink
|
199
|
-
|
200
|
-
puts "\n" + "=" * 70
|
201
|
-
puts "🎯 PERFORMANCE & MEMORY ANALYSIS RESULTS"
|
202
|
-
puts "=" * 70
|
203
|
-
|
204
|
-
puts "\n📈 PERFORMANCE CONCERNS FOUND:"
|
205
|
-
if performance_concerns.empty?
|
206
|
-
puts "✅ No significant performance issues detected!"
|
207
|
-
puts " The gem performs well within reasonable limits."
|
208
|
-
else
|
209
|
-
performance_concerns.each_with_index do |concern, i|
|
210
|
-
puts "⚠️ #{i + 1}. #{concern}"
|
211
|
-
end
|
212
|
-
end
|
213
|
-
|
214
|
-
puts "\n📚 DOCUMENTATION RECOMMENDATIONS:"
|
215
|
-
|
216
|
-
puts "\n4️⃣ Performance Guidelines Needed:"
|
217
|
-
guidelines_needed = []
|
218
|
-
|
219
|
-
if performance_concerns.any? { |c| c.include?("seconds") }
|
220
|
-
guidelines_needed << "Processing time expectations for different file sizes"
|
221
|
-
guidelines_needed << "Recommended file size limits for real-time processing"
|
222
|
-
end
|
223
|
-
|
224
|
-
if performance_concerns.any? { |c| c.include?("memory") }
|
225
|
-
guidelines_needed << "Memory usage patterns and optimization tips"
|
226
|
-
guidelines_needed << "Best practices for batch processing large files"
|
227
|
-
end
|
228
|
-
|
229
|
-
guidelines_needed << "Performance comparison between different file formats"
|
230
|
-
guidelines_needed << "Optimization tips for production environments"
|
231
|
-
|
232
|
-
if guidelines_needed.any?
|
233
|
-
puts "📋 Suggested documentation additions:"
|
234
|
-
guidelines_needed.each_with_index do |guideline, i|
|
235
|
-
puts " #{i + 1}. #{guideline}"
|
236
|
-
end
|
237
|
-
else
|
238
|
-
puts "✅ Current performance is good - minimal documentation needed"
|
239
|
-
end
|
240
|
-
|
241
|
-
puts "\n5️⃣ Memory Usage Documentation Needed:"
|
242
|
-
memory_docs_needed = []
|
243
|
-
|
244
|
-
memory_docs_needed << "Expected memory usage patterns (typically 2-3x file size)"
|
245
|
-
memory_docs_needed << "Memory-efficient processing tips for large files"
|
246
|
-
memory_docs_needed << "Batch processing memory considerations"
|
247
|
-
memory_docs_needed << "When to process files individually vs. in batches"
|
248
|
-
|
249
|
-
puts "📋 Suggested memory usage documentation:"
|
250
|
-
memory_docs_needed.each_with_index do |doc, i|
|
251
|
-
puts " #{i + 1}. #{doc}"
|
252
|
-
end
|
253
|
-
|
254
|
-
puts "\n💡 SPECIFIC RECOMMENDATIONS:"
|
255
|
-
puts "1. Add a PERFORMANCE.md file with benchmarks and guidelines"
|
256
|
-
puts "2. Include memory usage examples in README"
|
257
|
-
puts "3. Add performance tips to method documentation"
|
258
|
-
puts "4. Consider adding a performance_info method to the gem"
|
259
|
-
puts "5. Document recommended file size limits for different use cases"
|
260
|
-
|
261
|
-
puts "\n🎯 CONCLUSION:"
|
262
|
-
if performance_concerns.length > 2
|
263
|
-
puts "❌ Performance documentation is NEEDED - several concerns found"
|
264
|
-
exit 1
|
265
|
-
elsif performance_concerns.length > 0
|
266
|
-
puts "⚠️ Performance documentation would be HELPFUL - some concerns found"
|
267
|
-
exit 2
|
268
|
-
else
|
269
|
-
puts "✅ Performance is good, but documentation would still be valuable for users"
|
270
|
-
exit 0
|
271
|
-
end
|
data/test_published_gem.rb
DELETED
@@ -1,349 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Test script to check for potential issues with the published gem
|
4
|
-
# This simulates real-world usage scenarios
|
5
|
-
|
6
|
-
puts "🔍 Testing Universal Document Processor v1.0.3 for Potential Issues"
|
7
|
-
puts "=" * 70
|
8
|
-
|
9
|
-
# Add lib directory to load path for local testing
|
10
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
11
|
-
|
12
|
-
require 'universal_document_processor'
|
13
|
-
require 'tempfile'
|
14
|
-
|
15
|
-
test_count = 0
|
16
|
-
issue_count = 0
|
17
|
-
warnings = []
|
18
|
-
|
19
|
-
def test_issue(description)
|
20
|
-
global_test_count = caller_locations.first.lineno
|
21
|
-
print "#{global_test_count}. #{description}... "
|
22
|
-
|
23
|
-
begin
|
24
|
-
yield
|
25
|
-
puts "✅ OK"
|
26
|
-
return false # No issue
|
27
|
-
rescue => e
|
28
|
-
puts "❌ ISSUE: #{e.message}"
|
29
|
-
puts " #{e.backtrace.first}" if ENV['DEBUG']
|
30
|
-
return true # Issue found
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def check_warning(description)
|
35
|
-
print "⚠️ #{description}... "
|
36
|
-
begin
|
37
|
-
result = yield
|
38
|
-
if result
|
39
|
-
puts "FOUND"
|
40
|
-
return result
|
41
|
-
else
|
42
|
-
puts "OK"
|
43
|
-
return nil
|
44
|
-
end
|
45
|
-
rescue => e
|
46
|
-
puts "ERROR: #{e.message}"
|
47
|
-
return e.message
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
puts "\n🧪 Testing Core Functionality Issues"
|
52
|
-
puts "-" * 40
|
53
|
-
|
54
|
-
# Test 1: Basic gem loading
|
55
|
-
test_count += 1
|
56
|
-
issue_found = test_issue("Gem loads without errors") do
|
57
|
-
# Just loading the gem should work
|
58
|
-
raise "VERSION not defined" unless defined?(UniversalDocumentProcessor::VERSION)
|
59
|
-
raise "Main module not available" unless defined?(UniversalDocumentProcessor)
|
60
|
-
end
|
61
|
-
issue_count += 1 if issue_found
|
62
|
-
|
63
|
-
# Test 2: AI agent without API key (should not crash)
|
64
|
-
test_count += 1
|
65
|
-
issue_found = test_issue("AI agent creation without API key") do
|
66
|
-
agent = UniversalDocumentProcessor.create_ai_agent
|
67
|
-
raise "Agent not created" unless agent.is_a?(UniversalDocumentProcessor::AIAgent)
|
68
|
-
raise "AI should not be available" if agent.ai_available?
|
69
|
-
end
|
70
|
-
issue_count += 1 if issue_found
|
71
|
-
|
72
|
-
# Test 3: Text file processing
|
73
|
-
test_count += 1
|
74
|
-
issue_found = test_issue("Basic text file processing") do
|
75
|
-
txt_file = Tempfile.new(['test', '.txt'])
|
76
|
-
txt_file.write("Sample text content")
|
77
|
-
txt_file.close
|
78
|
-
|
79
|
-
result = UniversalDocumentProcessor.process(txt_file.path)
|
80
|
-
raise "No text_content key" unless result.has_key?(:text_content)
|
81
|
-
raise "No metadata key" unless result.has_key?(:metadata)
|
82
|
-
|
83
|
-
txt_file.unlink
|
84
|
-
end
|
85
|
-
issue_count += 1 if issue_found
|
86
|
-
|
87
|
-
# Test 4: CSV processing
|
88
|
-
test_count += 1
|
89
|
-
issue_found = test_issue("CSV file processing") do
|
90
|
-
csv_file = Tempfile.new(['test', '.csv'])
|
91
|
-
csv_file.write("Name,Age\nJohn,25\nJane,30")
|
92
|
-
csv_file.close
|
93
|
-
|
94
|
-
result = UniversalDocumentProcessor.process(csv_file.path)
|
95
|
-
raise "Wrong format detected" unless result[:metadata][:format] == "csv"
|
96
|
-
raise "No tables extracted" unless result[:tables].length > 0
|
97
|
-
|
98
|
-
csv_file.unlink
|
99
|
-
end
|
100
|
-
issue_count += 1 if issue_found
|
101
|
-
|
102
|
-
# Test 5: TSV processing (our new feature)
|
103
|
-
test_count += 1
|
104
|
-
issue_found = test_issue("TSV file processing") do
|
105
|
-
tsv_file = Tempfile.new(['test', '.tsv'])
|
106
|
-
tsv_file.write("Name\tAge\nJohn\t25\nJane\t30")
|
107
|
-
tsv_file.close
|
108
|
-
|
109
|
-
result = UniversalDocumentProcessor.process(tsv_file.path)
|
110
|
-
raise "Wrong format detected" unless result[:metadata][:format] == "tsv"
|
111
|
-
raise "Wrong delimiter" unless result[:metadata][:delimiter] == "tab"
|
112
|
-
raise "No tables extracted" unless result[:tables].length > 0
|
113
|
-
|
114
|
-
tsv_file.unlink
|
115
|
-
end
|
116
|
-
issue_count += 1 if issue_found
|
117
|
-
|
118
|
-
puts "\n🔒 Testing Dependency Issues"
|
119
|
-
puts "-" * 40
|
120
|
-
|
121
|
-
# Test 6: Optional dependency checking
|
122
|
-
test_count += 1
|
123
|
-
issue_found = test_issue("Optional dependency information") do
|
124
|
-
deps = UniversalDocumentProcessor.optional_dependencies
|
125
|
-
raise "No optional deps info" if deps.empty?
|
126
|
-
|
127
|
-
missing = UniversalDocumentProcessor.missing_dependencies
|
128
|
-
raise "Missing deps not array" unless missing.is_a?(Array)
|
129
|
-
|
130
|
-
instructions = UniversalDocumentProcessor.installation_instructions
|
131
|
-
raise "No installation instructions" if instructions.empty?
|
132
|
-
end
|
133
|
-
issue_count += 1 if issue_found
|
134
|
-
|
135
|
-
# Test 7: PDF processing without pdf-reader gem
|
136
|
-
test_count += 1
|
137
|
-
issue_found = test_issue("PDF processing dependency handling") do
|
138
|
-
# Create a fake PDF file (just for testing error handling)
|
139
|
-
pdf_file = Tempfile.new(['test', '.pdf'])
|
140
|
-
pdf_file.write("%PDF-1.4\nFake PDF content")
|
141
|
-
pdf_file.close
|
142
|
-
|
143
|
-
begin
|
144
|
-
result = UniversalDocumentProcessor.process(pdf_file.path)
|
145
|
-
# Should either work (if pdf-reader available) or give graceful error
|
146
|
-
rescue UniversalDocumentProcessor::DependencyMissingError => e
|
147
|
-
# This is expected and good
|
148
|
-
raise "Wrong error message" unless e.message.include?("pdf-reader")
|
149
|
-
end
|
150
|
-
|
151
|
-
pdf_file.unlink
|
152
|
-
end
|
153
|
-
issue_count += 1 if issue_found
|
154
|
-
|
155
|
-
puts "\n⚠️ Testing Edge Cases & Potential Warnings"
|
156
|
-
puts "-" * 40
|
157
|
-
|
158
|
-
# Warning 1: Large file handling
|
159
|
-
warning = check_warning("Large file memory usage") do
|
160
|
-
# Create a moderately large text file
|
161
|
-
large_file = Tempfile.new(['large_test', '.txt'])
|
162
|
-
content = "This is a test line.\n" * 10000 # ~200KB
|
163
|
-
large_file.write(content)
|
164
|
-
large_file.close
|
165
|
-
|
166
|
-
start_time = Time.now
|
167
|
-
result = UniversalDocumentProcessor.process(large_file.path)
|
168
|
-
end_time = Time.now
|
169
|
-
|
170
|
-
large_file.unlink
|
171
|
-
|
172
|
-
processing_time = end_time - start_time
|
173
|
-
if processing_time > 5.0
|
174
|
-
"Large file processing took #{processing_time.round(2)} seconds"
|
175
|
-
else
|
176
|
-
false
|
177
|
-
end
|
178
|
-
end
|
179
|
-
warnings << warning if warning
|
180
|
-
|
181
|
-
# Warning 2: Unicode/Japanese filename handling
|
182
|
-
warning = check_warning("Unicode filename handling") do
|
183
|
-
begin
|
184
|
-
japanese_content = "これは日本語のテストです。"
|
185
|
-
unicode_file = Tempfile.new(['テスト', '.txt'])
|
186
|
-
unicode_file.write(japanese_content)
|
187
|
-
unicode_file.close
|
188
|
-
|
189
|
-
result = UniversalDocumentProcessor.process(unicode_file.path)
|
190
|
-
unicode_file.unlink
|
191
|
-
false
|
192
|
-
rescue => e
|
193
|
-
"Unicode filename issue: #{e.message}"
|
194
|
-
end
|
195
|
-
end
|
196
|
-
warnings << warning if warning
|
197
|
-
|
198
|
-
# Warning 3: Empty file handling
|
199
|
-
warning = check_warning("Empty file handling") do
|
200
|
-
empty_file = Tempfile.new(['empty', '.txt'])
|
201
|
-
empty_file.close
|
202
|
-
|
203
|
-
begin
|
204
|
-
result = UniversalDocumentProcessor.process(empty_file.path)
|
205
|
-
empty_file.unlink
|
206
|
-
|
207
|
-
if result[:text_content].nil? || result[:text_content].empty?
|
208
|
-
false # This is expected
|
209
|
-
else
|
210
|
-
false # Also fine
|
211
|
-
end
|
212
|
-
rescue => e
|
213
|
-
empty_file.unlink
|
214
|
-
"Empty file processing issue: #{e.message}"
|
215
|
-
end
|
216
|
-
end
|
217
|
-
warnings << warning if warning
|
218
|
-
|
219
|
-
# Warning 4: Invalid file extension handling
|
220
|
-
warning = check_warning("Invalid file extension handling") do
|
221
|
-
invalid_file = Tempfile.new(['test', '.xyz'])
|
222
|
-
invalid_file.write("Test content")
|
223
|
-
invalid_file.close
|
224
|
-
|
225
|
-
begin
|
226
|
-
result = UniversalDocumentProcessor.process(invalid_file.path)
|
227
|
-
invalid_file.unlink
|
228
|
-
false # Processed successfully
|
229
|
-
rescue UniversalDocumentProcessor::UnsupportedFormatError
|
230
|
-
invalid_file.unlink
|
231
|
-
false # Expected error, good
|
232
|
-
rescue => e
|
233
|
-
invalid_file.unlink
|
234
|
-
"Unexpected error for unsupported format: #{e.message}"
|
235
|
-
end
|
236
|
-
end
|
237
|
-
warnings << warning if warning
|
238
|
-
|
239
|
-
# Warning 5: Memory usage with multiple files
|
240
|
-
warning = check_warning("Memory usage with batch processing") do
|
241
|
-
files = []
|
242
|
-
5.times do |i|
|
243
|
-
file = Tempfile.new(["batch_#{i}", '.txt'])
|
244
|
-
file.write("Batch test content #{i}\n" * 1000)
|
245
|
-
file.close
|
246
|
-
files << file.path
|
247
|
-
end
|
248
|
-
|
249
|
-
begin
|
250
|
-
start_memory = `tasklist /FI "PID eq #{Process.pid}" /FO CSV`.split("\n")[1].split(",")[4].gsub('"', '').gsub(',', '').to_i rescue 0
|
251
|
-
|
252
|
-
results = UniversalDocumentProcessor.batch_process(files)
|
253
|
-
|
254
|
-
end_memory = `tasklist /FI "PID eq #{Process.pid}" /FO CSV`.split("\n")[1].split(",")[4].gsub('"', '').gsub(',', '').to_i rescue 0
|
255
|
-
|
256
|
-
files.each { |f| File.delete(f) if File.exist?(f) }
|
257
|
-
|
258
|
-
memory_increase = end_memory - start_memory
|
259
|
-
if memory_increase > 50000 # 50MB increase
|
260
|
-
"High memory usage: #{memory_increase}KB increase"
|
261
|
-
else
|
262
|
-
false
|
263
|
-
end
|
264
|
-
rescue => e
|
265
|
-
files.each { |f| File.delete(f) if File.exist?(f) }
|
266
|
-
"Batch processing memory test failed: #{e.message}"
|
267
|
-
end
|
268
|
-
end
|
269
|
-
warnings << warning if warning
|
270
|
-
|
271
|
-
puts "\n🔍 Testing AI Features (Without API Key)"
|
272
|
-
puts "-" * 40
|
273
|
-
|
274
|
-
# Test 8: AI methods should fail gracefully
|
275
|
-
test_count += 1
|
276
|
-
issue_found = test_issue("AI methods fail gracefully without API key") do
|
277
|
-
txt_file = Tempfile.new(['ai_test', '.txt'])
|
278
|
-
txt_file.write("Test content for AI")
|
279
|
-
txt_file.close
|
280
|
-
|
281
|
-
begin
|
282
|
-
UniversalDocumentProcessor.ai_analyze(txt_file.path)
|
283
|
-
raise "Should have raised DependencyMissingError"
|
284
|
-
rescue UniversalDocumentProcessor::DependencyMissingError => e
|
285
|
-
# Expected - this is good
|
286
|
-
raise "Wrong error message" unless e.message.include?("OpenAI API key")
|
287
|
-
end
|
288
|
-
|
289
|
-
txt_file.unlink
|
290
|
-
end
|
291
|
-
issue_count += 1 if issue_found
|
292
|
-
|
293
|
-
puts "\n📊 Testing Available Features"
|
294
|
-
puts "-" * 40
|
295
|
-
|
296
|
-
# Test 9: Feature detection
|
297
|
-
test_count += 1
|
298
|
-
issue_found = test_issue("Feature detection works correctly") do
|
299
|
-
features = UniversalDocumentProcessor.available_features
|
300
|
-
raise "No features detected" if features.empty?
|
301
|
-
raise "Missing basic features" unless features.include?(:text_processing)
|
302
|
-
raise "Missing TSV support" unless features.include?(:tsv_processing)
|
303
|
-
|
304
|
-
# AI should not be available without API key
|
305
|
-
raise "AI should not be available" if features.include?(:ai_processing)
|
306
|
-
end
|
307
|
-
issue_count += 1 if issue_found
|
308
|
-
|
309
|
-
puts "\n" + "=" * 70
|
310
|
-
puts "🎯 ISSUE ANALYSIS COMPLETE"
|
311
|
-
puts "=" * 70
|
312
|
-
|
313
|
-
puts "\n📈 SUMMARY:"
|
314
|
-
puts " Total tests run: #{test_count}"
|
315
|
-
puts " Issues found: #{issue_count}"
|
316
|
-
puts " Warnings: #{warnings.compact.length}"
|
317
|
-
|
318
|
-
if issue_count == 0
|
319
|
-
puts "\n✅ NO CRITICAL ISSUES FOUND!"
|
320
|
-
puts "The gem appears to be working correctly for basic usage."
|
321
|
-
else
|
322
|
-
puts "\n❌ CRITICAL ISSUES DETECTED!"
|
323
|
-
puts "The gem has #{issue_count} critical issues that need attention."
|
324
|
-
end
|
325
|
-
|
326
|
-
if warnings.compact.length > 0
|
327
|
-
puts "\n⚠️ WARNINGS TO CONSIDER:"
|
328
|
-
warnings.compact.each_with_index do |warning, i|
|
329
|
-
puts " #{i + 1}. #{warning}"
|
330
|
-
end
|
331
|
-
else
|
332
|
-
puts "\n✅ No significant warnings detected."
|
333
|
-
end
|
334
|
-
|
335
|
-
puts "\n🔮 POTENTIAL USER ISSUES TO WATCH FOR:"
|
336
|
-
puts "1. Users trying to use AI features without setting OPENAI_API_KEY"
|
337
|
-
puts "2. Users expecting PDF/Word processing without installing optional gems"
|
338
|
-
puts "3. Large file processing performance"
|
339
|
-
puts "4. Unicode filename handling on different systems"
|
340
|
-
puts "5. Memory usage with batch processing of many files"
|
341
|
-
|
342
|
-
puts "\n💡 RECOMMENDATIONS:"
|
343
|
-
puts "1. ✅ AI dependency handling is working correctly"
|
344
|
-
puts "2. ✅ TSV processing is functional"
|
345
|
-
puts "3. ✅ Error messages are helpful"
|
346
|
-
puts "4. 📚 Consider adding performance guidelines to documentation"
|
347
|
-
puts "5. 📚 Consider adding memory usage notes for large files"
|
348
|
-
|
349
|
-
exit issue_count
|