universal_document_processor 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ISSUES_ANALYSIS.md +295 -0
- data/PERFORMANCE.md +492 -0
- data/USER_GUIDE.md +597 -0
- data/debug_test.rb +35 -0
- data/lib/universal_document_processor/document.rb +5 -1
- data/lib/universal_document_processor/processors/base_processor.rb +5 -1
- data/lib/universal_document_processor/processors/pdf_processor.rb +17 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/test_ai_dependency.rb +80 -0
- data/test_core_functionality.rb +280 -0
- data/test_performance_memory.rb +271 -0
- data/test_published_gem.rb +349 -0
- metadata +20 -6
data/PERFORMANCE.md
ADDED
@@ -0,0 +1,492 @@
# Performance Guide - Universal Document Processor

This guide provides detailed performance information, benchmarks, and optimization strategies for the Universal Document Processor gem.

## 📊 Performance Benchmarks

### Test Environment
- **OS**: Windows 10
- **Ruby**: 3.x
- **Hardware**: Standard development machine
- **Files**: Various synthetic test files

### Processing Time by File Size

| File Size | Text Files | CSV Files | JSON Files | TSV Files |
|-----------|------------|-----------|------------|-----------|
| 1 KB      | ~30 ms     | ~35 ms    | ~32 ms     | ~36 ms    |
| 100 KB    | ~50 ms     | ~80 ms    | ~60 ms     | ~85 ms    |
| 1 MB      | ~270 ms    | ~400 ms   | ~350 ms    | ~420 ms   |
| 5 MB      | ~1.25 s    | ~2.1 s    | ~1.8 s     | ~2.2 s    |

### Memory Usage Patterns

| File Size | Peak Memory Usage | Steady State |
|-----------|-------------------|--------------|
| 1 KB      | +36 KB            | Baseline     |
| 100 KB    | +200 KB           | Baseline     |
| 1 MB      | +2.5 MB           | Baseline     |
| 5 MB      | +12 MB            | Baseline     |
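These numbers come from synthetic tests and will vary by machine. A minimal harness for reproducing them locally - a sketch assuming only Ruby's standard `benchmark` library and the gem's `UniversalDocumentProcessor.process` entry point; the sample file names are placeholders for your own fixtures:

```ruby
require 'benchmark'
require 'universal_document_processor'

# Hypothetical sample files - substitute your own test fixtures
SAMPLES = ['sample_1kb.txt', 'sample_100kb.csv', 'sample_1mb.json']

SAMPLES.each do |path|
  next unless File.exist?(path)

  # Average over several runs to smooth out GC and disk-cache noise
  runs = 5
  total = Benchmark.realtime do
    runs.times { UniversalDocumentProcessor.process(path) }
  end

  puts format('%-20s %8d bytes  ~%.0f ms/run',
              File.basename(path), File.size(path), total / runs * 1000)
end
```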
## ⚡ Performance Characteristics

### Linear Scaling
- **Text files**: Near-linear scaling with file size
- **CSV/TSV files**: Linear with slight parsing overhead
- **JSON files**: Depends on structure complexity
- **XML files**: Varies significantly with nesting depth
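The scaling claims above can be checked empirically by timing generated files of increasing size. A sketch for plain text (the generated contents and sizes are arbitrary test data, not part of the gem):

```ruby
require 'benchmark'
require 'tempfile'
require 'universal_document_processor'

# Generate text files at several sizes and watch how processing time grows;
# near-linear scaling means roughly constant MB/s across sizes.
[10_000, 100_000, 1_000_000].each do |size|
  Tempfile.create(['scaling_test', '.txt']) do |f|
    f.write('a' * size)
    f.flush

    time = Benchmark.realtime { UniversalDocumentProcessor.process(f.path) }
    puts format('%9d bytes: %7.1f ms (%.2f MB/s)',
                size, time * 1000, size / time / 1_000_000)
  end
end
```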
### Format-Specific Performance

#### Text Files (.txt, .md, .log)
```ruby
# Fastest processing - simple file reading
# Performance: O(n) where n = file size
# Memory: ~1.5x file size during processing
```

#### CSV Files (.csv)
```ruby
# Good performance with parsing overhead
# Performance: O(n*m) where n = rows, m = columns
# Memory: ~2-3x file size (stores parsed structure)
```

#### TSV Files (.tsv, .tab)
```ruby
# Similar to CSV with tab delimiter
# Performance: O(n*m) where n = rows, m = columns
# Memory: ~2-3x file size
```

#### JSON Files (.json)
```ruby
# Performance depends on structure
# Simple JSON: ~1.5x slower than text
# Complex nested JSON: ~3x slower than text
# Memory: ~2-4x file size depending on structure
```

#### XML Files (.xml)
```ruby
# Performance varies with complexity
# Simple XML: ~2x slower than text
# Complex nested XML: ~5x slower than text
# Memory: ~3-5x file size
```
## 🚀 Optimization Strategies

### 1. File Size Management

```ruby
# Check file size before processing
def process_with_size_check(file_path)
  file_size = File.size(file_path)

  case file_size
  when 0..100_000 # < 100 KB
    # Process immediately - excellent performance
    UniversalDocumentProcessor.process(file_path)

  when 100_001..1_000_000 # 100 KB - 1 MB
    # Good performance - consider async for UI
    puts "Processing medium file (#{file_size / 1000} KB)..."
    UniversalDocumentProcessor.process(file_path)

  when 1_000_001..10_000_000 # 1 MB - 10 MB
    # Consider background processing
    puts "Processing large file (#{file_size / 1_000_000} MB)..."
    puts "This may take #{estimate_processing_time(file_size)} seconds"
    UniversalDocumentProcessor.process(file_path)

  else # > 10 MB
    # Recommend chunking or streaming
    puts "Very large file detected (#{file_size / 1_000_000} MB)"
    puts "Consider processing in chunks"
    # process_large_file_in_chunks is an application-specific helper,
    # not provided by the gem - implement chunking to suit your data
    process_large_file_in_chunks(file_path)
  end
end

def estimate_processing_time(file_size_bytes)
  # Rough estimate based on benchmarks
  (file_size_bytes / 4_000_000.0).round(1) # ~4 MB per second
end
```
### 2. Batch Processing Optimization

```ruby
# Smart batch sizing based on file sizes
def optimize_batch_processing(files)
  # Group files by size for optimal batching
  small_files  = files.select { |f| File.size(f) < 100_000 }
  medium_files = files.select { |f| File.size(f).between?(100_000, 1_000_000) }
  large_files  = files.select { |f| File.size(f) > 1_000_000 }

  results = []

  # Process small files in large batches
  small_files.each_slice(20) do |batch|
    results.concat(UniversalDocumentProcessor.batch_process(batch))
  end

  # Process medium files in smaller batches
  medium_files.each_slice(5) do |batch|
    results.concat(UniversalDocumentProcessor.batch_process(batch))
  end

  # Process large files individually
  large_files.each do |file|
    results << UniversalDocumentProcessor.process(file)
  end

  results
end
```
### 3. Memory-Efficient Processing

```ruby
# Process large datasets without memory buildup
def memory_efficient_processing(file_paths)
  file_paths.each_with_index do |path, index|
    puts "Processing #{index + 1}/#{file_paths.length}: #{File.basename(path)}"

    # Process file
    result = UniversalDocumentProcessor.process(path)

    # Extract only essential data
    summary = extract_summary(result)

    # Save or process immediately (save_result is an application-specific
    # helper - persist the summary however suits your storage layer)
    save_result(summary, path)

    # Force garbage collection for large files
    if File.size(path) > 5_000_000
      GC.start
    end

    # Optional: Progress callback
    yield(index + 1, file_paths.length, summary) if block_given?
  end
end

def extract_summary(result)
  {
    format: result[:metadata][:format],
    size: result[:metadata][:file_size],
    text_preview: result[:text_content]&.slice(0, 500),
    table_count: result[:tables]&.length || 0,
    has_structured_data: !result[:structured_data].nil?
  }
end
```
### 4. Asynchronous Processing

```ruby
require 'concurrent' # provided by the concurrent-ruby gem

# Process files asynchronously
def async_process_files(file_paths, max_threads: 4)
  # Create thread pool
  pool = Concurrent::FixedThreadPool.new(max_threads)

  # Submit processing tasks
  futures = file_paths.map do |path|
    Concurrent::Future.execute(executor: pool) do
      {
        file: path,
        result: UniversalDocumentProcessor.process(path),
        processed_at: Time.now
      }
    end
  end

  # Wait for completion and collect results
  results = futures.map(&:value)

  # Shutdown thread pool
  pool.shutdown
  pool.wait_for_termination

  results
end
```
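A design note on pool sizing: on CRuby the global VM lock lets threads overlap mainly while a worker is blocked on I/O, so CPU-bound parsing gains less from extra threads than I/O-heavy workloads. Treat `max_threads: 4` as a starting point to benchmark against your own files rather than a universal setting.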
## 📈 Performance Monitoring

### Built-in Performance Tracking

```ruby
require 'benchmark'

def process_with_detailed_metrics(file_path)
  file_size = File.size(file_path)

  # Memory before processing
  memory_before = get_memory_usage

  # Time the processing
  result = nil
  time_taken = Benchmark.realtime do
    result = UniversalDocumentProcessor.process(file_path)
  end

  # Memory after processing
  memory_after = get_memory_usage
  memory_used = memory_after - memory_before

  # Calculate metrics (memory figures are in KB, file size in bytes)
  throughput = file_size / time_taken / 1024 / 1024 # MB/s
  memory_efficiency = (memory_used * 1024.0) / file_size # memory used per file byte

  {
    result: result,
    metrics: {
      file_size: file_size,
      processing_time: time_taken,
      throughput_mbps: throughput.round(2),
      memory_used: memory_used,
      memory_efficiency: memory_efficiency.round(2)
    }
  }
end

def get_memory_usage
  # Platform-specific detection; returns resident set size in KB
  if RUBY_PLATFORM =~ /mswin|mingw/
    # The "Mem Usage" column is quoted and contains commas (e.g. "12,345 K"),
    # so split on the quoted-field delimiter rather than a bare comma
    line = `tasklist /FI "PID eq #{Process.pid}" /FO CSV`.split("\n")[1]
    line ? line.split('","')[4].to_s.gsub(/\D/, '').to_i : 0
  else
    `ps -o rss= -p #{Process.pid}`.strip.to_i
  end
end
```
### Performance Alerts

```ruby
class PerformanceMonitor
  THRESHOLDS = {
    processing_time: 5.0,  # seconds
    memory_usage: 100_000, # KB
    throughput: 1.0        # MB/s minimum
  }

  def self.monitor_processing(file_path)
    metrics = process_with_detailed_metrics(file_path)

    alerts = []

    if metrics[:metrics][:processing_time] > THRESHOLDS[:processing_time]
      alerts << "Slow processing: #{metrics[:metrics][:processing_time].round(2)}s"
    end

    if metrics[:metrics][:memory_used] > THRESHOLDS[:memory_usage]
      alerts << "High memory usage: #{metrics[:metrics][:memory_used] / 1024}MB"
    end

    if metrics[:metrics][:throughput_mbps] < THRESHOLDS[:throughput]
      alerts << "Low throughput: #{metrics[:metrics][:throughput_mbps]}MB/s"
    end

    unless alerts.empty?
      puts "⚠️ Performance Alerts for #{File.basename(file_path)}:"
      alerts.each { |alert| puts "  - #{alert}" }
    end

    metrics
  end
end
```
## 🎯 Production Optimization

### Configuration for Production

```ruby
class ProductionProcessor
  def initialize
    @config = {
      max_file_size: 50_000_000,   # 50 MB limit
      batch_size_small: 50,        # Files < 100 KB
      batch_size_medium: 10,       # Files 100 KB - 1 MB
      batch_size_large: 1,         # Files > 1 MB
      enable_gc_after_large: true, # GC after large files
      performance_monitoring: true,
      async_processing: true,
      max_concurrent_threads: 4
    }
  end

  def process_files(file_paths)
    validate_files(file_paths)

    # async_process_files and sequential_process_files follow the patterns
    # shown in the Optimization Strategies section above
    if @config[:async_processing] && file_paths.length > 1
      async_process_files(file_paths)
    else
      sequential_process_files(file_paths)
    end
  end

  private

  def validate_files(file_paths)
    file_paths.each do |path|
      raise "File not found: #{path}" unless File.exist?(path)

      size = File.size(path)
      if size > @config[:max_file_size]
        raise "File too large: #{path} (#{size / 1_000_000}MB > #{@config[:max_file_size] / 1_000_000}MB)"
      end
    end
  end
end
```
### Caching Strategy

```ruby
require 'digest'
require 'json'

class CachedProcessor
  def initialize(cache_dir: './cache')
    @cache_dir = cache_dir
    Dir.mkdir(@cache_dir) unless Dir.exist?(@cache_dir)
  end

  def process_with_cache(file_path)
    # Generate cache key based on file path, modification time and size
    file_stat = File.stat(file_path)
    cache_key = Digest::SHA256.hexdigest("#{file_path}:#{file_stat.mtime}:#{file_stat.size}")
    cache_file = File.join(@cache_dir, "#{cache_key}.json")

    # Return cached result if available
    if File.exist?(cache_file)
      puts "Using cached result for #{File.basename(file_path)}"
      return JSON.parse(File.read(cache_file), symbolize_names: true)
    end

    # Process and cache result
    result = UniversalDocumentProcessor.process(file_path)
    File.write(cache_file, JSON.pretty_generate(result))

    result
  end

  def clear_cache
    Dir.glob(File.join(@cache_dir, "*.json")).each { |f| File.delete(f) }
  end
end
```
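Typical usage of the cache wrapper above (the file name is illustrative); repeated calls for an unchanged file hit the JSON cache instead of reprocessing:

```ruby
processor = CachedProcessor.new(cache_dir: './doc_cache')

first  = processor.process_with_cache('reports/q1.csv') # processes and caches
second = processor.process_with_cache('reports/q1.csv') # served from cache

processor.clear_cache # drop all cached results
```

Note that a cache hit returns a JSON round-trip of the original result, so values that don't serialize cleanly to JSON come back as plain hashes, arrays, and strings.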
## 📋 Performance Checklist

### Before Processing Large Batches

- [ ] Check available system memory
- [ ] Estimate total processing time
- [ ] Plan for progress reporting
- [ ] Consider async processing for > 10 files
- [ ] Set up error handling for individual files
- [ ] Plan for result storage/processing
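A minimal sketch covering several of these items at once - time estimation, the async cutoff, progress reporting, and per-file error handling - reusing the `estimate_processing_time` and `async_process_files` helpers defined earlier in this guide:

```ruby
# Pre-flight checks and a wrapped batch run - a sketch, not gem API
def checked_batch_run(file_paths)
  total_bytes = file_paths.sum { |p| File.size(p) }
  puts "Estimated total time: ~#{estimate_processing_time(total_bytes)}s"

  # Hand large batches to the async path shown earlier
  return async_process_files(file_paths) if file_paths.length > 10

  file_paths.each_with_index.map do |path, i|
    puts "[#{i + 1}/#{file_paths.length}] #{File.basename(path)}"
    begin
      { file: path, result: UniversalDocumentProcessor.process(path) }
    rescue => e
      # One bad file should not abort the whole batch
      { file: path, error: e.message }
    end
  end
end
```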
### During Processing

- [ ] Monitor memory usage
- [ ] Track processing progress
- [ ] Handle errors gracefully
- [ ] Log performance metrics
- [ ] Provide user feedback

### After Processing

- [ ] Clean up temporary files
- [ ] Force garbage collection if needed
- [ ] Log final performance summary
- [ ] Archive or process results
- [ ] Update performance baselines
## 🔧 Troubleshooting Performance Issues

### Slow Processing

```ruby
# Diagnose slow processing
def diagnose_slow_processing(file_path)
  puts "Diagnosing: #{file_path}"

  file_size = File.size(file_path)
  puts "File size: #{file_size / 1024}KB"

  # Check file format
  format = File.extname(file_path).downcase
  puts "Format: #{format}"

  # Expected processing time
  expected_time = estimate_processing_time(file_size)
  puts "Expected time: ~#{expected_time}s"

  # Actual processing with timing
  actual_time = Benchmark.realtime do
    UniversalDocumentProcessor.process(file_path)
  end

  puts "Actual time: #{actual_time.round(2)}s"

  if actual_time > expected_time * 2
    puts "⚠️ Processing significantly slower than expected"
    puts "Consider:"
    puts "- File complexity (nested structures, encoding issues)"
    puts "- System resources (memory, CPU)"
    puts "- Concurrent processing load"
  end
end
```
### Memory Issues

```ruby
# Monitor memory during processing
def process_with_memory_monitoring(file_path)
  initial_memory = get_memory_usage
  peak_memory = initial_memory

  # Sample memory every 100 ms on a background thread
  thread = Thread.new do
    loop do
      current_memory = get_memory_usage
      peak_memory = [peak_memory, current_memory].max
      sleep 0.1
    end
  end

  begin
    result = UniversalDocumentProcessor.process(file_path)
  ensure
    # Stop the sampler even if processing raises
    thread.kill
  end

  final_memory = get_memory_usage

  puts "Memory usage:"
  puts "  Initial: #{initial_memory / 1024}MB"
  puts "  Peak: #{peak_memory / 1024}MB"
  puts "  Final: #{final_memory / 1024}MB"
  puts "  Increase: #{(peak_memory - initial_memory) / 1024}MB"

  result
end
```
---

## 📞 Performance Support

For performance-related issues:
- Check system resources (RAM, CPU)
- Review file characteristics (size, format, complexity)
- Consider batch processing strategies
- Monitor memory usage patterns
- Use async processing for large datasets

Performance optimization is an ongoing process. Monitor your specific use cases and adjust strategies accordingly.