universal_document_processor 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/USER_GUIDE.md ADDED
@@ -0,0 +1,597 @@
1
+ # Universal Document Processor - User Guide
2
+
3
+ Welcome to the Universal Document Processor! This guide will help you get started and make the most of this powerful document processing gem.
4
+
5
+ ## 📋 Table of Contents
6
+
7
+ 1. [Quick Start](#quick-start)
8
+ 2. [Installation](#installation)
9
+ 3. [Basic Usage](#basic-usage)
10
+ 4. [Advanced Features](#advanced-features)
11
+ 5. [Performance Guidelines](#performance-guidelines)
12
+ 6. [Memory Usage](#memory-usage)
13
+ 7. [AI Features](#ai-features)
14
+ 8. [Troubleshooting](#troubleshooting)
15
+ 9. [Best Practices](#best-practices)
16
+ 10. [FAQ](#faq)
17
+
18
+ ## 🚀 Quick Start
19
+
20
+ ```ruby
21
+ # Install the gem (run in your shell, not in Ruby):
22
+ #   gem install universal_document_processor
23
+
24
+ # Process a document
25
+ require 'universal_document_processor'
26
+
27
+ result = UniversalDocumentProcessor.process('path/to/your/document.txt')
28
+ puts result[:text_content]
29
+ ```
30
+
31
+ ## 📦 Installation
32
+
33
+ ### Basic Installation
34
+
35
+ ```bash
36
+ gem install universal_document_processor
37
+ ```
38
+
39
+ ### Optional Dependencies
40
+
41
+ The gem works with core features out of the box, but you can install optional dependencies for enhanced functionality:
42
+
43
+ ```bash
44
+ # For PDF processing
45
+ gem install pdf-reader
46
+
47
+ # For Word document processing
48
+ gem install docx
49
+
50
+ # For AI features
51
+ gem install ruby-openai
52
+ ```
53
+
54
+ ### Check Available Features
55
+
56
+ ```ruby
57
+ require 'universal_document_processor'
58
+
59
+ # See what features are available
60
+ puts UniversalDocumentProcessor.available_features
61
+ # => [:text_processing, :csv_processing, :tsv_processing, :json_processing, :xml_processing]
62
+
63
+ # Check for missing optional dependencies
64
+ puts UniversalDocumentProcessor.missing_dependencies
65
+ # => ["pdf-reader", "docx", "ruby-openai"]
66
+
67
+ # Get installation instructions
68
+ puts UniversalDocumentProcessor.installation_instructions
69
+ ```
70
+
71
+ ## 🔧 Basic Usage
72
+
73
+ ### Processing Single Files
74
+
75
+ ```ruby
76
+ require 'universal_document_processor'
77
+
78
+ # Text files
79
+ result = UniversalDocumentProcessor.process('document.txt')
80
+ puts result[:text_content]
81
+ puts result[:metadata][:format] # => "txt"
82
+
83
+ # CSV files
84
+ result = UniversalDocumentProcessor.process('data.csv')
85
+ puts result[:tables].first[:headers]
86
+ puts result[:tables].first[:rows]
87
+
88
+ # TSV files (Tab-separated values)
89
+ result = UniversalDocumentProcessor.process('data.tsv')
90
+ puts result[:metadata][:delimiter] # => "tab"
91
+
92
+ # JSON files
93
+ result = UniversalDocumentProcessor.process('config.json')
94
+ puts result[:structured_data]
95
+
96
+ # XML files
97
+ result = UniversalDocumentProcessor.process('data.xml')
98
+ puts result[:structured_data]
99
+ ```
100
+
101
+ ### Batch Processing
102
+
103
+ ```ruby
104
+ files = ['doc1.txt', 'doc2.csv', 'doc3.json']
105
+ results = UniversalDocumentProcessor.batch_process(files)
106
+
107
+ results.each_with_index do |result, i|
108
+ puts "File #{i + 1}: #{result[:metadata][:format]}"
109
+ puts "Content preview: #{result[:text_content]&.slice(0, 100)}..."
110
+ end
111
+ ```
112
+
113
+ ### Understanding Results
114
+
115
+ All processing methods return a hash with consistent structure:
116
+
117
+ ```ruby
118
+ {
119
+ text_content: "Extracted text content",
120
+ metadata: {
121
+ format: "txt", # File format detected
122
+ file_size: 1024, # File size in bytes
123
+ encoding: "UTF-8", # Text encoding
124
+ delimiter: "comma" # For CSV/TSV files
125
+ },
126
+ tables: [ # For structured data (CSV, TSV)
127
+ {
128
+ headers: ["Name", "Age"],
129
+ rows: [["John", "25"], ["Jane", "30"]]
130
+ }
131
+ ],
132
+ structured_data: {...} # For JSON/XML files
133
+ }
134
+ ```
135
+
136
+ ## 🎯 Advanced Features
137
+
138
+ ### File Format Detection
139
+
140
+ The gem automatically detects file formats based on extension and content:
141
+
142
+ ```ruby
143
+ # Supported formats
144
+ formats = {
145
+ text: ['.txt', '.md', '.log'],
146
+ csv: ['.csv'],
147
+ tsv: ['.tsv', '.tab'],
148
+ json: ['.json'],
149
+ xml: ['.xml'],
150
+ pdf: ['.pdf'], # Requires pdf-reader gem
151
+ word: ['.docx'] # Requires docx gem
152
+ }
153
+ ```
154
+
155
+ ### Custom Processing Options
156
+
157
+ ```ruby
158
+ # Process with specific options (if needed in future versions)
159
+ result = UniversalDocumentProcessor.process(
160
+ 'document.csv',
161
+ options: {
162
+ encoding: 'UTF-8',
163
+ delimiter: ','
164
+ }
165
+ )
166
+ ```
167
+
168
+ ### Error Handling
169
+
170
+ ```ruby
171
+ begin
172
+ result = UniversalDocumentProcessor.process('document.pdf')
173
+ rescue UniversalDocumentProcessor::DependencyMissingError => e
174
+ puts "Missing dependency: #{e.message}"
175
+ puts "Install with: gem install pdf-reader"
176
+ rescue UniversalDocumentProcessor::UnsupportedFormatError => e
177
+ puts "Unsupported file format: #{e.message}"
178
+ rescue => e
179
+ puts "Processing error: #{e.message}"
180
+ end
181
+ ```
182
+
183
+ ## ⚡ Performance Guidelines
184
+
185
+ ### File Size Recommendations
186
+
187
+ Based on performance testing:
188
+
189
+ | File Size | Processing Time | Recommendation |
190
+ |-----------|----------------|----------------|
191
+ | < 100 KB | < 50 ms | ✅ Excellent for real-time |
192
+ | 100 KB - 1 MB | 50-300 ms | ✅ Good for interactive use |
193
+ | 1 MB - 5 MB | 300ms - 1.5s | ⚠️ Consider async processing |
194
+ | > 5 MB | > 1.5s | 🔄 Process individually; consider chunking or async |
195
+
196
+ ### Performance by Format
197
+
198
+ - **Text files**: Fastest processing, linear with file size
199
+ - **CSV/TSV**: Good performance, slight overhead for parsing
200
+ - **JSON**: Fast for well-structured data
201
+ - **XML**: Moderate performance, depends on complexity
202
+ - **PDF**: Slower, depends on pdf-reader gem performance
203
+ - **Word**: Moderate, depends on docx gem performance
204
+
205
+ ### Optimization Tips
206
+
207
+ ```ruby
208
+ # For large files, process individually
209
+ large_files.each do |file|
210
+ result = UniversalDocumentProcessor.process(file)
211
+ # Process result immediately
212
+ handle_result(result)
213
+ end
214
+
215
+ # For many small files, use batch processing
216
+ small_files_batch = small_files.each_slice(10).to_a
217
+ small_files_batch.each do |batch|
218
+ results = UniversalDocumentProcessor.batch_process(batch)
219
+ # Process batch results
220
+ end
221
+ ```
222
+
223
+ ## 💾 Memory Usage
224
+
225
+ ### Expected Memory Patterns
226
+
227
+ - **Memory usage**: Typically 2-3x the file size
228
+ - **Peak memory**: During processing, returns to baseline after
229
+ - **Batch processing**: Memory scales with total batch size
230
+
231
+ ### Memory-Efficient Processing
232
+
233
+ ```ruby
234
+ # For large files - process one at a time
235
+ def process_large_files_efficiently(file_paths)
236
+ results = []
237
+
238
+ file_paths.each do |path|
239
+ result = UniversalDocumentProcessor.process(path)
240
+
241
+ # Extract only what you need
242
+ summary = {
243
+ file: path,
244
+ format: result[:metadata][:format],
245
+ size: result[:metadata][:file_size],
246
+ preview: result[:text_content]&.slice(0, 200)
247
+ }
248
+
249
+ results << summary
250
+ # result goes out of scope, allowing garbage collection
251
+ end
252
+
253
+ results
254
+ end
255
+ ```
256
+
257
+ ### Batch Processing Guidelines
258
+
259
+ ```ruby
260
+ # Recommended batch sizes
261
+ batch_sizes = {
262
+ small_files: 20, # < 100 KB each
263
+ medium_files: 10, # 100 KB - 1 MB each
264
+ large_files: 1 # > 1 MB each
265
+ }
266
+
267
+ # Example batch processing
268
+ def smart_batch_process(files)
269
+ files.group_by do |f|
270
+ case File.size(f) # bucket by size so files of similar size share a batch
271
+ when 0..100_000 then 20
272
+ when 100_001..1_000_000 then 10
273
+ else 1
274
+ end
275
+ end.map do |batch_size, file_list|
276
+ file_list.each_slice(batch_size).map do |batch|
277
+ UniversalDocumentProcessor.batch_process(batch)
278
+ end
279
+ end.flatten
280
+ end
281
+ ```
282
+
283
+ ## 🤖 AI Features
284
+
285
+ ### Setup
286
+
287
+ ```ruby
288
+ # Install AI dependency (run in your shell, not in Ruby):
289
+ #   gem install ruby-openai
290
+
291
+ # Set API key
292
+ ENV['OPENAI_API_KEY'] = 'your-api-key-here'
293
+
294
+ # Or pass directly
295
+ agent = UniversalDocumentProcessor.create_ai_agent(api_key: 'your-key')
296
+ ```
297
+
298
+ ### AI Processing
299
+
300
+ ```ruby
301
+ # Check if AI is available
302
+ if UniversalDocumentProcessor.ai_available?
303
+ # AI analysis
304
+ analysis = UniversalDocumentProcessor.ai_analyze('document.txt')
305
+ puts analysis[:summary]
306
+ puts analysis[:key_points]
307
+
308
+ # AI extraction
309
+ extracted = UniversalDocumentProcessor.ai_extract('document.txt', 'email addresses')
310
+ puts extracted[:results]
311
+
312
+ # AI summarization
313
+ summary = UniversalDocumentProcessor.ai_summarize('long_document.txt')
314
+ puts summary[:summary]
315
+ else
316
+ puts "AI features not available. Install ruby-openai and set OPENAI_API_KEY"
317
+ end
318
+ ```
319
+
320
+ ### AI Agent Direct Usage
321
+
322
+ ```ruby
323
+ agent = UniversalDocumentProcessor.create_ai_agent
324
+
325
+ if agent.ai_available?
326
+ # Process and analyze in one step
327
+ result = agent.analyze_document('document.txt')
328
+
329
+ # Custom AI queries
330
+ insights = agent.query_document('document.txt', 'What are the main themes?')
331
+ else
332
+ puts "AI not available: Missing API key"
333
+ end
334
+ ```
335
+
336
+ ## 🔧 Troubleshooting
337
+
338
+ ### Common Issues
339
+
340
+ #### 1. Dependency Missing Errors
341
+
342
+ ```ruby
343
+ # Error: pdf-reader gem not found
344
+ begin
345
+ result = UniversalDocumentProcessor.process('document.pdf')
346
+ rescue UniversalDocumentProcessor::DependencyMissingError => e
347
+ puts e.message
348
+ # Install missing dependency: gem install pdf-reader
349
+ end
350
+ ```
351
+
352
+ #### 2. Unsupported File Format
353
+
354
+ ```ruby
355
+ # Error: Unsupported format
356
+ begin
357
+ result = UniversalDocumentProcessor.process('document.xyz')
358
+ rescue UniversalDocumentProcessor::UnsupportedFormatError => e
359
+ puts "#{e.message}"
360
+ puts "Supported formats: #{UniversalDocumentProcessor.supported_formats}"
361
+ end
362
+ ```
363
+
364
+ #### 3. Large File Processing
365
+
366
+ ```ruby
367
+ # For very large files, consider streaming or chunking
368
+ def process_large_file_safely(file_path)
369
+ file_size = File.size(file_path)
370
+
371
+ if file_size > 10_000_000 # 10 MB
372
+ puts "Warning: Large file detected (#{file_size / 1_000_000} MB)"
373
+ puts "Processing may take time and use significant memory"
374
+ end
375
+
376
+ UniversalDocumentProcessor.process(file_path)
377
+ end
378
+ ```
379
+
380
+ #### 4. Encoding Issues
381
+
382
+ ```ruby
383
+ # Handle encoding problems
384
+ begin
385
+ result = UniversalDocumentProcessor.process('document.txt')
386
+ rescue Encoding::InvalidByteSequenceError => e
387
+ puts "Encoding issue: #{e.message}"
388
+ # Try different encoding or clean the file
389
+ end
390
+ ```
391
+
392
+ #### 5. AI Features Not Working
393
+
394
+ ```ruby
395
+ # Debug AI availability
396
+ puts "AI Available: #{UniversalDocumentProcessor.ai_available?}"
397
+ puts "Missing Dependencies: #{UniversalDocumentProcessor.missing_dependencies}"
398
+
399
+ # Check API key
400
+ if ENV['OPENAI_API_KEY'].nil? || ENV['OPENAI_API_KEY'].empty?
401
+ puts "OPENAI_API_KEY not set"
402
+ else
403
+ puts "API key is set (length: #{ENV['OPENAI_API_KEY'].length})"
404
+ end
405
+ ```
406
+
407
+ ## 🏆 Best Practices
408
+
409
+ ### 1. Error Handling
410
+
411
+ ```ruby
412
+ def robust_document_processing(file_path)
413
+ begin
414
+ # Check if file exists
415
+ unless File.exist?(file_path)
416
+ return { error: "File not found: #{file_path}" }
417
+ end
418
+
419
+ # Check file size
420
+ file_size = File.size(file_path)
421
+ if file_size > 50_000_000 # 50 MB
422
+ return { error: "File too large: #{file_size / 1_000_000} MB" }
423
+ end
424
+
425
+ # Process the file
426
+ result = UniversalDocumentProcessor.process(file_path)
427
+
428
+ # Validate result
429
+ if result[:text_content].nil? || result[:text_content].empty?
430
+ return { warning: "No text content extracted", result: result }
431
+ end
432
+
433
+ { success: true, result: result }
434
+
435
+ rescue UniversalDocumentProcessor::DependencyMissingError => e
436
+ { error: "Missing dependency", details: e.message }
437
+ rescue UniversalDocumentProcessor::UnsupportedFormatError => e
438
+ { error: "Unsupported format", details: e.message }
439
+ rescue => e
440
+ { error: "Processing failed", details: e.message }
441
+ end
442
+ end
443
+ ```
444
+
445
+ ### 2. Performance Monitoring
446
+
447
+ ```ruby
448
+ require 'benchmark'
449
+
450
+ def process_with_monitoring(file_path)
451
+ result = nil
452
+
453
+ timing = Benchmark.measure do
454
+ result = UniversalDocumentProcessor.process(file_path)
455
+ end
456
+
457
+ # Benchmark::Tms reports both wall-clock (real) and CPU (total) time
458
+
459
+ puts "Processing time: #{timing.real.round(3)}s"
460
+ puts "CPU time: #{timing.total.round(3)}s"
461
+
462
+ result
463
+ end
464
+ ```
465
+
466
+ ### 3. Logging
467
+
468
+ ```ruby
469
+ require 'logger'
470
+
471
+ LOGGER = Logger.new(STDOUT) # constant: top-level locals are not visible inside `def`
472
+
473
+ def process_with_logging(file_path)
474
+ LOGGER.info "Starting processing: #{file_path}"
475
+
476
+ begin
477
+ result = UniversalDocumentProcessor.process(file_path)
478
+ LOGGER.info "Successfully processed: #{result[:metadata][:format]} format"
479
+ result
480
+ rescue => e
481
+ LOGGER.error "Failed to process #{file_path}: #{e.message}"
482
+ raise
483
+ end
484
+ end
485
+ ```
486
+
487
+ ### 4. Configuration Management
488
+
489
+ ```ruby
490
+ class DocumentProcessor
491
+ def initialize(config = {})
492
+ @config = {
493
+ max_file_size: 10_000_000, # 10 MB
494
+ batch_size: 10,
495
+ enable_ai: !ENV['OPENAI_API_KEY'].to_s.empty?,
496
+ log_level: :info
497
+ }.merge(config)
498
+ end
499
+
500
+ def process(file_path)
501
+ validate_file(file_path)
502
+
503
+ if @config[:enable_ai] && UniversalDocumentProcessor.ai_available?
504
+ UniversalDocumentProcessor.ai_analyze(file_path)
505
+ else
506
+ UniversalDocumentProcessor.process(file_path)
507
+ end
508
+ end
509
+
510
+ private
511
+
512
+ def validate_file(file_path)
513
+ raise "File not found" unless File.exist?(file_path)
514
+ raise "File too large" if File.size(file_path) > @config[:max_file_size]
515
+ end
516
+ end
517
+ ```
518
+
519
+ ## ❓ FAQ
520
+
521
+ ### Q: What file formats are supported?
522
+
523
+ **A:** Core formats (always available):
524
+ - Text files: `.txt`, `.md`, `.log`
525
+ - CSV files: `.csv`
526
+ - TSV files: `.tsv`, `.tab`
527
+ - JSON files: `.json`
528
+ - XML files: `.xml`
529
+
530
+ Optional formats (require additional gems):
531
+ - PDF files: `.pdf` (requires `pdf-reader`)
532
+ - Word documents: `.docx` (requires `docx`)
533
+
534
+ ### Q: How do I enable AI features?
535
+
536
+ **A:**
537
+ 1. Install the ruby-openai gem: `gem install ruby-openai`
538
+ 2. Set your OpenAI API key: `ENV['OPENAI_API_KEY'] = 'your-key'`
539
+ 3. Check availability: `UniversalDocumentProcessor.ai_available?`
540
+
541
+ ### Q: What's the maximum file size I can process?
542
+
543
+ **A:** There's no hard limit, but consider:
544
+ - Files < 1 MB: Fast processing
545
+ - Files 1-5 MB: Good performance
546
+ - Files > 5 MB: Consider chunking or async processing
547
+ - Files > 50 MB: May cause memory issues
548
+
549
+ ### Q: Can I process files in parallel?
550
+
551
+ **A:** Yes, use Ruby's threading or the batch processing feature:
552
+
553
+ ```ruby
554
+ # Batch processing (recommended)
555
+ results = UniversalDocumentProcessor.batch_process(files)
556
+
557
+ # Manual threading
558
+ threads = files.map do |file|
559
+ Thread.new { UniversalDocumentProcessor.process(file) }
560
+ end
561
+ results = threads.map(&:value)
562
+ ```
563
+
564
+ ### Q: How do I handle Unicode/international characters?
565
+
566
+ **A:** The gem handles Unicode automatically. Files are processed with UTF-8 encoding by default.
567
+
568
+ ### Q: Can I extend the gem with custom processors?
569
+
570
+ **A:** Currently, the gem doesn't support custom processors, but you can:
571
+ 1. Process files with the gem
572
+ 2. Apply custom logic to the results
573
+ 3. Submit feature requests for new formats
574
+
575
+ ### Q: How do I report bugs or request features?
576
+
577
+ **A:** Please visit the project repository and:
578
+ 1. Check existing issues
579
+ 2. Create a new issue with details
580
+ 3. Include sample files (if possible)
581
+ 4. Specify your Ruby version and OS
582
+
583
+ ### Q: Is this gem thread-safe?
584
+
585
+ **A:** Yes, the gem is designed to be thread-safe for concurrent processing of different files.
586
+
587
+ ---
588
+
589
+ ## 📞 Support
590
+
591
+ For additional help:
592
+ - Check the [README](README.md) for quick reference
593
+ - Review the [CHANGELOG](CHANGELOG.md) for recent updates
594
+ - Submit issues on the project repository
595
+ - Check `UniversalDocumentProcessor.installation_instructions` for dependency help
596
+
597
+ Happy document processing! 🚀
data/debug_test.rb ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Add lib directory to load path
4
+ $LOAD_PATH.unshift File.expand_path('lib', __dir__)
5
+
6
+ # Load the gem
7
+ require 'universal_document_processor'
8
+ require 'tempfile'
9
+
10
+ # Create a simple text file
11
+ txt_file = Tempfile.new(['test', '.txt'])
12
+ txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
13
+ txt_file.close
14
+
15
+ puts "Testing text file: #{txt_file.path}"
16
+
17
+ begin
18
+ puts "Processing file..."
19
+ result = UniversalDocumentProcessor.process(txt_file.path)
20
+
21
+ puts "Result keys: #{result.keys}"
22
+ puts "Result type: #{result.class}"
23
+
24
+ if result.is_a?(Hash)
25
+ result.each do |key, value|
26
+ puts "#{key}: #{value.class} - #{value.to_s[0..100]}..."
27
+ end
28
+ end
29
+
30
+ rescue => e
31
+ puts "Error: #{e.class} - #{e.message}"
32
+ puts e.backtrace.first(5)
33
+ end
34
+
35
+ txt_file.unlink
@@ -222,7 +222,11 @@ module UniversalDocumentProcessor
222
222
 
223
223
  def fallback_text_extraction
224
224
  begin
225
- Yomu.new(@file_path).text
225
+ if defined?(Yomu)
226
+ Yomu.new(@file_path).text
227
+ else
228
+ "Unable to extract text: Yomu gem not available. Please install 'yomu' gem for universal text extraction: gem install yomu"
229
+ end
226
230
  rescue => e
227
231
  "Unable to extract text: #{e.message}"
228
232
  end
@@ -10,7 +10,11 @@ module UniversalDocumentProcessor
10
10
 
11
11
  def extract_text
12
12
  # Fallback to universal text extraction
13
- Yomu.new(@file_path).text
13
+ if defined?(Yomu)
14
+ Yomu.new(@file_path).text
15
+ else
16
+ raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"
17
+ end
14
18
  rescue => e
15
19
  raise ProcessingError, "Failed to extract text: #{e.message}"
16
20
  end
@@ -9,6 +9,9 @@ module UniversalDocumentProcessor
9
9
  text = reader.pages.map(&:text).join("\n")
10
10
  text.strip.empty? ? "No text content found in PDF" : text
11
11
  end
12
+ rescue => e
13
+ # Fallback to Yomu if pdf-reader fails
14
+ fallback_text_extraction(e)
12
15
  end
13
16
 
14
17
  def extract_metadata
@@ -114,6 +117,20 @@ module UniversalDocumentProcessor
114
117
  rescue
115
118
  []
116
119
  end
120
+
121
+ def fallback_text_extraction(original_error)
122
+ if defined?(Yomu)
123
+ begin
124
+ text = Yomu.new(@file_path).text
125
+ return text unless text.nil? || text.strip.empty?
126
+ "No text content found in PDF"
127
+ rescue => yomu_error
128
+ raise ProcessingError, "PDF text extraction failed. pdf-reader error: #{original_error.message}. Yomu fallback error: #{yomu_error.message}"
129
+ end
130
+ else
131
+ raise ProcessingError, "PDF text extraction failed: #{original_error.message}. Consider installing 'yomu' gem for fallback extraction: gem install yomu"
132
+ end
133
+ end
117
134
  end
118
135
  end
119
136
  end
@@ -1,3 +1,3 @@
1
1
  module UniversalDocumentProcessor
2
- VERSION = "1.0.3"
2
+ VERSION = "1.0.5"
3
3
  end