universal_document_processor 1.0.5 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,29 +2,62 @@ module UniversalDocumentProcessor
2
2
  class Document
3
3
  attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
4
4
 
5
+ class LargeFileError < StandardError; end
6
+ class FileValidationError < StandardError; end
7
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
8
+
5
9
  def initialize(file_path_or_io, options = {})
6
10
  @file_path = file_path_or_io.is_a?(String) ? normalize_file_path(file_path_or_io) : save_temp_file(file_path_or_io)
7
11
  @options = options
12
+ # 1. Check file existence and readability
13
+ unless File.exist?(@file_path) && File.readable?(@file_path)
14
+ raise FileValidationError, "File is missing or unreadable: #{@file_path}"
15
+ end
8
16
  @content_type = detect_content_type
9
17
  @file_size = File.size(@file_path)
18
+ # 2. Large file safeguard
19
+ if @file_size > MAX_FILE_SIZE
20
+ raise LargeFileError, "File size #{@file_size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
21
+ end
10
22
  @filename_validation = validate_filename_encoding
23
+ # 3. Encoding validation and cleaning for text files
24
+ if @content_type =~ /text|plain/
25
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
26
+ unless validation[:valid]
27
+ @cleaned_text_content = UniversalDocumentProcessor.clean_text(validation[:content], {
28
+ remove_null_bytes: true,
29
+ remove_control_chars: true,
30
+ normalize_whitespace: true
31
+ })
32
+ else
33
+ @cleaned_text_content = nil
34
+ end
35
+ end
11
36
  end
12
37
 
13
38
  def process
14
- {
15
- file_path: @file_path,
16
- content_type: @content_type,
17
- file_size: @file_size,
18
- text_content: extract_text,
19
- metadata: metadata,
20
- images: extract_images,
21
- tables: extract_tables,
22
- filename_info: filename_info,
23
- processed_at: Time.current
24
- }
39
+ begin
40
+ {
41
+ file_path: @file_path,
42
+ content_type: @content_type,
43
+ file_size: @file_size,
44
+ text_content: extract_text,
45
+ metadata: metadata,
46
+ images: extract_images,
47
+ tables: extract_tables,
48
+ filename_info: filename_info,
49
+ processed_at: Time.current
50
+ }
51
+ rescue LargeFileError, FileValidationError => e
52
+ { error: e.class.name, message: e.message, file_path: @file_path }
53
+ rescue => e
54
+ { error: 'ProcessingError', message: e.message, file_path: @file_path }
55
+ end
25
56
  end
26
57
 
27
58
  def extract_text
59
+ # Use cleaned text if available (from encoding validation)
60
+ return @cleaned_text_content if defined?(@cleaned_text_content) && @cleaned_text_content
28
61
  processor.extract_text
29
62
  rescue => e
30
63
  fallback_text_extraction
@@ -253,13 +286,97 @@ module UniversalDocumentProcessor
253
286
  end
254
287
 
255
288
  def convert_to_pdf
256
- # Implementation for PDF conversion
257
- raise NotImplementedError, "PDF conversion not yet implemented"
289
+ ensure_prawn_available!
290
+
291
+ output_path = @file_path.gsub(File.extname(@file_path), '.pdf')
292
+
293
+ Prawn::Document.generate(output_path) do |pdf|
294
+ # Add title
295
+ pdf.font_size 18
296
+ pdf.text "Document: #{File.basename(@file_path)}", style: :bold
297
+ pdf.move_down 20
298
+
299
+ # Add metadata section
300
+ pdf.font_size 12
301
+ pdf.text "Document Information", style: :bold
302
+ pdf.move_down 10
303
+
304
+ metadata_info = metadata
305
+ pdf.text "File Size: #{format_file_size(@file_size)}"
306
+ pdf.text "Content Type: #{@content_type}"
307
+ pdf.text "Created: #{metadata_info[:created_at]}" if metadata_info[:created_at]
308
+ pdf.text "Modified: #{metadata_info[:modified_at]}" if metadata_info[:modified_at]
309
+ pdf.move_down 20
310
+
311
+ # Add content section
312
+ pdf.text "Content", style: :bold
313
+ pdf.move_down 10
314
+
315
+ text_content = extract_text
316
+ if text_content && !text_content.strip.empty?
317
+ pdf.font_size 10
318
+ pdf.text text_content
319
+ else
320
+ pdf.text "No text content available for this document."
321
+ end
322
+
323
+ # Add tables if available
324
+ tables = extract_tables
325
+ unless tables.empty?
326
+ pdf.start_new_page
327
+ pdf.font_size 12
328
+ pdf.text "Tables", style: :bold
329
+ pdf.move_down 10
330
+
331
+ tables.each_with_index do |table, index|
332
+ pdf.text "Table #{index + 1}", style: :bold
333
+ pdf.move_down 5
334
+
335
+ if table[:content] && !table[:content].empty?
336
+ # Format table data for Prawn
337
+ table_data = table[:content].first(20) # Limit to first 20 rows
338
+ pdf.table(table_data, header: true) do
339
+ row(0).font_style = :bold
340
+ cells.size = 8
341
+ cells.padding = 3
342
+ end
343
+ end
344
+ pdf.move_down 15
345
+ end
346
+ end
347
+ end
348
+
349
+ output_path
350
+ rescue => e
351
+ raise ProcessingError, "Failed to create PDF: #{e.message}"
258
352
  end
259
353
 
260
354
  def convert_to_html
261
355
  # Implementation for HTML conversion
262
356
  raise NotImplementedError, "HTML conversion not yet implemented"
263
357
  end
358
+
359
+ private
360
+
361
# Guard used before PDF generation: does nothing when the Prawn constant is
# defined, otherwise raises with installation instructions.
#
# @raise [DependencyMissingError] when Prawn is not loaded
def ensure_prawn_available!
  return if defined?(Prawn)

  raise DependencyMissingError, "PDF creation requires the 'prawn' gem. Install it with: gem install prawn -v '~> 2.4'"
end
366
+
367
# Format a byte count as a short human-readable string using 1024-based
# units.
#
# Sizes below 1 KB are printed as whole bytes ("512 B"), matching the
# "0 B" used for empty input; larger sizes are rounded to two decimals
# ("1.5 KB", "50.0 MB").
#
# @param bytes [Numeric, nil] size in bytes; nil is treated as 0
# @return [String] the formatted size
def format_file_size(bytes)
  return "0 B" if bytes.nil? || bytes.zero?

  units = %w[B KB MB GB TB]
  size = bytes.to_f
  unit_index = 0

  # Step up one unit at a time, stopping at the largest unit available.
  while size >= 1024 && unit_index < units.length - 1
    size /= 1024
    unit_index += 1
  end

  value = unit_index.zero? ? size.round : size.round(2)
  "#{value} #{units[unit_index]}"
end
264
381
  end
265
382
  end
@@ -91,6 +91,32 @@ module UniversalDocumentProcessor
91
91
  super + [:list_files, :extract_file, :extract_all, :analyze_security]
92
92
  end
93
93
 
94
+ # Class method to create a zip file from a list of files or a directory
95
# Build a zip archive from a directory tree or from an explicit file list.
#
# For a directory, every non-directory entry found by the recursive glob is
# added under its path relative to that directory. For an Array, each file
# is added under its basename.
# NOTE(review): two Array entries with the same basename map to the same
# entry name - confirm how callers expect that to be handled.
#
# The archive itself is never added, so re-running with an output path
# that lies inside the source directory does not zip the archive into
# itself.
#
# @param output_zip_path [String] path of the archive to write
# @param files_or_directory [String, Array<String>] a directory path or a
#   list of file paths
# @return [String] output_zip_path
# @raise [ArgumentError] if files_or_directory is neither an existing
#   directory path nor an Array
def self.create_zip(output_zip_path, files_or_directory)
  require 'zip' # loaded lazily, only when an archive is actually built

  if files_or_directory.is_a?(String) && File.directory?(files_or_directory)
    base_dir = files_or_directory
    # Recursively collect all files in the directory, in a stable order
    files = Dir.glob(File.join(base_dir, '**', '*')).reject { |path| File.directory?(path) }.sort
  elsif files_or_directory.is_a?(Array)
    base_dir = nil
    files = files_or_directory
  else
    raise ArgumentError, 'files_or_directory must be a directory path or an array of file paths'
  end

  archive_path = File.expand_path(output_zip_path)
  files = files.reject { |path| File.expand_path(path) == archive_path }

  # \A anchors at the start of the whole string rather than at any line start.
  base_prefix = base_dir && /\A#{Regexp.escape(base_dir)}\/?/

  Zip::File.open(output_zip_path, Zip::File::CREATE) do |zipfile|
    files.each do |file|
      entry_name = base_prefix ? file.sub(base_prefix, '') : File.basename(file)
      zipfile.add(entry_name, file)
    end
  end
  output_zip_path
end
119
+
94
120
  private
95
121
 
96
122
  def detect_archive_type
@@ -3,6 +3,8 @@ module UniversalDocumentProcessor
3
3
  class BaseProcessor
4
4
  attr_reader :file_path, :options
5
5
 
6
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
7
+
6
8
  def initialize(file_path, options = {})
7
9
  @file_path = file_path
8
10
  @options = options
@@ -11,6 +13,17 @@ module UniversalDocumentProcessor
11
13
  def extract_text
12
14
  # Fallback to universal text extraction
13
15
  if defined?(Yomu)
16
+ # Encoding validation for text files
17
+ if File.extname(@file_path) =~ /\.(txt|csv|tsv|md|json|xml|html|htm)$/i
18
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
19
+ unless validation[:valid]
20
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
21
+ remove_null_bytes: true,
22
+ remove_control_chars: true,
23
+ normalize_whitespace: true
24
+ })
25
+ end
26
+ end
14
27
  Yomu.new(@file_path).text
15
28
  else
16
29
  raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"
@@ -49,6 +62,10 @@ module UniversalDocumentProcessor
49
62
  def validate_file
50
63
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)
51
64
  raise ProcessingError, "File is empty: #{@file_path}" if File.zero?(@file_path)
65
+ # Large file safeguard
66
+ if File.size(@file_path) > MAX_FILE_SIZE
67
+ raise ProcessingError, "File size #{File.size(@file_path)} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
68
+ end
52
69
  end
53
70
 
54
71
  def with_error_handling
@@ -6,11 +6,32 @@ require 'csv'
6
6
  module UniversalDocumentProcessor
7
7
  module Processors
8
8
  class ExcelProcessor < BaseProcessor
9
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
10
+
9
11
  def extract_text
12
+ validate_file
10
13
  with_error_handling do
11
14
  if @file_path.end_with?('.csv')
15
+ # Encoding validation for CSV
16
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
17
+ unless validation[:valid]
18
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
19
+ remove_null_bytes: true,
20
+ remove_control_chars: true,
21
+ normalize_whitespace: true
22
+ })
23
+ end
12
24
  extract_csv_text
13
25
  elsif @file_path.end_with?('.tsv')
26
+ # Encoding validation for TSV
27
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
28
+ unless validation[:valid]
29
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
30
+ remove_null_bytes: true,
31
+ remove_control_chars: true,
32
+ normalize_whitespace: true
33
+ })
34
+ end
14
35
  extract_tsv_text
15
36
  elsif @file_path.end_with?('.xlsx')
16
37
  extract_xlsx_text_builtin
@@ -208,6 +229,15 @@ module UniversalDocumentProcessor
208
229
 
209
230
  private
210
231
 
232
# Pre-flight checks on @file_path before spreadsheet/CSV extraction.
#
# The size is read once so the empty check, the limit check and the error
# message all refer to the same value.
#
# @return [nil] when every check passes
# @raise [ProcessingError] if the file does not exist, is empty, or is
#   larger than MAX_FILE_SIZE
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?

  # Large file safeguard
  return unless size > MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
240
+
211
241
  # CSV Processing Methods
212
242
  def extract_csv_text
213
243
  content = File.read(@file_path, encoding: 'UTF-8')
@@ -1,12 +1,23 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class PdfProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
5
7
  ensure_pdf_reader_available!
6
-
8
+ validate_file
7
9
  with_error_handling do
8
10
  reader = PDF::Reader.new(@file_path)
9
11
  text = reader.pages.map(&:text).join("\n")
12
+ # Encoding validation for extracted text
13
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
14
+ unless validation[:valid]
15
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
16
+ remove_null_bytes: true,
17
+ remove_control_chars: true,
18
+ normalize_whitespace: true
19
+ })
20
+ end
10
21
  text.strip.empty? ? "No text content found in PDF" : text
11
22
  end
12
23
  rescue => e
@@ -104,6 +115,15 @@ module UniversalDocumentProcessor
104
115
  end
105
116
  end
106
117
 
118
# Pre-flight checks on @file_path before PDF extraction.
#
# The size is read once so the empty check, the limit check and the error
# message all refer to the same value.
#
# @return [nil] when every check passes
# @raise [ProcessingError] if the file does not exist, is empty, or is
#   larger than MAX_FILE_SIZE
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?

  # Large file safeguard
  return unless size > MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
126
+
107
127
  def extract_form_fields(reader)
108
128
  # Extract PDF form fields if present
109
129
  []
@@ -1,7 +1,10 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class TextProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
7
+ validate_file
5
8
  with_error_handling do
6
9
  case detect_text_format
7
10
  when :rtf
@@ -15,6 +18,15 @@ module UniversalDocumentProcessor
15
18
  when :json
16
19
  extract_json_text
17
20
  else
21
+ # Encoding validation for plain text
22
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
23
+ unless validation[:valid]
24
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
25
+ remove_null_bytes: true,
26
+ remove_control_chars: true,
27
+ normalize_whitespace: true
28
+ })
29
+ end
18
30
  extract_plain_text
19
31
  end
20
32
  end
@@ -81,6 +93,15 @@ module UniversalDocumentProcessor
81
93
 
82
94
  private
83
95
 
96
# Pre-flight checks on @file_path before text extraction.
#
# The size is read once so the empty check, the limit check and the error
# message all refer to the same value.
#
# @return [nil] when every check passes
# @raise [ProcessingError] if the file does not exist, is empty, or is
#   larger than MAX_FILE_SIZE
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?

  # Large file safeguard
  return unless size > MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
104
+
84
105
  def detect_text_format
85
106
  extension = File.extname(@file_path).downcase
86
107
  case extension
@@ -1,11 +1,32 @@
1
1
  module UniversalDocumentProcessor
2
2
  module Processors
3
3
  class WordProcessor < BaseProcessor
4
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
5
+
4
6
  def extract_text
7
+ validate_file
5
8
  with_error_handling do
6
9
  if @file_path.end_with?('.docx')
10
+ # Encoding validation for docx (if possible)
11
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
12
+ unless validation[:valid]
13
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
14
+ remove_null_bytes: true,
15
+ remove_control_chars: true,
16
+ normalize_whitespace: true
17
+ })
18
+ end
7
19
  extract_docx_text
8
20
  elsif @file_path.end_with?('.doc')
21
+ # Encoding validation for doc (if possible)
22
+ validation = UniversalDocumentProcessor.validate_file(@file_path)
23
+ unless validation[:valid]
24
+ return UniversalDocumentProcessor.clean_text(validation[:content], {
25
+ remove_null_bytes: true,
26
+ remove_control_chars: true,
27
+ normalize_whitespace: true
28
+ })
29
+ end
9
30
  # Built-in .doc file processing
10
31
  fallback_text_extraction
11
32
  else
@@ -90,6 +111,15 @@ module UniversalDocumentProcessor
90
111
 
91
112
  private
92
113
 
114
# Pre-flight checks on @file_path before Word document extraction.
#
# The size is read once so the empty check, the limit check and the error
# message all refer to the same value.
#
# @return [nil] when every check passes
# @raise [ProcessingError] if the file does not exist, is empty, or is
#   larger than MAX_FILE_SIZE
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?

  # Large file safeguard
  return unless size > MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
122
+
93
123
  def ensure_docx_available!
94
124
  unless defined?(Docx)
95
125
  raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
@@ -1,3 +1,3 @@
1
1
  module UniversalDocumentProcessor
2
- VERSION = "1.0.5"
2
+ VERSION = "1.1.1"
3
3
  end
@@ -206,6 +206,16 @@ module UniversalDocumentProcessor
206
206
  Document.new(file_path_or_io, options).convert_to(target_format)
207
207
  end
208
208
 
209
+ # Create PDF from any supported document
210
# Convenience wrapper: build a Document and convert it to PDF.
#
# @param file_path passed unchanged to Document.new
# @param options [Hash] passed unchanged to Document.new
# @return whatever Document#convert_to returns for :pdf
def self.create_pdf(file_path, options = {})
  document = Document.new(file_path, options)
  document.convert_to(:pdf)
end
213
+
214
+ # Check if PDF creation is available
215
# Report whether the Prawn constant is currently defined.
#
# Returns a real boolean; a bare defined? would hand callers the string
# "constant" or nil.
#
# @return [Boolean] true when Prawn is defined, false otherwise
def self.pdf_creation_available?
  defined?(Prawn) ? true : false
end
218
+
209
219
  # Batch process multiple documents
210
220
  def self.batch_process(file_paths, options = {})
211
221
  file_paths.map do |file_path|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: universal_document_processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Vikas Patil
@@ -201,7 +201,6 @@ files:
201
201
  - README.md
202
202
  - Rakefile
203
203
  - USER_GUIDE.md
204
- - debug_test.rb
205
204
  - lib/universal_document_processor.rb
206
205
  - lib/universal_document_processor/ai_agent.rb
207
206
  - lib/universal_document_processor/document.rb
@@ -217,10 +216,6 @@ files:
217
216
  - lib/universal_document_processor/utils/file_detector.rb
218
217
  - lib/universal_document_processor/utils/japanese_filename_handler.rb
219
218
  - lib/universal_document_processor/version.rb
220
- - test_ai_dependency.rb
221
- - test_core_functionality.rb
222
- - test_performance_memory.rb
223
- - test_published_gem.rb
224
219
  homepage: https://github.com/vpatil160/universal_document_processor
225
220
  licenses:
226
221
  - MIT
data/debug_test.rb DELETED
@@ -1,35 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # Add lib directory to load path
4
- $LOAD_PATH.unshift File.expand_path('lib', __dir__)
5
-
6
- # Load the gem
7
- require 'universal_document_processor'
8
- require 'tempfile'
9
-
10
- # Create a simple text file
11
- txt_file = Tempfile.new(['test', '.txt'])
12
- txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
13
- txt_file.close
14
-
15
- puts "Testing text file: #{txt_file.path}"
16
-
17
- begin
18
- puts "Processing file..."
19
- result = UniversalDocumentProcessor.process(txt_file.path)
20
-
21
- puts "Result keys: #{result.keys}"
22
- puts "Result type: #{result.class}"
23
-
24
- if result.is_a?(Hash)
25
- result.each do |key, value|
26
- puts "#{key}: #{value.class} - #{value.to_s[0..100]}..."
27
- end
28
- end
29
-
30
- rescue => e
31
- puts "Error: #{e.class} - #{e.message}"
32
- puts e.backtrace.first(5)
33
- end
34
-
35
- txt_file.unlink
@@ -1,80 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # Add lib directory to load path
4
- $LOAD_PATH.unshift File.expand_path('lib', __dir__)
5
-
6
- # Load the gem
7
- require 'universal_document_processor'
8
-
9
- puts "Testing AI Dependency Handling"
10
- puts "=" * 50
11
-
12
- # Test 1: Check AI availability without API key
13
- puts "\n1. Testing AI availability without API key:"
14
- ai_available = UniversalDocumentProcessor.ai_available?
15
- puts " AI Available: #{ai_available}"
16
-
17
- # Test 2: Create AI agent without API key
18
- puts "\n2. Creating AI agent without API key:"
19
- agent = UniversalDocumentProcessor.create_ai_agent
20
- puts " Agent created: #{agent.class}"
21
- puts " AI enabled: #{agent.ai_enabled}"
22
- puts " AI available: #{agent.ai_available?}"
23
-
24
- # Test 3: Try to use AI methods without API key
25
- puts "\n3. Testing AI methods without API key:"
26
-
27
- # Create a sample text file
28
- require 'tempfile'
29
- sample_file = Tempfile.new(['test', '.txt'])
30
- sample_file.write("This is a test document for AI processing.")
31
- sample_file.close
32
-
33
- begin
34
- result = UniversalDocumentProcessor.ai_analyze(sample_file.path)
35
- puts " ERROR: Should have raised an exception!"
36
- rescue UniversalDocumentProcessor::DependencyMissingError => e
37
- puts " ✓ Correctly raised DependencyMissingError: #{e.message}"
38
- rescue => e
39
- puts " ✗ Unexpected error: #{e.class} - #{e.message}"
40
- end
41
-
42
- # Test 4: Check available features
43
- puts "\n4. Available features:"
44
- features = UniversalDocumentProcessor.available_features
45
- puts " Features: #{features.join(', ')}"
46
- puts " AI processing included: #{features.include?(:ai_processing)}"
47
-
48
- # Test 5: Check optional dependencies
49
- puts "\n5. Optional dependencies:"
50
- optional_deps = UniversalDocumentProcessor.optional_dependencies
51
- puts " Optional dependencies: #{optional_deps.keys.join(', ')}"
52
-
53
- missing_deps = UniversalDocumentProcessor.missing_dependencies
54
- puts " Missing dependencies: #{missing_deps.join(', ')}"
55
-
56
- # Test 6: Installation instructions
57
- puts "\n6. Installation instructions:"
58
- instructions = UniversalDocumentProcessor.installation_instructions
59
- puts instructions
60
-
61
- # Test 7: Test with API key if provided
62
- if ENV['OPENAI_API_KEY'] && !ENV['OPENAI_API_KEY'].empty?
63
- puts "\n7. Testing with API key:"
64
- ai_available_with_key = UniversalDocumentProcessor.ai_available?
65
- puts " AI Available with key: #{ai_available_with_key}"
66
-
67
- agent_with_key = UniversalDocumentProcessor.create_ai_agent
68
- puts " Agent AI enabled: #{agent_with_key.ai_enabled}"
69
- else
70
- puts "\n7. Skipping API key test (OPENAI_API_KEY not set)"
71
- end
72
-
73
- # Clean up
74
- sample_file.unlink
75
-
76
- puts "\n" + "=" * 50
77
- puts "AI Dependency Test Complete!"
78
- puts "✓ AI features are properly optional"
79
- puts "✓ Clear error messages when dependencies missing"
80
- puts "✓ Graceful degradation when features unavailable"