universal_document_processor 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +237 -2
- data/lib/universal_document_processor/ai_agent.rb +48 -49
- data/lib/universal_document_processor/document.rb +130 -13
- data/lib/universal_document_processor/processors/archive_processor.rb +26 -0
- data/lib/universal_document_processor/processors/base_processor.rb +17 -0
- data/lib/universal_document_processor/processors/excel_processor.rb +30 -0
- data/lib/universal_document_processor/processors/pdf_processor.rb +21 -1
- data/lib/universal_document_processor/processors/text_processor.rb +21 -0
- data/lib/universal_document_processor/processors/word_processor.rb +30 -0
- data/lib/universal_document_processor/version.rb +1 -1
- data/lib/universal_document_processor.rb +10 -0
- metadata +1 -6
- data/debug_test.rb +0 -35
- data/test_ai_dependency.rb +0 -80
- data/test_core_functionality.rb +0 -280
- data/test_performance_memory.rb +0 -271
- data/test_published_gem.rb +0 -349
|
@@ -2,29 +2,62 @@ module UniversalDocumentProcessor
|
|
|
2
2
|
class Document
|
|
3
3
|
attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
|
|
4
4
|
|
|
5
|
+
class LargeFileError < StandardError; end
|
|
6
|
+
class FileValidationError < StandardError; end
|
|
7
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
8
|
+
|
|
5
9
|
# Wraps a document given either as a path or as some other (IO-like) object.
#
# A String is run through normalize_file_path; any other argument is handed
# to save_temp_file and the value it returns is used as the path.
#
# @param file_path_or_io [String, Object] path, or an object accepted by save_temp_file
# @param options [Hash] stored untouched and exposed through #options
# @raise [FileValidationError] if the resolved path is missing or unreadable
# @raise [LargeFileError] if the file is bigger than MAX_FILE_SIZE bytes
def initialize(file_path_or_io, options = {})
  @file_path =
    if file_path_or_io.is_a?(String)
      normalize_file_path(file_path_or_io)
    else
      save_temp_file(file_path_or_io)
    end
  @options = options

  # 1. Existence and readability come first; everything below reads the file.
  usable = File.exist?(@file_path) && File.readable?(@file_path)
  raise FileValidationError, "File is missing or unreadable: #{@file_path}" unless usable

  @content_type = detect_content_type
  @file_size = File.size(@file_path)

  # 2. Refuse oversized files.
  raise LargeFileError, "File size #{@file_size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)" if @file_size > MAX_FILE_SIZE

  @filename_validation = validate_filename_encoding

  # 3. Only text-like content types get encoding validation; for every other
  #    type @cleaned_text_content is deliberately left unset.
  return unless @content_type =~ /text|plain/

  validation = UniversalDocumentProcessor.validate_file(@file_path)
  @cleaned_text_content =
    if validation[:valid]
      nil
    else
      # Invalid encoding: keep a scrubbed copy that extract_text can return.
      UniversalDocumentProcessor.clean_text(validation[:content], {
        remove_null_bytes: true,
        remove_control_chars: true,
        normalize_whitespace: true
      })
    end
end
|
|
12
37
|
|
|
13
38
|
# Runs every extractor and bundles the results into a single hash.
#
# Failures are reported in-band instead of being raised: any StandardError
# raised while building the result is turned into an error hash.
#
# @return [Hash] on success: :file_path, :content_type, :file_size,
#   :text_content, :metadata, :images, :tables, :filename_info, :processed_at;
#   on failure: :error (class name or 'ProcessingError'), :message, :file_path
def process
  {
    file_path: @file_path,
    content_type: @content_type,
    file_size: @file_size,
    text_content: extract_text,
    metadata: metadata,
    images: extract_images,
    tables: extract_tables,
    filename_info: filename_info,
    # Time.current is an ActiveSupport extension, not core Ruby. Without it the
    # NoMethodError would be swallowed by the rescue below and every call would
    # return an error hash, so fall back to Time.now when it is unavailable.
    processed_at: Time.respond_to?(:current) ? Time.current : Time.now
  }
rescue LargeFileError, FileValidationError => e
  { error: e.class.name, message: e.message, file_path: @file_path }
rescue => e
  { error: 'ProcessingError', message: e.message, file_path: @file_path }
end
|
|
26
57
|
|
|
27
58
|
def extract_text
|
|
59
|
+
# Use cleaned text if available (from encoding validation)
|
|
60
|
+
return @cleaned_text_content if defined?(@cleaned_text_content) && @cleaned_text_content
|
|
28
61
|
processor.extract_text
|
|
29
62
|
rescue => e
|
|
30
63
|
fallback_text_extraction
|
|
@@ -253,13 +286,97 @@ module UniversalDocumentProcessor
|
|
|
253
286
|
end
|
|
254
287
|
|
|
255
288
|
# Renders a summary PDF (title, file information, extracted text and up to
# 20 rows of each extracted table) next to the source file.
#
# The output path is the source path with its trailing extension replaced by
# ".pdf"; a path without an extension simply gets ".pdf" appended.
#
# @return [String] path of the generated PDF
# @raise [ProcessingError] wrapping any StandardError raised on the way,
#   including the DependencyMissingError from ensure_prawn_available!
def convert_to_pdf
  ensure_prawn_available!

  # Replace only the extension at the very end of the path. A plain gsub on
  # the extension string would also rewrite matches earlier in the path and,
  # for an extension-less file, insert ".pdf" between every character.
  # NOTE(review): a source that already ends in ".pdf" maps onto itself and
  # would be overwritten - confirm callers never route PDFs here.
  extension = File.extname(@file_path)
  output_path = @file_path.sub(/#{Regexp.escape(extension)}\z/, '') + '.pdf'

  Prawn::Document.generate(output_path) do |pdf|
    # Title
    pdf.font_size 18
    pdf.text "Document: #{File.basename(@file_path)}", style: :bold
    pdf.move_down 20

    # Metadata section
    pdf.font_size 12
    pdf.text "Document Information", style: :bold
    pdf.move_down 10

    metadata_info = metadata
    pdf.text "File Size: #{format_file_size(@file_size)}"
    pdf.text "Content Type: #{@content_type}"
    pdf.text "Created: #{metadata_info[:created_at]}" if metadata_info[:created_at]
    pdf.text "Modified: #{metadata_info[:modified_at]}" if metadata_info[:modified_at]
    pdf.move_down 20

    # Content section
    pdf.text "Content", style: :bold
    pdf.move_down 10

    text_content = extract_text
    if text_content && !text_content.strip.empty?
      pdf.font_size 10
      pdf.text text_content
    else
      pdf.text "No text content available for this document."
    end

    # Tables, when there are any. Array() lets a nil result count as "none".
    tables = Array(extract_tables)
    unless tables.empty?
      pdf.start_new_page
      pdf.font_size 12
      pdf.text "Tables", style: :bold
      pdf.move_down 10

      tables.each_with_index do |table, index|
        pdf.text "Table #{index + 1}", style: :bold
        pdf.move_down 5

        if table[:content] && !table[:content].empty?
          # Only the first 20 rows are rendered to keep the PDF small.
          table_data = table[:content].first(20)
          # NOTE(review): recent Prawn releases ship #table in the separate
          # prawn-table gem - confirm it is loaded wherever Prawn is.
          pdf.table(table_data, header: true) do
            row(0).font_style = :bold
            cells.size = 8
            cells.padding = 3
          end
        end
        pdf.move_down 15
      end
    end
  end

  output_path
rescue => e
  raise ProcessingError, "Failed to create PDF: #{e.message}"
end
|
|
259
353
|
|
|
260
354
|
# Placeholder for HTML conversion; nothing is implemented yet.
#
# NotImplementedError descends from ScriptError rather than StandardError, so
# a bare `rescue` in a caller will not catch it.
#
# @raise [NotImplementedError] always
def convert_to_html
  raise(NotImplementedError, "HTML conversion not yet implemented")
end
|
|
358
|
+
|
|
359
|
+
private
|
|
360
|
+
|
|
361
|
+
# Fails fast when the Prawn constant has not been loaded.
#
# @return [nil] when Prawn is defined
# @raise [DependencyMissingError] with an install hint when it is not
def ensure_prawn_available!
  return if defined?(Prawn)

  raise DependencyMissingError, "PDF creation requires the 'prawn' gem. Install it with: gem install prawn -v '~> 2.4'"
end
|
|
366
|
+
|
|
367
|
+
# Formats a byte count as a short human-readable string using 1024-based
# units, e.g. 1536 => "1.5 KB". Values are rounded to two decimals and the
# largest unit is GB.
#
# @param bytes [Numeric, nil] size in bytes
# @return [String] "0 B" for nil, zero or negative input, otherwise "<n> <unit>"
def format_file_size(bytes)
  # nil used to come out as "0.0 B" and negative sizes as a negative byte
  # count; neither is a meaningful file size, so report them as empty.
  return "0 B" if bytes.nil? || bytes <= 0

  units = %w[B KB MB GB]
  size = bytes.to_f
  unit_index = 0

  # Step up one unit at a time, stopping at the last known unit.
  while size >= 1024 && unit_index < units.length - 1
    size /= 1024
    unit_index += 1
  end

  "#{size.round(2)} #{units[unit_index]}"
end
|
|
264
381
|
end
|
|
265
382
|
end
|
|
@@ -91,6 +91,32 @@ module UniversalDocumentProcessor
|
|
|
91
91
|
super + [:list_files, :extract_file, :extract_all, :analyze_security]
|
|
92
92
|
end
|
|
93
93
|
|
|
94
|
+
# Creates a zip archive from a directory or from an explicit list of files.
#
# * Directory: every non-hidden file below it is added recursively, stored
#   under its path relative to the directory.
# * Array: each file is stored under its basename only, so two inputs that
#   share a basename collide inside the archive.
#
# @param output_zip_path [String] where the archive is written
# @param files_or_directory [String, Array<String>] directory path or file paths
# @return [String] output_zip_path
# @raise [ArgumentError] if files_or_directory is neither a directory path nor an Array
def self.create_zip(output_zip_path, files_or_directory)
  require 'zip' # loaded lazily: rubyzip is only needed when an archive is written

  entries =
    if files_or_directory.is_a?(String) && File.directory?(files_or_directory)
      archive = File.expand_path(output_zip_path)
      # `base:` keeps the directory name out of the glob pattern, so names
      # containing glob characters ([ ] { } * ?) still match, and it yields
      # paths that are already relative to the directory.
      Dir.glob(File.join('**', '*'), base: files_or_directory).sort.map do |relative|
        [relative, File.join(files_or_directory, relative)]
      end.reject do |_name, path|
        # Skip directories, and never pack an existing output archive into itself.
        File.directory?(path) || File.expand_path(path) == archive
      end
    elsif files_or_directory.is_a?(Array)
      files_or_directory.map { |file| [File.basename(file), file] }
    else
      raise ArgumentError, 'files_or_directory must be a directory path or an array of file paths'
    end

  Zip::File.open(output_zip_path, Zip::File::CREATE) do |zipfile|
    entries.each { |entry_name, path| zipfile.add(entry_name, path) }
  end
  output_zip_path
end
|
|
119
|
+
|
|
94
120
|
private
|
|
95
121
|
|
|
96
122
|
def detect_archive_type
|
|
@@ -3,6 +3,8 @@ module UniversalDocumentProcessor
|
|
|
3
3
|
class BaseProcessor
|
|
4
4
|
attr_reader :file_path, :options
|
|
5
5
|
|
|
6
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
7
|
+
|
|
6
8
|
def initialize(file_path, options = {})
|
|
7
9
|
@file_path = file_path
|
|
8
10
|
@options = options
|
|
@@ -11,6 +13,17 @@ module UniversalDocumentProcessor
|
|
|
11
13
|
def extract_text
|
|
12
14
|
# Fallback to universal text extraction
|
|
13
15
|
if defined?(Yomu)
|
|
16
|
+
# Encoding validation for text files
|
|
17
|
+
if File.extname(@file_path) =~ /\.(txt|csv|tsv|md|json|xml|html|htm)$/i
|
|
18
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
19
|
+
unless validation[:valid]
|
|
20
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
21
|
+
remove_null_bytes: true,
|
|
22
|
+
remove_control_chars: true,
|
|
23
|
+
normalize_whitespace: true
|
|
24
|
+
})
|
|
25
|
+
end
|
|
26
|
+
end
|
|
14
27
|
Yomu.new(@file_path).text
|
|
15
28
|
else
|
|
16
29
|
raise ProcessingError, "Universal text extraction requires the 'yomu' gem. Install it with: gem install yomu -v '~> 0.2'"
|
|
@@ -49,6 +62,10 @@ module UniversalDocumentProcessor
|
|
|
49
62
|
# Rejects input that cannot be processed.
#
# @return [nil] when the file passes every check
# @raise [ProcessingError] if the path does not exist, the file is empty, or
#   it is larger than MAX_FILE_SIZE bytes
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  # Read the size once so the emptiness check, the limit check and the
  # reported number all describe the same state of the file.
  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?
  return if size <= MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
|
|
53
70
|
|
|
54
71
|
def with_error_handling
|
|
@@ -6,11 +6,32 @@ require 'csv'
|
|
|
6
6
|
module UniversalDocumentProcessor
|
|
7
7
|
module Processors
|
|
8
8
|
class ExcelProcessor < BaseProcessor
|
|
9
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
10
|
+
|
|
9
11
|
def extract_text
|
|
12
|
+
validate_file
|
|
10
13
|
with_error_handling do
|
|
11
14
|
if @file_path.end_with?('.csv')
|
|
15
|
+
# Encoding validation for CSV
|
|
16
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
17
|
+
unless validation[:valid]
|
|
18
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
19
|
+
remove_null_bytes: true,
|
|
20
|
+
remove_control_chars: true,
|
|
21
|
+
normalize_whitespace: true
|
|
22
|
+
})
|
|
23
|
+
end
|
|
12
24
|
extract_csv_text
|
|
13
25
|
elsif @file_path.end_with?('.tsv')
|
|
26
|
+
# Encoding validation for TSV
|
|
27
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
28
|
+
unless validation[:valid]
|
|
29
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
30
|
+
remove_null_bytes: true,
|
|
31
|
+
remove_control_chars: true,
|
|
32
|
+
normalize_whitespace: true
|
|
33
|
+
})
|
|
34
|
+
end
|
|
14
35
|
extract_tsv_text
|
|
15
36
|
elsif @file_path.end_with?('.xlsx')
|
|
16
37
|
extract_xlsx_text_builtin
|
|
@@ -208,6 +229,15 @@ module UniversalDocumentProcessor
|
|
|
208
229
|
|
|
209
230
|
private
|
|
210
231
|
|
|
232
|
+
# Rejects spreadsheet input that cannot be processed.
#
# NOTE(review): this repeats the checks of BaseProcessor#validate_file, which
# this class inherits - consider dropping the override.
#
# @return [nil] when the file passes every check
# @raise [ProcessingError] if the path does not exist, the file is empty, or
#   it is larger than MAX_FILE_SIZE bytes
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  # One stat call feeds both checks and the message, so they cannot disagree.
  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?
  return if size <= MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
|
|
240
|
+
|
|
211
241
|
# CSV Processing Methods
|
|
212
242
|
def extract_csv_text
|
|
213
243
|
content = File.read(@file_path, encoding: 'UTF-8')
|
|
@@ -1,12 +1,23 @@
|
|
|
1
1
|
module UniversalDocumentProcessor
|
|
2
2
|
module Processors
|
|
3
3
|
class PdfProcessor < BaseProcessor
|
|
4
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
5
|
+
|
|
4
6
|
def extract_text
|
|
5
7
|
ensure_pdf_reader_available!
|
|
6
|
-
|
|
8
|
+
validate_file
|
|
7
9
|
with_error_handling do
|
|
8
10
|
reader = PDF::Reader.new(@file_path)
|
|
9
11
|
text = reader.pages.map(&:text).join("\n")
|
|
12
|
+
# Encoding validation for extracted text
|
|
13
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
14
|
+
unless validation[:valid]
|
|
15
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
16
|
+
remove_null_bytes: true,
|
|
17
|
+
remove_control_chars: true,
|
|
18
|
+
normalize_whitespace: true
|
|
19
|
+
})
|
|
20
|
+
end
|
|
10
21
|
text.strip.empty? ? "No text content found in PDF" : text
|
|
11
22
|
end
|
|
12
23
|
rescue => e
|
|
@@ -104,6 +115,15 @@ module UniversalDocumentProcessor
|
|
|
104
115
|
end
|
|
105
116
|
end
|
|
106
117
|
|
|
118
|
+
# Rejects PDF input that cannot be processed.
#
# NOTE(review): this repeats the checks of BaseProcessor#validate_file, which
# this class inherits - consider dropping the override.
#
# @return [nil] when the file passes every check
# @raise [ProcessingError] if the path does not exist, the file is empty, or
#   it is larger than MAX_FILE_SIZE bytes
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  # Measure the file a single time; the checks and the message share the value.
  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?
  return if size <= MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
|
|
126
|
+
|
|
107
127
|
def extract_form_fields(reader)
|
|
108
128
|
# Extract PDF form fields if present
|
|
109
129
|
[]
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
module UniversalDocumentProcessor
|
|
2
2
|
module Processors
|
|
3
3
|
class TextProcessor < BaseProcessor
|
|
4
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
5
|
+
|
|
4
6
|
def extract_text
|
|
7
|
+
validate_file
|
|
5
8
|
with_error_handling do
|
|
6
9
|
case detect_text_format
|
|
7
10
|
when :rtf
|
|
@@ -15,6 +18,15 @@ module UniversalDocumentProcessor
|
|
|
15
18
|
when :json
|
|
16
19
|
extract_json_text
|
|
17
20
|
else
|
|
21
|
+
# Encoding validation for plain text
|
|
22
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
23
|
+
unless validation[:valid]
|
|
24
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
25
|
+
remove_null_bytes: true,
|
|
26
|
+
remove_control_chars: true,
|
|
27
|
+
normalize_whitespace: true
|
|
28
|
+
})
|
|
29
|
+
end
|
|
18
30
|
extract_plain_text
|
|
19
31
|
end
|
|
20
32
|
end
|
|
@@ -81,6 +93,15 @@ module UniversalDocumentProcessor
|
|
|
81
93
|
|
|
82
94
|
private
|
|
83
95
|
|
|
96
|
+
# Rejects text input that cannot be processed.
#
# NOTE(review): this repeats the checks of BaseProcessor#validate_file, which
# this class inherits - consider dropping the override.
#
# @return [nil] when the file passes every check
# @raise [ProcessingError] if the path does not exist, the file is empty, or
#   it is larger than MAX_FILE_SIZE bytes
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  # Query the size once and reuse it for both checks and the error text.
  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?
  return if size <= MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
|
|
104
|
+
|
|
84
105
|
def detect_text_format
|
|
85
106
|
extension = File.extname(@file_path).downcase
|
|
86
107
|
case extension
|
|
@@ -1,11 +1,32 @@
|
|
|
1
1
|
module UniversalDocumentProcessor
|
|
2
2
|
module Processors
|
|
3
3
|
class WordProcessor < BaseProcessor
|
|
4
|
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
|
5
|
+
|
|
4
6
|
def extract_text
|
|
7
|
+
validate_file
|
|
5
8
|
with_error_handling do
|
|
6
9
|
if @file_path.end_with?('.docx')
|
|
10
|
+
# Encoding validation for docx (if possible)
|
|
11
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
12
|
+
unless validation[:valid]
|
|
13
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
14
|
+
remove_null_bytes: true,
|
|
15
|
+
remove_control_chars: true,
|
|
16
|
+
normalize_whitespace: true
|
|
17
|
+
})
|
|
18
|
+
end
|
|
7
19
|
extract_docx_text
|
|
8
20
|
elsif @file_path.end_with?('.doc')
|
|
21
|
+
# Encoding validation for doc (if possible)
|
|
22
|
+
validation = UniversalDocumentProcessor.validate_file(@file_path)
|
|
23
|
+
unless validation[:valid]
|
|
24
|
+
return UniversalDocumentProcessor.clean_text(validation[:content], {
|
|
25
|
+
remove_null_bytes: true,
|
|
26
|
+
remove_control_chars: true,
|
|
27
|
+
normalize_whitespace: true
|
|
28
|
+
})
|
|
29
|
+
end
|
|
9
30
|
# Built-in .doc file processing
|
|
10
31
|
fallback_text_extraction
|
|
11
32
|
else
|
|
@@ -90,6 +111,15 @@ module UniversalDocumentProcessor
|
|
|
90
111
|
|
|
91
112
|
private
|
|
92
113
|
|
|
114
|
+
# Rejects Word input that cannot be processed.
#
# NOTE(review): this repeats the checks of BaseProcessor#validate_file, which
# this class inherits - consider dropping the override.
#
# @return [nil] when the file passes every check
# @raise [ProcessingError] if the path does not exist, the file is empty, or
#   it is larger than MAX_FILE_SIZE bytes
def validate_file
  raise ProcessingError, "File not found: #{@file_path}" unless File.exist?(@file_path)

  # A single size lookup keeps the two checks and the message consistent.
  size = File.size(@file_path)
  raise ProcessingError, "File is empty: #{@file_path}" if size.zero?
  return if size <= MAX_FILE_SIZE

  raise ProcessingError, "File size #{size} exceeds maximum allowed (#{MAX_FILE_SIZE} bytes)"
end
|
|
122
|
+
|
|
93
123
|
def ensure_docx_available!
|
|
94
124
|
unless defined?(Docx)
|
|
95
125
|
raise DependencyMissingError, "DOCX processing requires the 'docx' gem. Install it with: gem install docx -v '~> 0.8'"
|
|
@@ -206,6 +206,16 @@ module UniversalDocumentProcessor
|
|
|
206
206
|
Document.new(file_path_or_io, options).convert_to(target_format)
|
|
207
207
|
end
|
|
208
208
|
|
|
209
|
+
# Convenience entry point: wraps the input in a Document and asks it to
# convert itself to :pdf.
#
# @param file_path [Object] forwarded unchanged to Document.new
# @param options [Hash] forwarded unchanged to Document.new
# @return whatever Document#convert_to(:pdf) returns
def self.create_pdf(file_path, options = {})
  document = Document.new(file_path, options)
  document.convert_to(:pdf)
end
|
|
213
|
+
|
|
214
|
+
# Reports whether PDF creation can be used, i.e. whether the Prawn constant
# is currently defined.
#
# @return [Boolean] true when Prawn is defined, false otherwise
def self.pdf_creation_available?
  # defined? yields the String "constant" or nil; a predicate should hand
  # back a real boolean, so coerce it explicitly.
  defined?(Prawn) ? true : false
end
|
|
218
|
+
|
|
209
219
|
# Batch process multiple documents
|
|
210
220
|
def self.batch_process(file_paths, options = {})
|
|
211
221
|
file_paths.map do |file_path|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: universal_document_processor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.5
|
|
4
|
+
version: 1.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vikas Patil
|
|
@@ -201,7 +201,6 @@ files:
|
|
|
201
201
|
- README.md
|
|
202
202
|
- Rakefile
|
|
203
203
|
- USER_GUIDE.md
|
|
204
|
-
- debug_test.rb
|
|
205
204
|
- lib/universal_document_processor.rb
|
|
206
205
|
- lib/universal_document_processor/ai_agent.rb
|
|
207
206
|
- lib/universal_document_processor/document.rb
|
|
@@ -217,10 +216,6 @@ files:
|
|
|
217
216
|
- lib/universal_document_processor/utils/file_detector.rb
|
|
218
217
|
- lib/universal_document_processor/utils/japanese_filename_handler.rb
|
|
219
218
|
- lib/universal_document_processor/version.rb
|
|
220
|
-
- test_ai_dependency.rb
|
|
221
|
-
- test_core_functionality.rb
|
|
222
|
-
- test_performance_memory.rb
|
|
223
|
-
- test_published_gem.rb
|
|
224
219
|
homepage: https://github.com/vpatil160/universal_document_processor
|
|
225
220
|
licenses:
|
|
226
221
|
- MIT
|
data/debug_test.rb
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
# Add lib directory to load path
|
|
4
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
|
5
|
-
|
|
6
|
-
# Load the gem
|
|
7
|
-
require 'universal_document_processor'
|
|
8
|
-
require 'tempfile'
|
|
9
|
-
|
|
10
|
-
# Create a simple text file
|
|
11
|
-
txt_file = Tempfile.new(['test', '.txt'])
|
|
12
|
-
txt_file.write("This is a sample text file.\nIt has multiple lines.\nUsed for testing.")
|
|
13
|
-
txt_file.close
|
|
14
|
-
|
|
15
|
-
puts "Testing text file: #{txt_file.path}"
|
|
16
|
-
|
|
17
|
-
begin
|
|
18
|
-
puts "Processing file..."
|
|
19
|
-
result = UniversalDocumentProcessor.process(txt_file.path)
|
|
20
|
-
|
|
21
|
-
puts "Result keys: #{result.keys}"
|
|
22
|
-
puts "Result type: #{result.class}"
|
|
23
|
-
|
|
24
|
-
if result.is_a?(Hash)
|
|
25
|
-
result.each do |key, value|
|
|
26
|
-
puts "#{key}: #{value.class} - #{value.to_s[0..100]}..."
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
rescue => e
|
|
31
|
-
puts "Error: #{e.class} - #{e.message}"
|
|
32
|
-
puts e.backtrace.first(5)
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
txt_file.unlink
|
data/test_ai_dependency.rb
DELETED
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
|
|
3
|
-
# Add lib directory to load path
|
|
4
|
-
$LOAD_PATH.unshift File.expand_path('lib', __dir__)
|
|
5
|
-
|
|
6
|
-
# Load the gem
|
|
7
|
-
require 'universal_document_processor'
|
|
8
|
-
|
|
9
|
-
puts "Testing AI Dependency Handling"
|
|
10
|
-
puts "=" * 50
|
|
11
|
-
|
|
12
|
-
# Test 1: Check AI availability without API key
|
|
13
|
-
puts "\n1. Testing AI availability without API key:"
|
|
14
|
-
ai_available = UniversalDocumentProcessor.ai_available?
|
|
15
|
-
puts " AI Available: #{ai_available}"
|
|
16
|
-
|
|
17
|
-
# Test 2: Create AI agent without API key
|
|
18
|
-
puts "\n2. Creating AI agent without API key:"
|
|
19
|
-
agent = UniversalDocumentProcessor.create_ai_agent
|
|
20
|
-
puts " Agent created: #{agent.class}"
|
|
21
|
-
puts " AI enabled: #{agent.ai_enabled}"
|
|
22
|
-
puts " AI available: #{agent.ai_available?}"
|
|
23
|
-
|
|
24
|
-
# Test 3: Try to use AI methods without API key
|
|
25
|
-
puts "\n3. Testing AI methods without API key:"
|
|
26
|
-
|
|
27
|
-
# Create a sample text file
|
|
28
|
-
require 'tempfile'
|
|
29
|
-
sample_file = Tempfile.new(['test', '.txt'])
|
|
30
|
-
sample_file.write("This is a test document for AI processing.")
|
|
31
|
-
sample_file.close
|
|
32
|
-
|
|
33
|
-
begin
|
|
34
|
-
result = UniversalDocumentProcessor.ai_analyze(sample_file.path)
|
|
35
|
-
puts " ERROR: Should have raised an exception!"
|
|
36
|
-
rescue UniversalDocumentProcessor::DependencyMissingError => e
|
|
37
|
-
puts " ✓ Correctly raised DependencyMissingError: #{e.message}"
|
|
38
|
-
rescue => e
|
|
39
|
-
puts " ✗ Unexpected error: #{e.class} - #{e.message}"
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# Test 4: Check available features
|
|
43
|
-
puts "\n4. Available features:"
|
|
44
|
-
features = UniversalDocumentProcessor.available_features
|
|
45
|
-
puts " Features: #{features.join(', ')}"
|
|
46
|
-
puts " AI processing included: #{features.include?(:ai_processing)}"
|
|
47
|
-
|
|
48
|
-
# Test 5: Check optional dependencies
|
|
49
|
-
puts "\n5. Optional dependencies:"
|
|
50
|
-
optional_deps = UniversalDocumentProcessor.optional_dependencies
|
|
51
|
-
puts " Optional dependencies: #{optional_deps.keys.join(', ')}"
|
|
52
|
-
|
|
53
|
-
missing_deps = UniversalDocumentProcessor.missing_dependencies
|
|
54
|
-
puts " Missing dependencies: #{missing_deps.join(', ')}"
|
|
55
|
-
|
|
56
|
-
# Test 6: Installation instructions
|
|
57
|
-
puts "\n6. Installation instructions:"
|
|
58
|
-
instructions = UniversalDocumentProcessor.installation_instructions
|
|
59
|
-
puts instructions
|
|
60
|
-
|
|
61
|
-
# Test 7: Test with API key if provided
|
|
62
|
-
if ENV['OPENAI_API_KEY'] && !ENV['OPENAI_API_KEY'].empty?
|
|
63
|
-
puts "\n7. Testing with API key:"
|
|
64
|
-
ai_available_with_key = UniversalDocumentProcessor.ai_available?
|
|
65
|
-
puts " AI Available with key: #{ai_available_with_key}"
|
|
66
|
-
|
|
67
|
-
agent_with_key = UniversalDocumentProcessor.create_ai_agent
|
|
68
|
-
puts " Agent AI enabled: #{agent_with_key.ai_enabled}"
|
|
69
|
-
else
|
|
70
|
-
puts "\n7. Skipping API key test (OPENAI_API_KEY not set)"
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
# Clean up
|
|
74
|
-
sample_file.unlink
|
|
75
|
-
|
|
76
|
-
puts "\n" + "=" * 50
|
|
77
|
-
puts "AI Dependency Test Complete!"
|
|
78
|
-
puts "✓ AI features are properly optional"
|
|
79
|
-
puts "✓ Clear error messages when dependencies missing"
|
|
80
|
-
puts "✓ Graceful degradation when features unavailable"
|