universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
module UniversalDocumentProcessor
  module Processors
    # Processor for Microsoft Word files.
    #
    # `.docx` files are read through `Docx::Document`; every other file handled
    # by this class (e.g. legacy `.doc`) falls back to Yomu for text and to the
    # parent class for metadata.
    #
    # NOTE(review): `@file_path` and `with_error_handling` are provided by
    # BaseProcessor, which is not visible from this file.
    class WordProcessor < BaseProcessor
      DOCX_EXTENSION = '.docx'

      # Extracts the plain text of the document.
      #
      # @return [String] paragraph and table text for `.docx`, Yomu output
      #   (or an explanatory message) otherwise
      def extract_text
        with_error_handling do
          docx? ? extract_docx_text : fallback_text_extraction
        end
      end

      # Extracts document metadata.
      #
      # The parent implementation is always consulted first; for `.docx` files
      # its result is enriched with core properties and simple counts.
      #
      # @return [Hash] presumably a Hash, as returned by the parent class
      def extract_metadata
        with_error_handling do
          base_metadata = super
          docx? ? extract_docx_metadata(base_metadata) : base_metadata
        end
      end

      # Lists the images embedded in a `.docx` file.
      #
      # @return [Array<Hash>] one `{ index:, embed_id:, type: }` entry per
      #   `a:blip` element that carries an `r:embed` id; `[]` for non-docx files
      def extract_images
        return [] unless docx?

        with_error_handling do
          document = Docx::Document.open(@file_path)
          # NOTE(review): relies on the document object exposing `doc_xml` — confirm
          # against the installed docx gem version.
          blips = document.doc_xml.xpath('//w:drawing//a:blip')

          # The index is the blip's 1-based position among ALL blips, including
          # ones without an embed id (which are skipped).
          blips.each_with_index.map do |blip, index|
            embed_id = blip['r:embed']
            { index: index + 1, embed_id: embed_id, type: 'embedded' } if embed_id
          end.compact
        end
      end

      # Extracts every table of a `.docx` file.
      #
      # @return [Array<Hash>] `{ index:, rows:, columns:, content: }` per table,
      #   where `content` is an array of rows of cell texts; `[]` for non-docx
      def extract_tables
        return [] unless docx?

        with_error_handling do
          document = Docx::Document.open(@file_path)

          document.tables.each_with_index.map do |table, table_index|
            {
              index: table_index + 1,
              rows: table.rows.length,
              columns: table.column_count,
              content: table.rows.map { |row| row.cells.map(&:text) }
            }
          end
        end
      end

      # Operations advertised by this processor, in addition to the parent's.
      #
      # NOTE(review): :extract_styles and :extract_comments are advertised but
      # not implemented in this class — confirm they exist in BaseProcessor.
      def supported_operations
        super + %i[extract_images extract_tables extract_styles extract_comments]
      end

      private

      # True when the file name ends in ".docx", ignoring letter case so that
      # names such as "REPORT.DOCX" are not sent down the fallback path.
      def docx?
        @file_path.to_s.downcase.end_with?(DOCX_EXTENSION)
      end

      # Joins non-blank paragraphs followed by non-blank table rows (cells
      # separated by " | "), one entry per line.
      #
      # @param document [Docx::Document] an already opened document; opened
      #   from @file_path when omitted
      def extract_docx_text(document = Docx::Document.open(@file_path))
        paragraph_lines = document.paragraphs.map(&:text)
        table_lines = document.tables.flat_map do |table|
          table.rows.map { |row| row.cells.map(&:text).join(' | ') }
        end

        (paragraph_lines + table_lines).reject { |line| line.strip.empty? }.join("\n")
      end

      # Merges docx-specific properties into the parent's metadata.
      #
      # The parent's metadata is passed in explicitly: calling `super` from this
      # helper would dispatch to a parent method of the same (helper) name
      # rather than to `extract_metadata`.
      #
      # @param base_metadata [Hash] metadata produced by the parent class
      # @return [Hash] enriched metadata, or `base_metadata` unchanged when the
      #   document cannot be read (best-effort by design)
      def extract_docx_metadata(base_metadata = {})
        document = Docx::Document.open(@file_path)
        # NOTE(review): relies on the document object exposing `core_properties`
        # — confirm against the installed docx gem version.
        properties = document.core_properties

        base_metadata.merge(
          title: properties.title,
          author: properties.creator,
          subject: properties.subject,
          description: properties.description,
          keywords: properties.keywords,
          created_at: properties.created,
          modified_at: properties.modified,
          last_modified_by: properties.last_modified_by,
          revision: properties.revision,
          word_count: count_words(extract_docx_text(document)),
          paragraph_count: document.paragraphs.length,
          table_count: document.tables.length
        )
      rescue StandardError
        base_metadata
      end

      # Counts whitespace-separated words; nil or blank text counts as 0.
      # Argument-less `split` ignores leading whitespace, which would otherwise
      # add a phantom empty "word".
      def count_words(text)
        text.to_s.split.length
      end

      # Extracts text with Yomu (used for `.doc` and as a general fallback).
      # Failures are reported in the returned string rather than raised.
      def fallback_text_extraction
        Yomu.new(@file_path).text
      rescue StandardError => e
        "Unable to extract text from Word document: #{e.message}"
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
module UniversalDocumentProcessor
  module Utils
    # Detects MIME types and coarse format categories for files.
    class FileDetector
      # Extension (without dot, lower-case) => MIME type.
      MIME_TYPE_MAPPINGS = {
        'pdf' => 'application/pdf',
        'doc' => 'application/msword',
        'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'xls' => 'application/vnd.ms-excel',
        'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'ppt' => 'application/vnd.ms-powerpoint',
        'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'txt' => 'text/plain',
        'rtf' => 'application/rtf',
        'html' => 'text/html',
        'htm' => 'text/html',
        'xml' => 'application/xml',
        'csv' => 'text/csv',
        'json' => 'application/json',
        'jpg' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png' => 'image/png',
        'gif' => 'image/gif',
        'bmp' => 'image/bmp',
        'tiff' => 'image/tiff',
        'tif' => 'image/tiff',
        'zip' => 'application/zip',
        'rar' => 'application/x-rar-compressed',
        '7z' => 'application/x-7z-compressed'
      }.freeze

      # MIME type used when nothing more specific can be determined.
      BINARY_MIME_TYPE = 'application/octet-stream'

      # Ordered [category, pattern] pairs; the first matching pattern wins.
      # Spreadsheet and presentation are tested BEFORE document because every
      # OOXML type contains "officedocument" and would otherwise be classified
      # as :document.
      CATEGORY_PATTERNS = [
        [:pdf, /pdf/],
        [:spreadsheet, /excel|spreadsheet/],
        [:presentation, /powerpoint|presentation/],
        [:document, /word|document/],
        [:image, /image/],
        [:text, /text|plain/],
        [:archive, /zip|archive|compressed/]
      ].freeze

      # Determines the MIME type of a file.
      #
      # Marcel is asked first; when it answers nil or the generic binary type,
      # the file extension is looked up in MIME_TYPE_MAPPINGS.
      #
      # @param file_path [String] path of the file to inspect
      # @return [String] MIME type, 'application/octet-stream' when unknown
      def self.detect(file_path)
        sniffed = Marcel::MimeType.for(Pathname.new(file_path))
        return sniffed if sniffed && sniffed != BINARY_MIME_TYPE

        extension = File.extname(file_path).downcase.delete('.')
        MIME_TYPE_MAPPINGS.fetch(extension, BINARY_MIME_TYPE)
      end

      # @param file_path [String]
      # @return [Boolean] whether the detected MIME type is in
      #   {supported_mime_types}. The generic binary type is part of that list,
      #   so files that fall through to it are reported as supported.
      def self.supported?(file_path)
        supported_mime_types.include?(detect(file_path))
      end

      # @return [Array<String>] every known MIME type plus the generic binary
      #   type, without duplicates
      def self.supported_mime_types
        (MIME_TYPE_MAPPINGS.values + [
          BINARY_MIME_TYPE,
          'text/plain',
          'text/html',
          'application/xml'
        ]).uniq
      end

      # Maps a file to a coarse category based on its detected MIME type.
      #
      # @param file_path [String]
      # @return [Symbol] :pdf, :document, :spreadsheet, :presentation, :image,
      #   :text, :archive or :unknown
      def self.format_category(file_path)
        category_for_mime(detect(file_path))
      end

      # Maps a MIME type string to a coarse category (see CATEGORY_PATTERNS).
      #
      # @param mime_type [String, nil]
      # @return [Symbol] the first matching category, :unknown otherwise
      def self.category_for_mime(mime_type)
        mime = mime_type.to_s
        match = CATEGORY_PATTERNS.find { |_category, pattern| pattern.match?(mime) }
        match ? match.first : :unknown
      end

      # @param mime_type [String]
      # @return [String] the first extension mapped to the MIME type, 'bin'
      #   when the type is not in MIME_TYPE_MAPPINGS
      def self.extension_from_mime(mime_type)
        MIME_TYPE_MAPPINGS.key(mime_type) || 'bin'
      end
    end
  end
end
@@ -0,0 +1,205 @@
1
module UniversalDocumentProcessor
  module Utils
    # Helpers for validating, normalizing and sanitizing file names that may
    # contain Japanese characters.
    class JapaneseFilenameHandler
      # Japanese filename character ranges
      HIRAGANA_RANGE = /[\u{3040}-\u{309F}]/
      KATAKANA_RANGE = /[\u{30A0}-\u{30FF}]/
      KANJI_RANGE = /[\u{4E00}-\u{9FAF}]/
      FULLWIDTH_RANGE = /[\u{FF00}-\u{FFEF}]/

      # Combined Japanese character pattern
      JAPANESE_CHARS = /[\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FAF}\u{FF00}-\u{FFEF}]/

      # Valid filename characters (including Japanese)
      VALID_FILENAME_CHARS = /\A[\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{4E00}-\u{9FAF}\u{FF00}-\u{FFEF}\w\s\-_.()@#$%&+=!~]*\z/

      # Characters replaced by "_" in safe file names: Windows-forbidden
      # punctuation plus NUL/control characters.
      UNSAFE_CHARS = /[<>:"|?*\x00-\x1F\x7F]/

      # A Windows reserved device name at the start of the name, either alone
      # or followed by an extension ("CON", "con.txt", "LPT1.tar.gz").
      WINDOWS_RESERVED_NAME = /\A(CON|PRN|AUX|NUL|COM[1-9]|LPT[1-9])(?=\.|\z)/i

      # Conservative byte budget for safe file names (Windows allows 255).
      MAX_SAFE_BYTES = 200

      # Encodings probed by analyze_filename_encoding, in order of preference.
      ENCODINGS_TO_TRY = ['UTF-8', 'Shift_JIS', 'EUC-JP', 'ISO-8859-1', 'Windows-1252'].freeze

      # @param filename [Object]
      # @return [Boolean] true when +filename+ is a String containing at least
      #   one hiragana, katakana, kanji or full-width character
      def self.contains_japanese?(filename)
        return false unless filename.is_a?(String)

        # Ensure UTF-8 encoding for regex matching
        normalized = filename.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
        normalized.match?(JAPANESE_CHARS)
      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
        false
      end

      # Converts a file name to UTF-8, dropping bytes that are invalid or
      # cannot be converted. Non-String input is returned unchanged.
      #
      # @param filename [Object]
      # @return [String, Object]
      def self.normalize_filename(filename)
        return filename unless filename.is_a?(String)

        filename.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
      end

      # Produces a name that is safe to use on common file systems while
      # preserving Japanese characters.
      #
      # * forbidden punctuation and control characters become "_"
      # * Windows reserved device names are prefixed with "_"
      # * names longer than MAX_SAFE_BYTES are shortened, keeping the extension
      #
      # @param filename [Object] non-String input is returned unchanged
      # @return [String, Object]
      def self.safe_filename(filename)
        normalized = normalize_filename(filename)
        return normalized unless normalized.is_a?(String)

        safe = normalized.gsub(UNSAFE_CHARS, '_').sub(WINDOWS_RESERVED_NAME, '_\1')
        return safe if safe.bytesize <= MAX_SAFE_BYTES

        extension = File.extname(safe)
        basename = File.basename(safe, extension)
        # Drop whole characters (never partial multi-byte sequences) from the
        # end of the basename until the name fits; always keep one character.
        while (basename + extension).bytesize > MAX_SAFE_BYTES && basename.length > 1
          basename = basename[0...-1]
        end
        basename + extension
      end

      # Checks a file name for common portability problems.
      #
      # @param filename [String, nil]
      # @return [Hash] `{ valid:, issues: }` for nil/empty input; otherwise also
      #   `:contains_japanese`, `:normalized_filename` and `:safe_filename`
      def self.validate_filename(filename)
        return { valid: false, issues: ['Filename is nil or empty'] } if filename.nil? || filename.empty?

        normalized = normalize_filename(filename)
        issues = []

        # Checked on the ORIGINAL string: normalization already strips invalid
        # bytes, so the normalized copy would always look valid.
        issues << 'Filename contains invalid encoding sequences' unless filename.valid_encoding?
        issues << 'Filename contains null bytes' if normalized.include?("\x00")
        issues << 'Filename contains control characters' if normalized.match?(/[\x00-\x1F\x7F]/)
        issues << 'Filename is too long (over 255 bytes)' if normalized.bytesize > 255

        # File.basename raises ArgumentError on NUL bytes, so they are removed
        # for this check only (they have already been reported above).
        leaf = File.basename(normalized.delete("\x00"))
        issues << 'Filename uses Windows reserved name' if leaf.match?(WINDOWS_RESERVED_NAME)

        {
          valid: issues.empty?,
          issues: issues,
          contains_japanese: contains_japanese?(normalized),
          normalized_filename: normalized,
          safe_filename: safe_filename(filename)
        }
      end

      # Splits out the Japanese characters of a file name by script.
      #
      # @param filename [Object]
      # @return [Hash] `{}` when there is no Japanese text; otherwise arrays of
      #   single characters under :hiragana, :katakana, :kanji, :fullwidth and
      #   the total under :japanese_count
      def self.extract_japanese_parts(filename)
        return {} unless contains_japanese?(filename)

        # Scan the UTF-8 version: the patterns are UTF-8 and would raise on a
        # string in another encoding such as Shift_JIS.
        text = normalize_filename(filename)

        {
          hiragana: text.scan(HIRAGANA_RANGE),
          katakana: text.scan(KATAKANA_RANGE),
          kanji: text.scan(KANJI_RANGE),
          fullwidth: text.scan(FULLWIDTH_RANGE),
          japanese_count: text.scan(JAPANESE_CHARS).length
        }
      end

      # Returns a file name suitable for a temporary copy of +original_filename+.
      #
      # Valid names are returned normalized. Invalid names are replaced by
      # "<prefix>[_<japanese sample>]_<YYYYMMDD_HHMMSS><extension>".
      #
      # @param original_filename [String, nil]
      # @param prefix [String]
      # @return [String]
      def self.create_safe_temp_filename(original_filename, prefix = 'doc')
        validation = validate_filename(original_filename)
        return validation[:normalized_filename] if validation[:valid]

        # Derive the extension from the sanitized name so that nil input and
        # NUL bytes cannot make File.extname raise.
        extension = File.extname(safe_filename(original_filename).to_s)
        timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
        parts = extract_japanese_parts(original_filename)

        # `parts` is empty when there is no Japanese text at all.
        sample = %i[hiragana katakana kanji].map { |script| parts.fetch(script, []).first(3).join }.join

        if sample.empty?
          "#{prefix}_#{timestamp}#{extension}"
        else
          "#{prefix}_#{sample}_#{timestamp}#{extension}"
        end
      end

      # Reports, for each encoding in ENCODINGS_TO_TRY, whether the file name
      # can be represented in it and whether it contains Japanese text.
      #
      # @param filename [String]
      # @return [Hash] `{ original_encoding:, analysis:, recommended_encoding: }`
      def self.analyze_filename_encoding(filename)
        results = ENCODINGS_TO_TRY.each_with_object({}) do |encoding, analysis|
          analysis[encoding] = probe_encoding(filename, encoding)
        end

        {
          original_encoding: filename.encoding.name,
          analysis: results,
          recommended_encoding: find_best_encoding(results)
        }
      end

      # Picks the preferred encoding from analyze_filename_encoding results:
      # UTF-8 when valid, then a valid Japanese encoding that actually contains
      # Japanese text, then any valid encoding, defaulting to 'UTF-8'.
      #
      # Internal helper. A bare `private` has no effect on `def self.` methods,
      # so this method is (and stays) publicly callable.
      #
      # @param analysis_results [Hash{String => Hash}]
      # @return [String] encoding name
      def self.find_best_encoding(analysis_results)
        usable = ->(result) { result && result[:valid] }

        return 'UTF-8' if usable.call(analysis_results['UTF-8'])

        japanese = %w[Shift_JIS EUC-JP].find do |encoding|
          result = analysis_results[encoding]
          usable.call(result) && result[:contains_japanese]
        end
        return japanese if japanese

        fallback = analysis_results.find { |_encoding, result| usable.call(result) }
        fallback ? fallback.first : 'UTF-8' # Default fallback
      end

      # Builds the analysis entry for one target encoding.
      #
      # Japanese detection always runs on a properly transcoded UTF-8 copy;
      # merely relabelling the bytes as UTF-8 would garble non-UTF-8 input.
      def self.probe_encoding(filename, encoding)
        converted = filename.encoding.name == encoding ? filename : filename.encode(encoding)
        utf8_version = converted.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')

        {
          valid: converted.valid_encoding?,
          convertible: true,
          contains_japanese: contains_japanese?(utf8_version)
        }
      rescue EncodingError
        { valid: false, convertible: false, contains_japanese: false }
      end
      private_class_method :probe_encoding
    end
  end
end
@@ -0,0 +1,3 @@
1
module UniversalDocumentProcessor
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,223 @@
1
+ require 'active_support/all'
2
+ require 'marcel'
3
+ require 'nokogiri'
4
+ require 'zip'
5
+
6
+ # Optional dependencies - only require if available
7
+ begin
8
+ require 'pdf-reader'
9
+ rescue LoadError
10
+ # PDF processing will use fallback
11
+ end
12
+
13
+ begin
14
+ require 'prawn'
15
+ rescue LoadError
16
+ # PDF generation will not be available
17
+ end
18
+
19
+ begin
20
+ require 'docx'
21
+ rescue LoadError
22
+ # Word processing will use fallback
23
+ end
24
+
25
+ begin
26
+ require 'roo'
27
+ rescue LoadError
28
+ # Excel processing will use fallback
29
+ end
30
+
31
+ begin
32
+ require 'mini_magick'
33
+ rescue LoadError
34
+ # Image processing will use fallback
35
+ end
36
+
37
+ begin
38
+ require 'yomu'
39
+ rescue LoadError
40
+ # Universal text extraction will use basic fallback
41
+ end
42
+
43
+ require_relative 'universal_document_processor/version'
44
+ require_relative 'universal_document_processor/document'
45
+ require_relative 'universal_document_processor/processors/base_processor'
46
+ require_relative 'universal_document_processor/processors/pdf_processor'
47
+ require_relative 'universal_document_processor/processors/word_processor'
48
+ require_relative 'universal_document_processor/processors/excel_processor'
49
+ require_relative 'universal_document_processor/processors/powerpoint_processor'
50
+ require_relative 'universal_document_processor/processors/image_processor'
51
+ require_relative 'universal_document_processor/processors/archive_processor'
52
+ require_relative 'universal_document_processor/processors/text_processor'
53
+ require_relative 'universal_document_processor/processors/character_validator'
54
+ require_relative 'universal_document_processor/utils/file_detector'
55
+ require_relative 'universal_document_processor/utils/japanese_filename_handler'
56
+ require_relative 'universal_document_processor/ai_agent'
57
+
58
# Top-level facade of the gem: thin module functions that delegate to
# Document, Processors::CharacterValidator, Utils::JapaneseFilenameHandler and
# AIAgent.
module UniversalDocumentProcessor
  class Error < StandardError; end
  class UnsupportedFormatError < Error; end
  class ProcessingError < Error; end
  class DependencyMissingError < Error; end

  # Main entry point for document processing
  def self.process(file_path_or_io, options = {})
    Document.new(file_path_or_io, options).process
  end

  # Extract text from any document
  def self.extract_text(file_path_or_io, options = {})
    Document.new(file_path_or_io, options).extract_text
  end

  # Get document metadata
  def self.get_metadata(file_path_or_io, options = {})
    Document.new(file_path_or_io, options).metadata
  end

  # Analyze text for invalid characters and encoding issues
  def self.analyze_text_quality(text)
    Processors::CharacterValidator.analyze_text(text)
  end

  # Validate file encoding and character issues
  def self.validate_file(file_path)
    Processors::CharacterValidator.validate_file_encoding(file_path)
  end

  # Clean text by removing invalid characters
  def self.clean_text(text, options = {})
    Processors::CharacterValidator.clean_text(text, options)
  end

  # Validate Japanese text specifically
  def self.validate_japanese_text(text)
    Processors::CharacterValidator.validate_japanese_text(text)
  end

  # Check if text contains Japanese characters
  def self.japanese_text?(text)
    Processors::CharacterValidator.is_japanese_text?(text)
  end

  # Japanese filename support methods

  # Check if a filename contains Japanese characters
  def self.japanese_filename?(filename)
    Utils::JapaneseFilenameHandler.contains_japanese?(filename)
  end

  # Validate a filename; returns the handler's result hash
  def self.validate_filename(filename)
    Utils::JapaneseFilenameHandler.validate_filename(filename)
  end

  # Build a file-system-safe variant of a filename
  def self.safe_filename(filename)
    Utils::JapaneseFilenameHandler.safe_filename(filename)
  end

  # Normalize a filename to UTF-8
  def self.normalize_filename(filename)
    Utils::JapaneseFilenameHandler.normalize_filename(filename)
  end

  # AI-powered document analysis methods.
  # Each one processes the document first, then hands the result to a fresh
  # AIAgent built from the same options.

  # Analyze a document, optionally guided by options[:query]
  def self.ai_analyze(file_path, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).analyze_document(document_result, options[:query])
  end

  # Summarize a document. Unlike its siblings, this method takes `options`
  # as a keyword argument; the signature is kept for compatibility.
  def self.ai_summarize(file_path, length: :medium, options: {})
    document_result = process(file_path, options)
    AIAgent.new(options).summarize_document(document_result, length: length)
  end

  # Extract key information, optionally limited to the given categories
  def self.ai_extract_info(file_path, categories = nil, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).extract_key_information(document_result, categories)
  end

  # Translate a document into target_language
  def self.ai_translate(file_path, target_language, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).translate_document(document_result, target_language)
  end

  # Classify a document
  def self.ai_classify(file_path, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).classify_document(document_result)
  end

  # Generate insights for a document
  def self.ai_insights(file_path, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).generate_insights(document_result)
  end

  # Extract action items from a document
  def self.ai_action_items(file_path, options = {})
    document_result = process(file_path, options)
    AIAgent.new(options).extract_action_items(document_result)
  end

  # Compare several documents; every path is processed with the same options
  def self.ai_compare(file_paths, comparison_type = :content, options = {})
    document_results = file_paths.map { |path| process(path, options) }
    AIAgent.new(options).compare_documents(document_results, comparison_type)
  end

  # Build a reusable AIAgent
  def self.create_ai_agent(options = {})
    AIAgent.new(options)
  end

  # Convert document to different format
  def self.convert(file_path_or_io, target_format, options = {})
    Document.new(file_path_or_io, options).convert_to(target_format)
  end

  # Batch process multiple documents.
  #
  # A failure on one file does not abort the batch: its slot in the returned
  # array holds `{ file:, error:, success: false }` instead of a result.
  def self.batch_process(file_paths, options = {})
    file_paths.map do |file_path|
      begin
        process(file_path, options)
      rescue StandardError => e
        { file: file_path, error: e.message, success: false }
      end
    end
  end

  # Check if a dependency is available.
  #
  # @param dependency [Symbol, String] one of :pdf_reader, :docx, :roo,
  #   :mini_magick, :yomu, :prawn
  # @return [Boolean] strictly true/false; false for unknown names and for
  #   values that cannot be converted to a Symbol
  def self.dependency_available?(dependency)
    return false unless dependency.respond_to?(:to_sym)

    # `defined?` yields a description String or nil, never a boolean.
    probe =
      case dependency.to_sym
      when :pdf_reader then defined?(PDF::Reader)
      when :docx then defined?(Docx)
      when :roo then defined?(Roo)
      when :mini_magick then defined?(MiniMagick)
      when :yomu then defined?(Yomu)
      when :prawn then defined?(Prawn)
      end

    !probe.nil?
  end

  # Get list of available features based on installed dependencies
  def self.available_features
    features = %i[
      text_processing html_processing xml_processing
      csv_processing json_processing archive_processing
    ]

    optional_features = {
      pdf_reader: :pdf_processing,
      docx: :word_processing,
      roo: :excel_processing,
      mini_magick: :image_processing,
      yomu: :universal_text_extraction,
      prawn: :pdf_generation
    }
    optional_features.each do |dependency, feature|
      features << feature if dependency_available?(dependency)
    end

    features
  end
end