universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,225 @@
1
+ module UniversalDocumentProcessor
2
+ class Document
3
+ attr_reader :file_path, :content_type, :file_size, :options, :filename_validation
4
+
5
# Builds a document from either a path string or an IO-like object.
#
# A String argument is treated as a path and passed through
# normalize_file_path; any other argument is handed to save_temp_file.
#
# @param file_path_or_io [String, Object] path string or object to persist
# @param options [Hash] stored as-is and exposed through #options
def initialize(file_path_or_io, options = {})
  @file_path =
    if file_path_or_io.is_a?(String)
      normalize_file_path(file_path_or_io)
    else
      save_temp_file(file_path_or_io)
    end
  @options = options
  @content_type = detect_content_type
  @file_size = File.size(@file_path)
  @filename_validation = validate_filename_encoding
end
12
+
13
# Runs every extractor and gathers the results into a single Hash.
#
# @return [Hash] with keys :file_path, :content_type, :file_size,
#   :text_content, :metadata, :images, :tables, :filename_info, :processed_at
def process
  {
    file_path: @file_path,
    content_type: @content_type,
    file_size: @file_size,
    text_content: extract_text,
    metadata: metadata,
    images: extract_images,
    tables: extract_tables,
    filename_info: filename_info,
    # Time.current only exists when ActiveSupport is loaded; fall back to
    # Time.now so the method also works in plain Ruby.
    processed_at: Time.respond_to?(:current) ? Time.current : Time.now
  }
end
26
+
27
# Extracts text through the format-specific processor.
#
# Any StandardError raised while obtaining or using the processor is
# swallowed and fallback_text_extraction is used instead.
def extract_text
  begin
    processor.extract_text
  rescue StandardError
    fallback_text_extraction
  end
end
32
+
33
# Returns the processor's metadata, or basic_metadata when the processor
# raises a StandardError.
def metadata
  begin
    processor.extract_metadata
  rescue StandardError
    basic_metadata
  end
end
38
+
39
# Returns the images reported by the processor.
# Yields an empty Array when the processor has no extract_images method or
# when a StandardError is raised.
def extract_images
  handler = processor
  return [] unless handler.respond_to?(:extract_images)

  handler.extract_images
rescue StandardError
  []
end

# Returns the tables reported by the processor.
# Yields an empty Array when the processor has no extract_tables method or
# when a StandardError is raised.
def extract_tables
  handler = processor
  return [] unless handler.respond_to?(:extract_tables)

  handler.extract_tables
rescue StandardError
  []
end
50
+
51
# Converts the document to the requested format.
#
# @param target_format [#to_sym] one of :pdf, :text, :txt, :html, :json
# @return the result of the matching conversion (a JSON String for :json)
# @raise [UnsupportedFormatError] for any other format
def convert_to(target_format)
  requested = target_format.to_sym

  return convert_to_pdf if requested == :pdf
  return extract_text if %i[text txt].include?(requested)
  return convert_to_html if requested == :html
  return process.to_json if requested == :json

  raise UnsupportedFormatError, "Conversion to #{target_format} not supported"
end
65
+
66
# Lists the file extensions (without the dot) this class treats as supported.
#
# @return [Array<String>] a new Array on every call
def supported_formats
  documents = %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv]
  images = %w[jpg jpeg png gif bmp tiff]
  archives = %w[zip rar 7z]
  documents + images + archives
end

# Whether the file's extension, compared case-insensitively, appears in
# supported_formats.
def supported?
  extension = file_extension.downcase
  supported_formats.include?(extension)
end
73
+
74
# Asks Utils::JapaneseFilenameHandler whether the file's basename contains
# Japanese characters.
def japanese_filename?
  basename = File.basename(@file_path)
  Utils::JapaneseFilenameHandler.contains_japanese?(basename)
end

# Summarizes what is known about the file name.
#
# @return [Hash] with keys :original_filename, :contains_japanese,
#   :validation (the value computed at construction) and :japanese_parts
def filename_info
  basename = File.basename(@file_path)
  japanese_parts = Utils::JapaneseFilenameHandler.extract_japanese_parts(basename)

  {
    original_filename: basename,
    contains_japanese: japanese_filename?,
    validation: @filename_validation,
    japanese_parts: japanese_parts
  }
end
86
+
87
+ # AI-powered analysis methods
88
# Each helper below builds an agent through create_ai_agent(options) and
# forwards the result of #process to one agent method.

# Forwards to the agent's analyze_document with the optional query.
def ai_analyze(query = nil, options = {})
  create_ai_agent(options).analyze_document(process, query)
end

# Forwards to the agent's summarize_document with the requested length.
def ai_summarize(length: :medium, options: {})
  create_ai_agent(options).summarize_document(process, length: length)
end

# Forwards to the agent's extract_key_information with the given categories.
def ai_extract_info(categories = nil, options = {})
  create_ai_agent(options).extract_key_information(process, categories)
end

# Forwards to the agent's translate_document with the target language.
def ai_translate(target_language, options = {})
  create_ai_agent(options).translate_document(process, target_language)
end

# Forwards to the agent's classify_document.
def ai_classify(options = {})
  create_ai_agent(options).classify_document(process)
end

# Forwards to the agent's generate_insights.
def ai_insights(options = {})
  create_ai_agent(options).generate_insights(process)
end

# Forwards to the agent's extract_action_items.
def ai_action_items(options = {})
  create_ai_agent(options).extract_action_items(process)
end

# Forwards to the agent's chat with the message and the processed document.
def ai_chat(message, options = {})
  create_ai_agent(options).chat(message, process)
end
127
+
128
# Builds an AIAgent configured from the document's options and the options
# given for this call.
#
# Per-call options take precedence over the options supplied to the
# constructor; the previous merge order let the constructor options silently
# override any per-call setting with the same key.
#
# @param options [Hash] per-call overrides
# @return [AIAgent]
def create_ai_agent(options = {})
  AIAgent.new((@options || {}).merge(options))
end
131
+
132
+ private
133
+
134
# Returns the processor for this document, creating it through
# create_processor on first use and caching it in @processor.
def processor
  @processor = create_processor unless @processor
  @processor
end
137
+
138
# Picks the processor class by pattern-matching @content_type.
#
# NOTE(review): assumes @content_type is a MIME-type-like String as returned
# by Utils::FileDetector.detect - confirm against that helper.
#
# Spreadsheet and presentation patterns are tested before the word/document
# patterns: Office Open XML MIME types all contain "officedocument" (and
# OpenDocument types contain "opendocument"), so testing /document/ first
# routed xlsx and pptx files to WordProcessor.
def create_processor
  case @content_type
  when /pdf/
    Processors::PdfProcessor.new(@file_path, @options)
  when /excel/, /spreadsheet/
    Processors::ExcelProcessor.new(@file_path, @options)
  when /powerpoint/, /presentation/
    Processors::PowerpointProcessor.new(@file_path, @options)
  when /word/, /document/
    Processors::WordProcessor.new(@file_path, @options)
  when /image/
    Processors::ImageProcessor.new(@file_path, @options)
  when /zip/, /archive/, /compressed/
    Processors::ArchiveProcessor.new(@file_path, @options)
  when /text/, /plain/
    Processors::TextProcessor.new(@file_path, @options)
  else
    # Fallback to base processor with universal extraction
    Processors::BaseProcessor.new(@file_path, @options)
  end
end
159
+
160
# Delegates content-type detection for @file_path to Utils::FileDetector.
def detect_content_type
  detector = Utils::FileDetector
  detector.detect(@file_path)
end

# Returns the extension of @file_path with every dot removed; the original
# letter case is kept.
def file_extension
  File.extname(@file_path).delete('.')
end
167
+
168
# Copies the contents of an IO-like object into a temporary file and returns
# that file's path.
#
# The extension is taken from io.original_filename when the object provides
# one, otherwise from io.path when available, otherwise it is empty. The
# previous fallback called file_extension, which reads @file_path before the
# constructor has assigned it and therefore raised.
#
# @param io [#read] source object; may also respond to original_filename/path
# @return [String] path of the temporary file
def save_temp_file(io)
  # Try to get original filename from IO if available
  original_filename = io.respond_to?(:original_filename) ? io.original_filename : nil
  source_name = original_filename || (io.respond_to?(:path) ? io.path : nil)
  extension = source_name ? File.extname(source_name.to_s) : ''

  # Create safe temporary filename
  basename =
    if original_filename && Utils::JapaneseFilenameHandler.contains_japanese?(original_filename)
      safe_name = Utils::JapaneseFilenameHandler.create_safe_temp_filename(original_filename, 'temp')
      File.basename(safe_name, extension)
    else
      'document'
    end

  temp_file = Tempfile.new([basename, extension])
  begin
    temp_file.binmode
    temp_file.write(io.read)
  ensure
    temp_file.close
  end

  # Hold a reference to the Tempfile: once the object is garbage-collected
  # its finalizer unlinks the file, which would invalidate the returned path.
  @temp_file = temp_file
  temp_file.path
end
186
+
187
# Last-resort text extraction through Yomu.
# Returns an "Unable to extract text" message instead of raising when a
# StandardError occurs.
def fallback_text_extraction
  Yomu.new(@file_path).text
rescue StandardError => error
  "Unable to extract text: #{error.message}"
end
194
+
195
# Metadata derived from the file system and from values cached at
# construction; used by #metadata when the processor fails.
#
# @return [Hash] with keys :filename, :file_size, :content_type, :created_at,
#   :modified_at, :japanese_filename, :filename_encoding
def basic_metadata
  path = @file_path

  {
    filename: File.basename(path),
    file_size: @file_size,
    content_type: @content_type,
    created_at: File.ctime(path),
    modified_at: File.mtime(path),
    japanese_filename: japanese_filename?,
    filename_encoding: @filename_validation
  }
end
206
+
207
# Passes the path through Utils::JapaneseFilenameHandler.normalize_filename.
def normalize_file_path(file_path)
  handler = Utils::JapaneseFilenameHandler
  handler.normalize_filename(file_path)
end

# Passes the basename of @file_path through
# Utils::JapaneseFilenameHandler.validate_filename.
def validate_filename_encoding
  basename = File.basename(@file_path)
  Utils::JapaneseFilenameHandler.validate_filename(basename)
end
214
+
215
# Placeholder: PDF conversion is not available yet.
# NOTE: NotImplementedError descends from ScriptError, so a bare `rescue`
# (which only catches StandardError) will not catch it.
#
# @raise [NotImplementedError] always
def convert_to_pdf
  raise NotImplementedError, 'PDF conversion not yet implemented'
end

# Placeholder: HTML conversion is not available yet.
#
# @raise [NotImplementedError] always
def convert_to_html
  raise NotImplementedError, 'HTML conversion not yet implemented'
end
224
+ end
225
+ end
@@ -0,0 +1,290 @@
1
+ module UniversalDocumentProcessor
2
+ module Processors
3
+ class ArchiveProcessor < BaseProcessor
4
# Produces a textual overview of the archive: one line per entry (path and
# size), followed by a preview of each text file found by extract_text_files.
#
# Each preview is limited to the first 1000 characters and is followed by
# "..." when the file is longer. The previous slice `[0..1000]` was inclusive
# and therefore emitted 1001 characters.
#
# @return [String]
def extract_text
  with_error_handling do
    lines = ["=== Archive Contents ==="]
    list_files.each do |file_info|
      lines << "#{file_info[:path]} (#{file_info[:size]} bytes)"
    end

    # Try to extract text from text files within the archive
    text_files = extract_text_files
    unless text_files.empty?
      lines << "\n=== Text File Contents ==="
      text_files.each do |file_path, content|
        lines << "\n--- #{file_path} ---"
        lines << content[0, 1000] # Limit to first 1000 chars
        lines << "..." if content.length > 1000
      end
    end

    lines.join("\n")
  end
end
27
+
28
# Extends the superclass metadata with archive-level statistics computed from
# the entry list.
#
# @return [Hash] superclass metadata plus :archive_type, :total_files,
#   :total_uncompressed_size, :file_types, :directory_structure,
#   :has_executable_files, :largest_file and :compression_ratio
def extract_metadata
  with_error_handling do
    entries = list_files
    base = super

    base.merge(
      archive_type: detect_archive_type,
      total_files: entries.length,
      total_uncompressed_size: entries.sum { |entry| entry[:size] },
      file_types: analyze_file_types(entries),
      directory_structure: build_directory_structure(entries),
      has_executable_files: has_executable_files?(entries),
      largest_file: find_largest_file(entries),
      compression_ratio: calculate_compression_ratio
    )
  end
end
44
+
45
# Lists the archive's entries using the lister that matches the detected
# archive type; an unrecognized type yields an empty Array.
def list_files
  with_error_handling do
    archive_type = detect_archive_type

    if archive_type == :zip
      list_zip_files
    elsif archive_type == :rar
      list_rar_files
    elsif archive_type == :seven_zip
      list_7z_files
    else
      []
    end
  end
end
59
+
60
# Extracts a single entry using the extractor for the detected archive type.
#
# @param file_path [String] entry path inside the archive
# @param output_path [String, nil] forwarded unchanged to the extractor
# @raise [UnsupportedFormatError] inside the block when the type is not
#   :zip, :rar or :seven_zip (with_error_handling may re-wrap it)
def extract_file(file_path, output_path = nil)
  with_error_handling do
    extractor = {
      zip: :extract_zip_file,
      rar: :extract_rar_file,
      seven_zip: :extract_7z_file
    }[detect_archive_type]
    raise UnsupportedFormatError, "Unsupported archive format" unless extractor

    send(extractor, file_path, output_path)
  end
end
74
+
75
# Extracts every entry into output_directory using the extractor for the
# detected archive type.
#
# @param output_directory [String] destination directory
# @raise [UnsupportedFormatError] inside the block when the type is not
#   :zip, :rar or :seven_zip (with_error_handling may re-wrap it)
def extract_all(output_directory)
  with_error_handling do
    extractor = {
      zip: :extract_all_zip,
      rar: :extract_all_rar,
      seven_zip: :extract_all_7z
    }[detect_archive_type]
    raise UnsupportedFormatError, "Unsupported archive format" unless extractor

    send(extractor, output_directory)
  end
end
89
+
90
# Appends the archive-specific operations to the superclass list.
# NOTE(review): :analyze_security is advertised here, but no method of that
# name is visible in this class or in BaseProcessor - confirm it exists.
def supported_operations
  archive_operations = %i[list_files extract_file extract_all analyze_security]
  super + archive_operations
end
93
+
94
+ private
95
+
96
# Determines the archive format, first from the file extension and, when the
# extension is not recognized, from the first four bytes of the file.
#
# Signatures are compared as binary strings. The file is read in binary
# mode, and a UTF-8 literal containing non-ASCII bytes (such as the 7z magic
# "7z\xBC\xAF") never compares equal to a binary string, so the 7z signature
# could not match before.
#
# @return [Symbol] :zip, :rar, :seven_zip or :unknown
def detect_archive_type
  case File.extname(@file_path).downcase
  when '.zip' then return :zip
  when '.rar' then return :rar
  when '.7z' then return :seven_zip
  end

  # Try to detect by file signature; read returns nil for an empty file,
  # which falls through to :unknown.
  signature = File.open(@file_path, 'rb') { |file| file.read(4) }

  case signature
  when "PK\x03\x04".b, "PK\x05\x06".b, "PK\x07\x08".b
    :zip
  when 'Rar!'.b
    :rar
  when "7z\xBC\xAF".b
    :seven_zip
  else
    :unknown
  end
end
122
+
123
# Collects one Hash per ZIP entry.
#
# @return [Array<Hash>] each with :path, :size, :compressed_size,
#   :is_directory, :modified_time and :crc
def list_zip_files
  listing = []

  Zip::File.open(@file_path) do |archive|
    archive.each do |entry|
      entry_info = {
        path: entry.name,
        size: entry.size,
        compressed_size: entry.compressed_size,
        is_directory: entry.directory?,
        modified_time: entry.time,
        crc: entry.crc
      }
      listing << entry_info
    end
  end

  listing
end
139
+
140
# Placeholder: listing RAR entries would need an external library or system
# command, so an empty list is always returned.
def list_rar_files
  []
end

# Placeholder: listing 7z entries would need an external library or system
# command, so an empty list is always returned.
def list_7z_files
  []
end
151
+
152
# Extracts one entry from the ZIP archive.
#
# @param file_path [String] entry name inside the archive
# @param output_path [String, nil] when given, the entry is extracted there
#   and the path is returned; otherwise the entry's content is returned
# @raise [ProcessingError] when the archive has no entry with that name
def extract_zip_file(file_path, output_path)
  Zip::File.open(@file_path) do |archive|
    entry = archive.find_entry(file_path)
    raise ProcessingError, "File not found in archive: #{file_path}" unless entry

    if output_path
      entry.extract(output_path)
      output_path
    else
      entry.get_input_stream.read
    end
  end
end
167
+
168
# Placeholder: RAR extraction would require an external library.
#
# @raise [NotImplementedError] always
def extract_rar_file(file_path, output_path)
  raise NotImplementedError, 'RAR extraction not implemented'
end

# Placeholder: 7z extraction would require an external library.
#
# @raise [NotImplementedError] always
def extract_7z_file(file_path, output_path)
  raise NotImplementedError, '7z extraction not implemented'
end
177
+
178
# Extracts every entry of the ZIP archive below output_directory. Existing
# files are left untouched.
#
# Entry names come from the archive and are untrusted: a name such as
# "../../etc/cron.d/x" would resolve outside output_directory ("zip slip").
# Each destination is therefore resolved to an absolute path and rejected
# unless it lies inside the output directory.
#
# @param output_directory [String] destination directory (created if missing)
# @return [String] output_directory
# @raise [ProcessingError] when an entry would be written outside the
#   output directory
def extract_all_zip(output_directory)
  FileUtils.mkdir_p(output_directory)
  root = File.expand_path(output_directory)
  root_prefix = root.end_with?(File::SEPARATOR) ? root : root + File::SEPARATOR

  Zip::File.open(@file_path) do |zip|
    zip.each do |entry|
      output_path = File.expand_path(entry.name, root)
      unless output_path.start_with?(root_prefix)
        raise ProcessingError, "Unsafe path in archive entry: #{entry.name}"
      end

      FileUtils.mkdir_p(File.dirname(output_path))
      entry.extract(output_path) unless File.exist?(output_path)
    end
  end

  output_directory
end
189
+
190
# Placeholder: bulk RAR extraction is not available.
#
# @raise [NotImplementedError] always
def extract_all_rar(output_directory)
  raise NotImplementedError, 'RAR extraction not implemented'
end

# Placeholder: bulk 7z extraction is not available.
#
# @raise [NotImplementedError] always
def extract_all_7z(output_directory)
  raise NotImplementedError, '7z extraction not implemented'
end
197
+
198
# Reads every entry that text_file? accepts and returns its content keyed by
# entry name. Only ZIP archives are inspected; other types yield an empty
# Hash.
#
# Content is tagged as UTF-8 and invalid byte sequences are replaced
# (String#scrub). Merely forcing the encoding left invalid strings in the
# result, and those raise later when the text is serialized, e.g. by to_json.
#
# @return [Hash{String => String}] entry name => valid UTF-8 content
def extract_text_files
  text_files = {}
  return text_files unless detect_archive_type == :zip

  Zip::File.open(@file_path) do |zip|
    zip.each do |entry|
      next if entry.directory?
      next unless text_file?(entry.name)

      begin
        raw = entry.get_input_stream.read
        text_files[entry.name] = raw.to_s.dup.force_encoding('UTF-8').scrub
      rescue StandardError
        # Best effort: skip entries that can't be read as text
      end
    end
  end

  text_files
end
221
+
222
# Heuristic: a name counts as text when its lower-cased extension is in the
# known list, or when its lower-cased basename contains "readme", "license"
# or "changelog".
def text_file?(filename)
  known_extensions = %w[.txt .md .readme .log .csv .json .xml .html .css .js .rb .py .java .c .cpp .h]
  return true if known_extensions.include?(File.extname(filename).downcase)

  File.basename(filename).downcase.match?(/readme|license|changelog/)
end
227
+
228
# Counts non-directory entries per lower-cased extension; entries without an
# extension are counted under 'no_extension'.
#
# @param files_list [Array<Hash>] entries with :path and :is_directory
# @return [Hash{String => Integer}] a Hash whose default value is 0
def analyze_file_types(files_list)
  files_list.each_with_object(Hash.new(0)) do |file_info, counts|
    next if file_info[:is_directory]

    extension = File.extname(file_info[:path]).downcase
    bucket = extension.empty? ? 'no_extension' : extension
    counts[bucket] += 1
  end
end
238
+
239
# Builds a nested Hash that mirrors the archive's directory tree.
#
# Every path segment becomes a String key whose value is a Hash; for
# non-directory entries the innermost Hash additionally stores the entry
# under the Symbol key :_file_info.
#
# @param files_list [Array<Hash>] entries with :path and :is_directory
# @return [Hash]
def build_directory_structure(files_list)
  files_list.each_with_object({}) do |file_info, tree|
    segments = file_info[:path].split('/')
    # A path that yields no segments adds nothing to the tree.
    next if segments.empty?

    leaf = segments.reduce(tree) { |node, segment| node[segment] ||= {} }
    leaf[:_file_info] = file_info unless file_info[:is_directory]
  end
end
256
+
257
# Whether any entry's lower-cased extension is in the list of extensions
# treated as executable or installer formats.
#
# @param files_list [Array<Hash>] entries with :path
# @return [Boolean]
def has_executable_files?(files_list)
  flagged_extensions = %w[.exe .bat .sh .cmd .com .scr .msi .deb .rpm .dmg .app]

  files_list.any? do |file_info|
    flagged_extensions.include?(File.extname(file_info[:path]).downcase)
  end
end
264
+
265
# Returns the non-directory entry with the greatest :size, or nil when the
# list contains no files.
def find_largest_file(files_list)
  regular_files = files_list.select { |file_info| !file_info[:is_directory] }
  regular_files.max_by { |file_info| file_info[:size] }
end
268
+
269
# Percentage of space saved by compression across all non-directory entries,
# rounded to two decimals.
#
# Returns 0 for non-ZIP archives, for archives whose files total zero bytes,
# and whenever a StandardError is raised while reading the archive.
def calculate_compression_ratio
  return 0 unless detect_archive_type == :zip

  original_bytes = 0
  packed_bytes = 0

  Zip::File.open(@file_path) do |archive|
    archive.each do |entry|
      next if entry.directory?

      original_bytes += entry.size
      packed_bytes += entry.compressed_size
    end
  end

  return 0 if original_bytes == 0

  saved_fraction = (original_bytes - packed_bytes).to_f / original_bytes
  (saved_fraction * 100).round(2)
rescue StandardError
  0
end
288
+ end
289
+ end
290
+ end
@@ -0,0 +1,58 @@
1
+ module UniversalDocumentProcessor
2
+ module Processors
3
+ class BaseProcessor
4
+ attr_reader :file_path, :options
5
+
6
# Stores the path of the file to process and the processing options.
#
# @param file_path [String] path of the file
# @param options [Hash] stored as-is and exposed through #options
def initialize(file_path, options = {})
  @file_path, @options = file_path, options
end
10
+
11
# Generic text extraction through Yomu.
#
# @return the value of Yomu#text for the file
# @raise [ProcessingError] wrapping the message of any StandardError
def extract_text
  begin
    Yomu.new(@file_path).text
  rescue StandardError => error
    raise ProcessingError, "Failed to extract text: #{error.message}"
  end
end
17
+
18
# Basic file metadata gathered from the file system, with the content type
# taken from Marcel.
#
# @return [Hash] with keys :filename, :file_size, :content_type,
#   :created_at and :modified_at
# @raise [ProcessingError] wrapping the message of any StandardError
def extract_metadata
  path = @file_path

  {
    filename: File.basename(path),
    file_size: File.size(path),
    content_type: Marcel::MimeType.for(Pathname.new(path)),
    created_at: File.ctime(path),
    modified_at: File.mtime(path)
  }
rescue StandardError => error
  raise ProcessingError, "Failed to extract metadata: #{error.message}"
end
30
+
31
# The base processor finds no images; always returns an empty Array.
def extract_images
  []
end

# The base processor finds no tables; always returns an empty Array.
def extract_tables
  []
end

# Operations the base processor reports as supported.
#
# @return [Array<Symbol>]
def supported_operations
  %i[extract_text extract_metadata]
end
42
+
43
+ protected
44
+
45
# Ensures the file exists and is not empty before processing.
#
# @return [nil] when the file is usable
# @raise [ProcessingError] when the file is missing or has zero length
def validate_file
  unless File.exist?(@file_path)
    raise ProcessingError, "File not found: #{@file_path}"
  end
  return unless File.zero?(@file_path)

  raise ProcessingError, "File is empty: #{@file_path}"
end
49
+
50
# Validates the file, runs the given block and normalizes failures to
# ProcessingError.
#
# A ProcessingError raised by validate_file or by the block is re-raised
# unchanged. Processor methods call this helper from inside one another
# (for example extract_text -> list_files), and re-wrapping produced
# messages such as "Processing failed: Processing failed: ...".
#
# NOTE: only StandardError is handled, so the NotImplementedError raised by
# the RAR/7z placeholders (a ScriptError) passes through untouched.
#
# @yield the processing work to guard
# @return the block's return value
# @raise [ProcessingError] for any StandardError raised during the work
def with_error_handling
  validate_file
  yield
rescue ProcessingError
  raise
rescue StandardError => e
  raise ProcessingError, "Processing failed: #{e.message}"
end
56
+ end
57
+ end
58
+ end