universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
module UniversalDocumentProcessor
  module Processors
    # Processor for PDF documents, backed by the pdf-reader gem.
    # Adds text, metadata, embedded-image and table extraction on top of
    # the generic BaseProcessor contract.
    class PdfProcessor < BaseProcessor
      # Returns the concatenated text of every page, separated by newlines,
      # or a placeholder message when the document contains no text.
      def extract_text
        with_error_handling do
          document = PDF::Reader.new(@file_path)
          full_text = document.pages.map(&:text).join("\n")
          full_text.strip.empty? ? "No text content found in PDF" : full_text
        end
      end

      # Merges PDF-specific details (info dictionary fields, page count,
      # version, encryption flag, form fields, bookmarks) into the base
      # metadata provided by BaseProcessor.
      def extract_metadata
        with_error_handling do
          document = PDF::Reader.new(@file_path)
          doc_info = document.info || {}

          pdf_details = {
            title: doc_info[:Title],
            author: doc_info[:Author],
            subject: doc_info[:Subject],
            creator: doc_info[:Creator],
            producer: doc_info[:Producer],
            creation_date: doc_info[:CreationDate],
            modification_date: doc_info[:ModDate],
            page_count: document.page_count,
            pdf_version: document.pdf_version,
            encrypted: document.encrypted?,
            form_fields: extract_form_fields(document),
            bookmarks: extract_bookmarks(document)
          }

          super.merge(pdf_details)
        end
      end

      # Lists image XObjects embedded in the document: one entry per image
      # with its 1-based page number, name and basic raster properties.
      def extract_images
        with_error_handling do
          document = PDF::Reader.new(@file_path)
          found_images = []

          document.pages.each_with_index do |page, page_index|
            page.xobjects.each do |xobject_name, stream|
              next unless stream.hash[:Subtype] == :Image

              found_images << {
                page: page_index + 1,
                name: xobject_name,
                width: stream.hash[:Width],
                height: stream.hash[:Height],
                color_space: stream.hash[:ColorSpace],
                bits_per_component: stream.hash[:BitsPerComponent]
              }
            end
          end

          found_images
        end
      end

      # Heuristic table detection on page text: a line that contains more
      # than one tab, or more than two runs of 3+ spaces, is treated as a
      # table row. Returns one entry per page with candidate rows.
      def extract_tables
        with_error_handling do
          document = PDF::Reader.new(@file_path)
          detected_tables = []

          document.pages.each_with_index do |page, page_index|
            candidate_rows = page.text.split("\n").select do |line|
              line.count("\t") > 1 || line.scan(/\s{3,}/).length > 2
            end
            next if candidate_rows.empty?

            detected_tables << {
              page: page_index + 1,
              rows: candidate_rows.length,
              content: candidate_rows
            }
          end

          detected_tables
        end
      end

      # Operations available on PDFs beyond the base set.
      def supported_operations
        super + [:extract_images, :extract_tables, :extract_form_fields, :extract_bookmarks]
      end

      private

      # Placeholder: AcroForm field extraction is not implemented yet.
      # Always returns an empty list.
      def extract_form_fields(reader)
        []
      rescue
        []
      end

      # Placeholder: bookmark/outline extraction is not implemented yet.
      # Always returns an empty list.
      def extract_bookmarks(reader)
        []
      rescue
        []
      end
    end
  end
end
@@ -0,0 +1,214 @@
1
module UniversalDocumentProcessor
  module Processors
    # Processor for PowerPoint presentations. Modern .pptx archives are
    # parsed directly with rubyzip + Nokogiri; legacy .ppt files fall back
    # to Yomu (Apache Tika).
    class PowerpointProcessor < BaseProcessor
      # DrawingML main namespace used by slide and notes XML parts.
      DRAWING_NS = { 'a' => 'http://schemas.openxmlformats.org/drawingml/2006/main' }.freeze
      # DrawingML chart namespace (only declared in slides that embed charts).
      CHART_NS = { 'c' => 'http://schemas.openxmlformats.org/drawingml/2006/chart' }.freeze

      # Returns the presentation text: slide-by-slide for .pptx, Yomu
      # fallback for legacy .ppt.
      def extract_text
        with_error_handling do
          if @file_path.end_with?('.pptx')
            extract_pptx_text
          else
            # Fallback for .ppt files using Yomu
            fallback_text_extraction
          end
        end
      end

      # Returns base metadata, enriched with slide count, notes flag and
      # core document properties for .pptx files.
      # FIX: the merge with the base metadata happens here now. The private
      # helper previously called `super`, but no superclass defines
      # `extract_pptx_metadata`, so .pptx metadata extraction raised
      # NoMethodError (and again inside its own rescue).
      def extract_metadata
        with_error_handling do
          base = super
          if @file_path.end_with?('.pptx')
            base.merge(extract_pptx_metadata)
          else
            base
          end
        end
      end

      # Returns per-slide info: 1-based slide number, flattened text, and
      # boolean flags for embedded images / charts / tables.
      def extract_slides
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          slides = []

          Zip::File.open(@file_path) do |zip|
            each_slide_xml(zip) do |slide_xml, index|
              slide_text = slide_xml.xpath('//a:t', DRAWING_NS).map(&:text).join(' ')

              # FIX: namespace bindings are passed explicitly. The original
              # relied on declarations auto-registered from the document
              # root; an undeclared prefix (typically `c:` when a slide has
              # no chart) raised an XPath error, and the blanket rescue
              # silently collapsed the WHOLE slide list to [].
              slides << {
                slide_number: index + 1,
                text: slide_text,
                has_images: slide_xml.xpath('//a:blip', DRAWING_NS).any?,
                has_charts: slide_xml.xpath('//c:chart', CHART_NS).any?,
                has_tables: slide_xml.xpath('//a:tbl', DRAWING_NS).any?
              }
            end
          end

          slides
        rescue
          # If ZIP parsing fails, return empty array
          []
        end
      end

      # Lists embedded image references (r:embed relationship ids) per
      # slide. Slides are now enumerated in sorted order so slide_number is
      # correct (the original used the archive's entry order).
      def extract_images
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          images = []

          Zip::File.open(@file_path) do |zip|
            each_slide_xml(zip) do |slide_xml, slide_index|
              slide_xml.xpath('//a:blip', DRAWING_NS).each_with_index do |blip, img_index|
                embed_id = blip['r:embed']
                next unless embed_id

                images << {
                  slide_number: slide_index + 1,
                  image_index: img_index + 1,
                  embed_id: embed_id,
                  type: 'embedded'
                }
              end
            end
          end

          images
        rescue
          []
        end
      end

      # Returns speaker notes keyed by 1-based slide number, skipping
      # slides whose notes are empty.
      def extract_notes
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          notes = []

          Zip::File.open(@file_path) do |zip|
            notes_entries = zip.entries.select { |entry| entry.name.match?(/ppt\/notesSlides\/notesSlide\d+\.xml/) }

            notes_entries.sort_by { |entry| entry.name[/notesSlide(\d+)/, 1].to_i }.each_with_index do |entry, index|
              notes_xml = Nokogiri::XML(zip.read(entry.name))
              notes_text = notes_xml.xpath('//a:t', DRAWING_NS).map(&:text).join(' ')
              next if notes_text.strip.empty?

              notes << {
                slide_number: index + 1,
                notes: notes_text
              }
            end
          end

          notes
        rescue
          []
        end
      end

      # Operations available on presentations beyond the base set.
      def supported_operations
        super + [:extract_slides, :extract_images, :extract_notes]
      end

      private

      # Yields each slide's parsed XML document and its zero-based index,
      # in ascending slide order. Shared by the slide/image/text walkers.
      def each_slide_xml(zip)
        slide_entries = zip.entries.select { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }
        slide_entries.sort_by { |entry| entry.name[/slide(\d+)/, 1].to_i }.each_with_index do |entry, index|
          yield Nokogiri::XML(zip.read(entry.name)), index
        end
      end

      # Builds "=== Slide N ===" delimited text for a .pptx archive; falls
      # back to Yomu when the archive cannot be read.
      def extract_pptx_text
        text_content = []

        Zip::File.open(@file_path) do |zip|
          each_slide_xml(zip) do |slide_xml, index|
            text_content << "=== Slide #{index + 1} ==="

            slide_text = slide_xml.xpath('//a:t', DRAWING_NS).map(&:text).reject(&:empty?).join(' ')
            text_content << slide_text unless slide_text.strip.empty?
            text_content << "" # Add blank line between slides
          end
        end

        text_content.join("\n")
      rescue
        # Fallback to Yomu if ZIP parsing fails
        fallback_text_extraction
      end

      # Collects .pptx-specific metadata (slide count, notes flag, core
      # document properties). Returns {} on failure so the caller still
      # gets the base metadata unchanged.
      def extract_pptx_metadata
        details = { presentation_type: 'PowerPoint' }

        Zip::File.open(@file_path) do |zip|
          details[:slide_count] = zip.entries.count { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }
          details[:has_notes] = zip.entries.any? { |entry| entry.name.match?(/ppt\/notesSlides\/notesSlide\d+\.xml/) }

          if zip.find_entry('docProps/core.xml')
            core_xml = Nokogiri::XML(zip.read('docProps/core.xml'))

            # NOTE(review): these queries rely on the dc/dcterms namespace
            # declarations present on core.xml's root element (standard for
            # OOXML core properties) — a failure falls into the rescue.
            details.merge!(
              title: core_xml.xpath('//dc:title').text,
              author: core_xml.xpath('//dc:creator').text,
              subject: core_xml.xpath('//dc:subject').text,
              description: core_xml.xpath('//dc:description').text,
              created_at: core_xml.xpath('//dcterms:created').text,
              modified_at: core_xml.xpath('//dcterms:modified').text
            )
          end
        end

        details
      rescue
        {}
      end

      # Uses Yomu (Apache Tika) for .ppt files or as a last-resort
      # fallback; returns a diagnostic string rather than raising.
      def fallback_text_extraction
        Yomu.new(@file_path).text
      rescue => e
        "Unable to extract text from PowerPoint presentation: #{e.message}"
      end
    end
  end
end
@@ -0,0 +1,360 @@
1
module UniversalDocumentProcessor
  module Processors
    # Processor for text-based formats (plain text, RTF, HTML, XML, CSV,
    # JSON, Markdown). The concrete format is detected from the file
    # extension or, failing that, from a sample of the content.
    class TextProcessor < BaseProcessor
      # Pattern for http(s) URLs.
      URL_PATTERN = %r{https?://[^\s]+}.freeze
      # Pattern for e-mail addresses.
      # FIX: the original char class `[A-Z|a-z]` contained a literal '|',
      # so TLDs like "c|m" were accepted.
      EMAIL_PATTERN = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/.freeze
      # Pattern for North-American style phone numbers (e.g. 123-456-7890).
      PHONE_PATTERN = /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/.freeze

      # Extracts readable text, dispatching on the detected format.
      # :markdown falls through to plain-text extraction (no dedicated path).
      def extract_text
        with_error_handling do
          case detect_text_format
          when :rtf  then extract_rtf_text
          when :html then extract_html_text
          when :xml  then extract_xml_text
          when :csv  then extract_csv_text
          when :json then extract_json_text
          else extract_plain_text
          end
        end
      end

      # Returns base metadata enriched with text statistics: counts,
      # encoding, detected format and simple content flags.
      def extract_metadata
        with_error_handling do
          content = File.read(@file_path, encoding: detect_encoding)

          super.merge({
            text_format: detect_text_format,
            encoding: detect_encoding,
            line_count: content.lines.count,
            word_count: count_words(content),
            character_count: content.length,
            character_count_no_spaces: content.gsub(/\s/, '').length,
            paragraph_count: count_paragraphs(content),
            language: detect_language(content),
            has_urls: has_urls?(content),
            has_emails: has_emails?(content),
            has_phone_numbers: has_phone_numbers?(content)
          })
        end
      end

      # Returns a structural summary appropriate to the detected format.
      def extract_structure
        with_error_handling do
          case detect_text_format
          when :html then extract_html_structure
          when :xml  then extract_xml_structure
          when :csv  then extract_csv_structure
          when :json then extract_json_structure
          else extract_plain_structure
          end
        end
      end

      # Collects URLs, e-mail addresses and phone numbers found in the
      # text; HTML files additionally get per-category link extraction.
      def extract_links
        with_error_handling do
          content = File.read(@file_path, encoding: detect_encoding)

          links = {
            urls: extract_urls(content),
            emails: extract_emails(content),
            phone_numbers: extract_phone_numbers(content)
          }

          links.merge!(extract_html_links) if detect_text_format == :html
          links
        end
      end

      # Operations available on text documents beyond the base set.
      def supported_operations
        super + [:extract_structure, :extract_links, :analyze_sentiment, :extract_keywords]
      end

      private

      # Determines the text format from the extension, falling back to
      # content sniffing for unknown extensions.
      def detect_text_format
        case File.extname(@file_path).downcase
        when '.rtf'          then :rtf
        when '.html', '.htm' then :html
        when '.xml'          then :xml
        when '.csv'          then :csv
        when '.json'         then :json
        when '.md'           then :markdown
        else detect_format_from_content
        end
      end

      # Sniffs the format from the first 1000 bytes of the file.
      def detect_format_from_content
        sample = read_sample
        return :plain unless sample

        if sample.start_with?('{\rtf')
          :rtf
        elsif sample.match?(/<html|<!DOCTYPE/i)
          :html
        # FIX: the original /<?xml|.../ left the '?' unescaped, which made
        # the '<' optional — any content containing "xml" was classified
        # as XML. The declaration must be matched literally as "<?xml".
        elsif sample.match?(/<\?xml|<\w+.*>/i)
          :xml
        elsif sample.match?(/^[^,\n]*,[^,\n]*,/)
          :csv
        elsif sample.strip.start_with?('{', '[')
          :json
        else
          :plain
        end
      end

      # Reads up to the first 1000 bytes for sniffing; nil when unreadable.
      # File.read with a length argument returns binary (ASCII-8BIT) data,
      # so the sample is re-tagged as UTF-8 and scrubbed — otherwise regex
      # matching could raise on invalid byte sequences.
      def read_sample
        raw = File.read(@file_path, 1000)
        raw && raw.force_encoding('UTF-8').scrub
      rescue StandardError
        nil
      end

      # Best-effort encoding detection.
      # FIX: File.read does NOT raise just because an encoding is given and
      # the bytes are invalid — validity must be checked explicitly. The
      # original rescue-based branch never triggered, so every file was
      # reported as UTF-8.
      def detect_encoding
        content = File.read(@file_path, encoding: 'UTF-8')
        content.valid_encoding? ? 'UTF-8' : 'ISO-8859-1'
      rescue StandardError
        'ASCII'
      end

      # Reads the whole file in the detected encoding.
      def extract_plain_text
        File.read(@file_path, encoding: detect_encoding)
      end

      # Crude RTF-to-text conversion: strips control words and group
      # braces. A real RTF parser would be needed for full fidelity.
      def extract_rtf_text
        content = File.read(@file_path, encoding: detect_encoding)
        content.gsub(/\\[a-z]+\d*\s?/i, '').gsub(/[{}]/, '').strip
      rescue StandardError
        fallback_text_extraction
      end

      # Extracts visible text from HTML, dropping script/style content and
      # collapsing whitespace.
      def extract_html_text
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)
        doc.xpath('//script | //style').remove
        doc.text.gsub(/\s+/, ' ').strip
      rescue StandardError
        fallback_text_extraction
      end

      # Extracts the text content of an XML document.
      def extract_xml_text
        content = File.read(@file_path, encoding: detect_encoding)
        Nokogiri::XML(content).text.gsub(/\s+/, ' ').strip
      rescue StandardError
        fallback_text_extraction
      end

      # Summarizes a CSV file as readable text (header, row count, sample).
      def extract_csv_text
        content = File.read(@file_path, encoding: detect_encoding)
        lines = content.lines
        header = lines.first&.strip

        text_content = ["CSV Data:"]
        text_content << "Header: #{header}" if header
        text_content << "Rows: #{lines.length - 1}" if lines.length > 1
        text_content << "\nSample data:"
        text_content << lines[1..5].join if lines.length > 1

        text_content.join("\n")
      rescue StandardError
        fallback_text_extraction
      end

      # Renders parsed JSON via #inspect; returns the raw content when the
      # file is not valid JSON.
      def extract_json_text
        content = File.read(@file_path, encoding: detect_encoding)
        begin
          json_data = JSON.parse(content)
          "JSON Data: #{json_data.inspect}"
        rescue JSON::ParserError
          content
        end
      rescue StandardError
        fallback_text_extraction
      end

      # Structural summary of an HTML document (title, headings, links,
      # images, form/table counts). Returns {} on parse failure.
      def extract_html_structure
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)

        {
          title: doc.title,
          headings: extract_headings(doc),
          links: doc.css('a').map { |link| { text: link.text, href: link['href'] } },
          images: doc.css('img').map { |img| { alt: img['alt'], src: img['src'] } },
          forms: doc.css('form').length,
          tables: doc.css('table').length
        }
      rescue StandardError
        {}
      end

      # Structural summary of an XML document. Returns {} on parse failure.
      def extract_xml_structure
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::XML(content)

        {
          root_element: doc.root&.name,
          namespaces: doc.namespaces,
          element_count: doc.xpath('//*').length,
          attribute_count: doc.xpath('//@*').length
        }
      rescue StandardError
        {}
      end

      # Naive CSV structure summary. NOTE(review): splits on bare commas,
      # so quoted fields containing commas are miscounted — the stdlib CSV
      # parser would handle those; kept for behavior compatibility.
      def extract_csv_structure
        content = File.read(@file_path, encoding: detect_encoding)
        lines = content.lines

        {
          rows: lines.length,
          columns: lines.first&.split(',')&.length || 0,
          headers: lines.first&.strip&.split(','),
          sample_data: lines[1..3]&.map { |line| line.strip.split(',') }
        }
      rescue StandardError
        {}
      end

      # Recursive shape description of a JSON document; reports invalid
      # JSON explicitly. Returns {} on any other failure.
      def extract_json_structure
        content = File.read(@file_path, encoding: detect_encoding)
        begin
          analyze_json_structure(JSON.parse(content))
        rescue JSON::ParserError
          { error: 'Invalid JSON format' }
        end
      rescue StandardError
        {}
      end

      # Basic counts for unstructured text.
      def extract_plain_structure
        content = File.read(@file_path, encoding: detect_encoding)

        {
          lines: content.lines.count,
          paragraphs: count_paragraphs(content),
          words: count_words(content),
          characters: content.length
        }
      end

      # Counts word-character runs.
      def count_words(text)
        text.scan(/\b\w+\b/).length
      end

      # Paragraphs are blocks separated by one or more blank lines.
      def count_paragraphs(text)
        text.split(/\n\s*\n/).length
      end

      # Placeholder language detection based on common English stop words;
      # a proper language-detection library should replace this. Only
      # distinguishes 'English' from 'Unknown'.
      def detect_language(text)
        sample = text[0..1000].downcase
        english_words = /\b(the|and|for|are|but|not|you|all|can|had|her|was|one|our|out|day|get|has|him|his|how|its|may|new|now|old|see|two|who|boy|did|man|men|run|she|too|use|way|oil|sit|set|hot|let|say|try|ask|own)\b/
        sample.match?(english_words) ? 'English' : 'Unknown'
      end

      def has_urls?(text)
        text.match?(URL_PATTERN)
      end

      def has_emails?(text)
        text.match?(EMAIL_PATTERN)
      end

      def has_phone_numbers?(text)
        text.match?(PHONE_PATTERN)
      end

      def extract_urls(text)
        text.scan(URL_PATTERN)
      end

      def extract_emails(text)
        text.scan(EMAIL_PATTERN)
      end

      def extract_phone_numbers(text)
        text.scan(PHONE_PATTERN)
      end

      # Categorizes anchor hrefs into internal, external and mailto links.
      def extract_html_links
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)

        {
          internal_links: doc.css('a[href^="/"], a[href^="#"]').map { |link| link['href'] },
          external_links: doc.css('a[href^="http"]').map { |link| link['href'] },
          email_links: doc.css('a[href^="mailto:"]').map { |link| link['href'] }
        }
      rescue StandardError
        {}
      end

      # Returns heading texts grouped by level ("h1".."h6").
      def extract_headings(doc)
        (1..6).each_with_object({}) do |level, headings|
          headings["h#{level}"] = doc.css("h#{level}").map(&:text)
        end
      end

      # Recursively describes the shape of parsed JSON data: objects report
      # keys and nested shapes, arrays report length and element types,
      # scalars report type and a (truncated) value.
      def analyze_json_structure(data, path = [])
        case data
        when Hash
          {
            type: 'object',
            keys: data.keys,
            nested_structure: data.map { |k, v| [k, analyze_json_structure(v, path + [k])] }.to_h
          }
        when Array
          {
            type: 'array',
            length: data.length,
            element_types: data.map { |item| analyze_json_structure(item, path + ['[]']) }.uniq
          }
        else
          {
            type: data.class.name.downcase,
            value: data.is_a?(String) && data.length > 100 ? "#{data[0..100]}..." : data
          }
        end
      end

      # Last-resort raw read; returns a diagnostic string rather than
      # raising.
      def fallback_text_extraction
        File.read(@file_path, encoding: detect_encoding)
      rescue StandardError => e
        "Unable to extract text: #{e.message}"
      end
    end
  end
end