universal_document_processor 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AI_USAGE_GUIDE.md +404 -0
- data/CHANGELOG.md +67 -0
- data/GEM_RELEASE_GUIDE.md +288 -0
- data/Gemfile +27 -0
- data/LICENSE +21 -0
- data/README.md +726 -0
- data/Rakefile +36 -0
- data/lib/universal_document_processor/ai_agent.rb +491 -0
- data/lib/universal_document_processor/document.rb +225 -0
- data/lib/universal_document_processor/processors/archive_processor.rb +290 -0
- data/lib/universal_document_processor/processors/base_processor.rb +58 -0
- data/lib/universal_document_processor/processors/character_validator.rb +283 -0
- data/lib/universal_document_processor/processors/excel_processor.rb +219 -0
- data/lib/universal_document_processor/processors/image_processor.rb +172 -0
- data/lib/universal_document_processor/processors/pdf_processor.rb +105 -0
- data/lib/universal_document_processor/processors/powerpoint_processor.rb +214 -0
- data/lib/universal_document_processor/processors/text_processor.rb +360 -0
- data/lib/universal_document_processor/processors/word_processor.rb +137 -0
- data/lib/universal_document_processor/utils/file_detector.rb +83 -0
- data/lib/universal_document_processor/utils/japanese_filename_handler.rb +205 -0
- data/lib/universal_document_processor/version.rb +3 -0
- data/lib/universal_document_processor.rb +223 -0
- metadata +198 -0
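
All three processor files shown in full below subclass BaseProcessor and lean on a shared `with_error_handling` wrapper, a baseline `extract_metadata` hash, and a baseline `supported_operations` list. The actual base class ships in data/lib/universal_document_processor/processors/base_processor.rb (58 lines, not expanded in this diff); the following is only a minimal sketch of the contract the subclasses appear to assume, not the gem's real implementation — the error class and field names here are hypothetical.

# Hypothetical sketch of the BaseProcessor contract assumed by the
# subclasses below; the gem's real base_processor.rb may differ.
module UniversalDocumentProcessor
  module Processors
    class BaseProcessor
      def initialize(file_path)
        @file_path = file_path
      end

      # Runs a block and converts unexpected errors into a uniform failure.
      def with_error_handling
        yield
      rescue => e
        raise ProcessingError, "#{self.class.name}: #{e.message}"
      end

      # Subclasses merge format-specific fields into this baseline.
      def extract_metadata
        {
          file_name: File.basename(@file_path),
          file_size: File.size(@file_path)
        }
      end

      def supported_operations
        [:extract_text, :extract_metadata]
      end
    end
  end
end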
data/lib/universal_document_processor/processors/pdf_processor.rb
@@ -0,0 +1,105 @@
module UniversalDocumentProcessor
  module Processors
    class PdfProcessor < BaseProcessor
      def extract_text
        with_error_handling do
          reader = PDF::Reader.new(@file_path)
          text = reader.pages.map(&:text).join("\n")
          text.strip.empty? ? "No text content found in PDF" : text
        end
      end

      def extract_metadata
        with_error_handling do
          reader = PDF::Reader.new(@file_path)
          info = reader.info || {}

          super.merge({
            title: info[:Title],
            author: info[:Author],
            subject: info[:Subject],
            creator: info[:Creator],
            producer: info[:Producer],
            creation_date: info[:CreationDate],
            modification_date: info[:ModDate],
            page_count: reader.page_count,
            pdf_version: reader.pdf_version,
            encrypted: reader.encrypted?,
            form_fields: extract_form_fields(reader),
            bookmarks: extract_bookmarks(reader)
          })
        end
      end

      def extract_images
        with_error_handling do
          # Extract embedded images from PDF
          images = []
          reader = PDF::Reader.new(@file_path)

          reader.pages.each_with_index do |page, page_num|
            page.xobjects.each do |name, stream|
              if stream.hash[:Subtype] == :Image
                images << {
                  page: page_num + 1,
                  name: name,
                  width: stream.hash[:Width],
                  height: stream.hash[:Height],
                  color_space: stream.hash[:ColorSpace],
                  bits_per_component: stream.hash[:BitsPerComponent]
                }
              end
            end
          end

          images
        end
      end

      def extract_tables
        with_error_handling do
          # Basic table extraction from PDF text
          tables = []
          reader = PDF::Reader.new(@file_path)

          reader.pages.each_with_index do |page, page_num|
            text = page.text
            # Simple heuristic to detect table-like content
            lines = text.split("\n")
            table_lines = lines.select { |line| line.count("\t") > 1 || line.scan(/\s{3,}/).length > 2 }

            unless table_lines.empty?
              tables << {
                page: page_num + 1,
                rows: table_lines.length,
                content: table_lines
              }
            end
          end

          tables
        end
      end

      def supported_operations
        super + [:extract_images, :extract_tables, :extract_form_fields, :extract_bookmarks]
      end

      private

      def extract_form_fields(reader)
        # Extract PDF form fields if present
        []
      rescue
        []
      end

      def extract_bookmarks(reader)
        # Extract PDF bookmarks/outline if present
        []
      rescue
        []
      end
    end
  end
end
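
PdfProcessor is a thin wrapper over the pdf-reader gem: PDF::Reader does the parsing, and the processor reshapes pages, the info dictionary, and image XObjects into plain Ruby hashes (note that extract_form_fields and extract_bookmarks are stubs that always return empty arrays). A minimal usage sketch, assuming the constructor from the BaseProcessor sketch above; 'report.pdf' is a hypothetical input file.

require 'pdf-reader'

# Hypothetical: assumes BaseProcessor#initialize(file_path) as sketched above.
processor = UniversalDocumentProcessor::Processors::PdfProcessor.new('report.pdf')

puts processor.extract_text

meta = processor.extract_metadata
puts "#{meta[:page_count]} pages, PDF #{meta[:pdf_version]}, author: #{meta[:author]}"

# Each embedded image XObject becomes a hash such as
# { page: 3, name: :Im1, width: 1024, height: 768, ... }
processor.extract_images.each { |image| p image }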
data/lib/universal_document_processor/processors/powerpoint_processor.rb
@@ -0,0 +1,214 @@
module UniversalDocumentProcessor
  module Processors
    class PowerpointProcessor < BaseProcessor
      def extract_text
        with_error_handling do
          if @file_path.end_with?('.pptx')
            extract_pptx_text
          else
            # Fallback for .ppt files using Yomu
            fallback_text_extraction
          end
        end
      end

      def extract_metadata
        with_error_handling do
          if @file_path.end_with?('.pptx')
            extract_pptx_metadata
          else
            super
          end
        end
      end

      def extract_slides
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          slides = []

          # Use zip to read PPTX structure
          Zip::File.open(@file_path) do |zip|
            slide_files = zip.entries.select { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }

            slide_files.sort_by { |f| f.name[/slide(\d+)/, 1].to_i }.each_with_index do |slide_file, index|
              slide_content = zip.read(slide_file.name)
              slide_xml = Nokogiri::XML(slide_content)

              # Extract text from slide
              text_elements = slide_xml.xpath('//a:t', 'a' => 'http://schemas.openxmlformats.org/drawingml/2006/main')
              slide_text = text_elements.map(&:text).join(' ')

              slides << {
                slide_number: index + 1,
                text: slide_text,
                has_images: slide_xml.xpath('//a:blip').any?,
                has_charts: slide_xml.xpath('//c:chart').any?,
                has_tables: slide_xml.xpath('//a:tbl').any?
              }
            end
          end

          slides
        rescue => e
          # If ZIP parsing fails, return empty array
          []
        end
      end

      def extract_images
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          images = []

          Zip::File.open(@file_path) do |zip|
            # Find slide files
            slide_files = zip.entries.select { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }

            slide_files.each_with_index do |slide_file, slide_index|
              slide_content = zip.read(slide_file.name)
              slide_xml = Nokogiri::XML(slide_content)

              # Extract image references
              slide_xml.xpath('//a:blip', 'a' => 'http://schemas.openxmlformats.org/drawingml/2006/main').each_with_index do |blip, img_index|
                embed_id = blip['r:embed']
                if embed_id
                  images << {
                    slide_number: slide_index + 1,
                    image_index: img_index + 1,
                    embed_id: embed_id,
                    type: 'embedded'
                  }
                end
              end
            end
          end

          images
        rescue => e
          []
        end
      end

      def extract_notes
        with_error_handling do
          return [] unless @file_path.end_with?('.pptx')

          notes = []

          Zip::File.open(@file_path) do |zip|
            notes_files = zip.entries.select { |entry| entry.name.match?(/ppt\/notesSlides\/notesSlide\d+\.xml/) }

            notes_files.sort_by { |f| f.name[/notesSlide(\d+)/, 1].to_i }.each_with_index do |notes_file, index|
              notes_content = zip.read(notes_file.name)
              notes_xml = Nokogiri::XML(notes_content)

              # Extract text from notes
              text_elements = notes_xml.xpath('//a:t', 'a' => 'http://schemas.openxmlformats.org/drawingml/2006/main')
              notes_text = text_elements.map(&:text).join(' ')

              unless notes_text.strip.empty?
                notes << {
                  slide_number: index + 1,
                  notes: notes_text
                }
              end
            end
          end

          notes
        rescue => e
          []
        end
      end

      def supported_operations
        super + [:extract_slides, :extract_images, :extract_notes]
      end

      private

      def extract_pptx_text
        text_content = []

        begin
          Zip::File.open(@file_path) do |zip|
            slide_files = zip.entries.select { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }

            slide_files.sort_by { |f| f.name[/slide(\d+)/, 1].to_i }.each_with_index do |slide_file, index|
              slide_content = zip.read(slide_file.name)
              slide_xml = Nokogiri::XML(slide_content)

              text_content << "=== Slide #{index + 1} ==="

              # Extract all text elements
              text_elements = slide_xml.xpath('//a:t', 'a' => 'http://schemas.openxmlformats.org/drawingml/2006/main')
              slide_text = text_elements.map(&:text).reject(&:empty?).join(' ')

              text_content << slide_text unless slide_text.strip.empty?
              text_content << "" # Add blank line between slides
            end
          end

          text_content.join("\n")
        rescue => e
          # Fallback to Yomu if ZIP parsing fails
          fallback_text_extraction
        end
      end

      def extract_pptx_metadata
        slide_count = 0
        has_notes = false
        # Initialized outside the Zip block so the value is still in scope after it
        core_props = nil

        begin
          Zip::File.open(@file_path) do |zip|
            # Count slides
            slide_files = zip.entries.select { |entry| entry.name.match?(/ppt\/slides\/slide\d+\.xml/) }
            slide_count = slide_files.length

            # Check for notes
            notes_files = zip.entries.select { |entry| entry.name.match?(/ppt\/notesSlides\/notesSlide\d+\.xml/) }
            has_notes = notes_files.any?

            # Try to get core properties
            if zip.find_entry('docProps/core.xml')
              core_content = zip.read('docProps/core.xml')
              core_xml = Nokogiri::XML(core_content)

              core_props = {
                title: core_xml.xpath('//dc:title').text,
                author: core_xml.xpath('//dc:creator').text,
                subject: core_xml.xpath('//dc:subject').text,
                description: core_xml.xpath('//dc:description').text,
                created_at: core_xml.xpath('//dcterms:created').text,
                modified_at: core_xml.xpath('//dcterms:modified').text
              }
            end
          end

          metadata = super.merge({
            slide_count: slide_count,
            has_notes: has_notes,
            presentation_type: 'PowerPoint'
          })

          metadata.merge!(core_props) if core_props
          metadata
        rescue => e
          super
        end
      end

      def fallback_text_extraction
        # Use Yomu for .ppt files or as fallback
        Yomu.new(@file_path).text
      rescue => e
        "Unable to extract text from PowerPoint presentation: #{e.message}"
      end
    end
  end
end
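
The whole PowerPoint path rests on the fact that a .pptx file is a ZIP archive of XML parts: slides live at ppt/slides/slideN.xml, speaker notes at ppt/notesSlides/notesSlideN.xml, Dublin Core document properties at docProps/core.xml, and visible text is carried in <a:t> runs in the DrawingML namespace. A standalone sketch of the same technique using only rubyzip and Nokogiri, mirroring the calls in the file above; 'deck.pptx' is a hypothetical input file.

require 'zip'
require 'nokogiri'

DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main'

# Print the text runs of each slide in a .pptx (a ZIP of XML parts).
Zip::File.open('deck.pptx') do |zip| # hypothetical file
  slides = zip.entries.select { |e| e.name.match?(%r{ppt/slides/slide\d+\.xml}) }
  slides.sort_by { |e| e.name[/slide(\d+)/, 1].to_i }.each do |entry|
    xml = Nokogiri::XML(zip.read(entry.name))
    text = xml.xpath('//a:t', 'a' => DRAWINGML).map(&:text).join(' ')
    puts "#{entry.name}: #{text}"
  end
end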
data/lib/universal_document_processor/processors/text_processor.rb
@@ -0,0 +1,360 @@
module UniversalDocumentProcessor
  module Processors
    class TextProcessor < BaseProcessor
      def extract_text
        with_error_handling do
          case detect_text_format
          when :rtf
            extract_rtf_text
          when :html
            extract_html_text
          when :xml
            extract_xml_text
          when :csv
            extract_csv_text
          when :json
            extract_json_text
          else
            extract_plain_text
          end
        end
      end

      def extract_metadata
        with_error_handling do
          content = File.read(@file_path, encoding: detect_encoding)

          super.merge({
            text_format: detect_text_format,
            encoding: detect_encoding,
            line_count: content.lines.count,
            word_count: count_words(content),
            character_count: content.length,
            character_count_no_spaces: content.gsub(/\s/, '').length,
            paragraph_count: count_paragraphs(content),
            language: detect_language(content),
            has_urls: has_urls?(content),
            has_emails: has_emails?(content),
            has_phone_numbers: has_phone_numbers?(content)
          })
        end
      end

      def extract_structure
        with_error_handling do
          case detect_text_format
          when :html
            extract_html_structure
          when :xml
            extract_xml_structure
          when :csv
            extract_csv_structure
          when :json
            extract_json_structure
          else
            extract_plain_structure
          end
        end
      end

      def extract_links
        with_error_handling do
          content = File.read(@file_path, encoding: detect_encoding)

          links = {
            urls: extract_urls(content),
            emails: extract_emails(content),
            phone_numbers: extract_phone_numbers(content)
          }

          if detect_text_format == :html
            links.merge!(extract_html_links)
          end

          links
        end
      end

      def supported_operations
        super + [:extract_structure, :extract_links, :analyze_sentiment, :extract_keywords]
      end

      private

      def detect_text_format
        extension = File.extname(@file_path).downcase
        case extension
        when '.rtf'
          :rtf
        when '.html', '.htm'
          :html
        when '.xml'
          :xml
        when '.csv'
          :csv
        when '.json'
          :json
        when '.md'
          :markdown
        else
          # Try to detect by content
          content = File.read(@file_path, 1000, encoding: 'UTF-8') rescue nil
          return :plain unless content

          if content.start_with?('{\rtf')
            :rtf
          elsif content.match?(/<html|<HTML|<!DOCTYPE/i)
            :html
          elsif content.match?(/<\?xml|<\w+.*>/i)
            :xml
          elsif content.match?(/^[^,\n]*,[^,\n]*,/)
            :csv
          elsif content.strip.start_with?('{') || content.strip.start_with?('[')
            :json
          else
            :plain
          end
        end
      end

      def detect_encoding
        # Simple encoding detection
        begin
          content = File.read(@file_path, encoding: 'UTF-8')
          'UTF-8'
        rescue Encoding::InvalidByteSequenceError
          begin
            content = File.read(@file_path, encoding: 'ISO-8859-1')
            'ISO-8859-1'
          rescue
            'ASCII'
          end
        end
      end

      def extract_plain_text
        File.read(@file_path, encoding: detect_encoding)
      end

      def extract_rtf_text
        # RTF text extraction would require an RTF parsing library;
        # this is a simplified version
        content = File.read(@file_path, encoding: detect_encoding)
        # Remove RTF control codes (basic cleanup)
        content.gsub(/\\[a-z]+\d*\s?/i, '').gsub(/[{}]/, '').strip
      rescue => e
        fallback_text_extraction
      end

      def extract_html_text
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)

        # Remove script and style elements
        doc.xpath('//script | //style').remove

        # Extract text content
        doc.text.gsub(/\s+/, ' ').strip
      rescue => e
        fallback_text_extraction
      end

      def extract_xml_text
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::XML(content)
        doc.text.gsub(/\s+/, ' ').strip
      rescue => e
        fallback_text_extraction
      end

      def extract_csv_text
        content = File.read(@file_path, encoding: detect_encoding)
        # Convert CSV to readable text format
        lines = content.lines
        header = lines.first&.strip

        text_content = ["CSV Data:"]
        text_content << "Header: #{header}" if header
        text_content << "Rows: #{lines.length - 1}" if lines.length > 1
        text_content << "\nSample data:"
        text_content << lines[1..5].join if lines.length > 1

        text_content.join("\n")
      rescue => e
        fallback_text_extraction
      end

      def extract_json_text
        content = File.read(@file_path, encoding: detect_encoding)
        begin
          json_data = JSON.parse(content)
          "JSON Data: #{json_data.inspect}"
        rescue JSON::ParserError
          content
        end
      rescue => e
        fallback_text_extraction
      end

      def extract_html_structure
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)

        {
          title: doc.title,
          headings: extract_headings(doc),
          links: doc.css('a').map { |link| { text: link.text, href: link['href'] } },
          images: doc.css('img').map { |img| { alt: img['alt'], src: img['src'] } },
          forms: doc.css('form').length,
          tables: doc.css('table').length
        }
      rescue => e
        {}
      end

      def extract_xml_structure
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::XML(content)

        {
          root_element: doc.root&.name,
          namespaces: doc.namespaces,
          element_count: doc.xpath('//*').length,
          attribute_count: doc.xpath('//@*').length
        }
      rescue => e
        {}
      end

      def extract_csv_structure
        content = File.read(@file_path, encoding: detect_encoding)
        lines = content.lines

        {
          rows: lines.length,
          columns: lines.first&.split(',')&.length || 0,
          headers: lines.first&.strip&.split(','),
          sample_data: lines[1..3]&.map { |line| line.strip.split(',') }
        }
      rescue => e
        {}
      end

      def extract_json_structure
        content = File.read(@file_path, encoding: detect_encoding)
        begin
          json_data = JSON.parse(content)
          analyze_json_structure(json_data)
        rescue JSON::ParserError
          { error: 'Invalid JSON format' }
        end
      rescue => e
        {}
      end

      def extract_plain_structure
        content = File.read(@file_path, encoding: detect_encoding)

        {
          lines: content.lines.count,
          paragraphs: count_paragraphs(content),
          words: count_words(content),
          characters: content.length
        }
      end

      def count_words(text)
        text.scan(/\b\w+\b/).length
      end

      def count_paragraphs(text)
        text.split(/\n\s*\n/).length
      end

      def detect_language(text)
        # Simple language detection based on common words
        # This is a placeholder - would use a proper language detection library
        sample = text[0..1000].downcase

        if sample.match?(/\b(the|and|for|are|but|not|you|all|can|had|her|was|one|our|out|day|get|has|him|his|how|its|may|new|now|old|see|two|who|boy|did|man|men|run|she|too|use|way|oil|sit|set|hot|let|say|try|ask|own)\b/)
          'English'
        else
          'Unknown'
        end
      end

      def has_urls?(text)
        text.match?(/https?:\/\/[^\s]+/)
      end

      def has_emails?(text)
        text.match?(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/)
      end

      def has_phone_numbers?(text)
        text.match?(/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/)
      end

      def extract_urls(text)
        text.scan(/https?:\/\/[^\s]+/)
      end

      def extract_emails(text)
        text.scan(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/)
      end

      def extract_phone_numbers(text)
        text.scan(/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/)
      end

      def extract_html_links
        content = File.read(@file_path, encoding: detect_encoding)
        doc = Nokogiri::HTML(content)

        {
          internal_links: doc.css('a[href^="/"], a[href^="#"]').map { |link| link['href'] },
          external_links: doc.css('a[href^="http"]').map { |link| link['href'] },
          email_links: doc.css('a[href^="mailto:"]').map { |link| link['href'] }
        }
      rescue
        {}
      end

      def extract_headings(doc)
        headings = {}
        (1..6).each do |level|
          headings["h#{level}"] = doc.css("h#{level}").map(&:text)
        end
        headings
      end

      def analyze_json_structure(data, path = [])
        case data
        when Hash
          {
            type: 'object',
            keys: data.keys,
            nested_structure: data.map { |k, v| [k, analyze_json_structure(v, path + [k])] }.to_h
          }
        when Array
          {
            type: 'array',
            length: data.length,
            element_types: data.map { |item| analyze_json_structure(item, path + ['[]']) }.uniq
          }
        else
          {
            type: data.class.name.downcase,
            value: data.is_a?(String) && data.length > 100 ? "#{data[0..100]}..." : data
          }
        end
      end

      def fallback_text_extraction
        File.read(@file_path, encoding: detect_encoding)
      rescue => e
        "Unable to extract text: #{e.message}"
      end
    end
  end
end
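
TextProcessor routes on file extension first and, for unknown extensions, sniffs the first 1000 bytes: an `{\rtf` prefix means RTF, an HTML or XML tag means markup, a line with two commas suggests CSV, and a leading brace or bracket suggests JSON. A small round-trip sketch, again assuming the hypothetical BaseProcessor constructor from the sketch near the top of this diff and that `with_error_handling` returns the block's value.

require 'tempfile'
require 'json'
require 'nokogiri'

# Write a tiny CSV; the .csv extension should route extract_structure
# to extract_csv_structure.
Tempfile.create(['contacts', '.csv']) do |f|
  f.write("name,email\nAda,ada@example.com\n")
  f.flush

  processor = UniversalDocumentProcessor::Processors::TextProcessor.new(f.path)
  p processor.extract_structure
  # => { rows: 2, columns: 2, headers: ["name", "email"],
  #      sample_data: [["Ada", "ada@example.com"]] }
end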