universal_document_processor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,283 @@
1
module UniversalDocumentProcessor
  module Processors
    # Validates and repairs text character data: detects encoding problems,
    # control characters, null bytes and Unicode replacement characters,
    # produces character statistics, and performs script detection with
    # special handling for Japanese text (hiragana/katakana/kanji/fullwidth).
    class CharacterValidator
      # Control characters that are never legitimate in document text.
      # Deliberately allows TAB (\x09), LF (\x0A) and CR (\x0D).
      INVALID_CONTROL_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/
      REPLACEMENT_CHAR = "\uFFFD".freeze # Unicode replacement character (marks undecodable bytes)
      NULL_BYTE = "\x00".freeze

      # Full character-level analysis of +text+.
      #
      # Returns an empty Hash for nil/empty input; otherwise a Hash with the
      # encoding name, validity flags, a list of detected issues, a cleaned
      # copy of the text, per-category character counts and Japanese analysis.
      def self.analyze_text(text)
        return {} if text.nil? || text.empty?

        {
          encoding: text.encoding.name,
          valid_encoding: text.valid_encoding?,
          has_invalid_chars: has_invalid_characters?(text),
          has_control_chars: has_control_characters?(text),
          has_null_bytes: has_null_bytes?(text),
          has_replacement_chars: has_replacement_characters?(text),
          has_non_printable: has_non_printable_characters?(text),
          character_issues: detect_character_issues(text),
          cleaned_text: clean_text(text),
          statistics: character_statistics(text),
          japanese_analysis: validate_japanese_text(text)
        }
      end

      # True when the text has invalid byte sequences for its encoding, or
      # already contains U+FFFD markers from an earlier lossy decode.
      def self.has_invalid_characters?(text)
        !text.valid_encoding? || text.include?(REPLACEMENT_CHAR)
      end

      # True when the text contains disallowed control characters.
      def self.has_control_characters?(text)
        text.match?(INVALID_CONTROL_CHARS)
      end

      # True when the text contains NUL bytes.
      def self.has_null_bytes?(text)
        text.include?(NULL_BYTE)
      end

      # True when the text contains U+FFFD replacement characters.
      def self.has_replacement_characters?(text)
        text.include?(REPLACEMENT_CHAR)
      end

      # True when the text contains characters that are neither printable
      # (per Unicode) nor common whitespace.
      def self.has_non_printable_characters?(text)
        # \s already covers \t\n\r; they are listed explicitly for clarity.
        text.match?(/[^\p{Print}\s\t\n\r]/)
      end

      # Scans +text+ and returns an Array of issue Hashes, each with :type,
      # :message and :severity (plus :positions/:characters/:patterns where
      # applicable). Returns [] for clean text.
      def self.detect_character_issues(text)
        issues = []

        # Invalid byte sequences for the declared encoding.
        unless text.valid_encoding?
          issues << {
            type: 'invalid_encoding',
            message: "Text contains invalid #{text.encoding.name} sequences",
            severity: 'high'
          }
        end

        # Embedded NUL bytes (often a sign of binary data).
        if has_null_bytes?(text)
          null_positions = find_character_positions(text, NULL_BYTE)
          issues << {
            type: 'null_bytes',
            message: "Text contains #{null_positions.length} null bytes",
            positions: null_positions,
            severity: 'high'
          }
        end

        # Disallowed control characters, reported as \xNN escapes.
        if has_control_characters?(text)
          control_chars = text.scan(INVALID_CONTROL_CHARS).uniq
          issues << {
            type: 'control_characters',
            message: "Text contains control characters: #{control_chars.map { |c| "\\x#{c.ord.to_s(16).upcase}" }.join(', ')}",
            characters: control_chars,
            severity: 'medium'
          }
        end

        # U+FFFD markers left behind by a previous lossy conversion.
        if has_replacement_characters?(text)
          replacement_positions = find_character_positions(text, REPLACEMENT_CHAR)
          issues << {
            type: 'replacement_characters',
            message: "Text contains #{replacement_positions.length} replacement characters (corrupted data)",
            positions: replacement_positions,
            severity: 'medium'
          }
        end

        # Heuristic patterns that often indicate corrupted extraction.
        suspicious_patterns = detect_suspicious_patterns(text)
        unless suspicious_patterns.empty?
          issues << {
            type: 'suspicious_patterns',
            message: "Text contains suspicious character patterns",
            patterns: suspicious_patterns,
            severity: 'low'
          }
        end

        issues
      end

      # Returns a cleaned copy of +text+. All steps are on by default and can
      # be disabled individually via +options+:
      #   :remove_null_bytes         - strip NUL bytes (default true)
      #   :remove_control_chars      - replace control chars (default true)
      #   :control_char_replacement  - replacement string (default ' ')
      #   :remove_replacement_chars  - strip U+FFFD (default false)
      #   :normalize_whitespace      - collapse runs of whitespace, incl.
      #                                newlines, to single spaces and strip
      #                                the ends (default true)
      #   :force_encoding            - re-encode to UTF-8 dropping bad bytes
      #                                (default false)
      def self.clean_text(text, options = {})
        cleaned = text.dup

        # Remove null bytes
        cleaned.gsub!(NULL_BYTE, '') if options[:remove_null_bytes] != false

        # Remove or replace control characters
        if options[:remove_control_chars] != false
          cleaned.gsub!(INVALID_CONTROL_CHARS, options[:control_char_replacement] || ' ')
        end

        # Handle replacement characters (opt-in: they may mark data loss the
        # caller wants to keep visible)
        if options[:remove_replacement_chars]
          cleaned.gsub!(REPLACEMENT_CHAR, '')
        end

        # Normalize whitespace
        if options[:normalize_whitespace] != false
          cleaned.gsub!(/\s+/, ' ')
          cleaned.strip!
        end

        # Ensure valid encoding by dropping undecodable bytes
        if options[:force_encoding] && !cleaned.valid_encoding?
          cleaned = cleaned.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
        end

        cleaned
      end

      # Per-category character counts for +text+. Note the script ranges
      # overlap (e.g. kanji_chars is a subrange of chinese_chars), so the
      # counts are not mutually exclusive.
      def self.character_statistics(text)
        {
          total_chars: text.length,
          printable_chars: text.count("\u{20}-\u{7E}\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}"),
          control_chars: text.scan(INVALID_CONTROL_CHARS).length,
          whitespace_chars: text.count(" \t\n\r"),
          null_bytes: text.count(NULL_BYTE),
          replacement_chars: text.count(REPLACEMENT_CHAR),
          unicode_chars: text.count("\u{80}-\u{FFFF}"),
          ascii_chars: text.count("\u{00}-\u{7F}"),
          # Japanese character statistics
          japanese_chars: count_japanese_characters(text),
          hiragana_chars: text.count("\u{3040}-\u{309F}"),
          katakana_chars: text.count("\u{30A0}-\u{30FF}"),
          kanji_chars: text.count("\u{4E00}-\u{9FAF}"),
          fullwidth_chars: text.count("\u{FF00}-\u{FFEF}"),
          # Other Asian scripts
          chinese_chars: text.count("\u{4E00}-\u{9FFF}"),
          korean_chars: text.count("\u{AC00}-\u{D7A3}")
        }
      end

      # Tries a fixed list of likely encodings against the file and returns
      # the first one that decodes cleanly, together with the content and its
      # analysis. Falls back to a BINARY read (valid: false) when none work.
      def self.validate_file_encoding(file_path)
        encodings_to_try = ['UTF-8', 'ISO-8859-1', 'Windows-1252', 'Shift_JIS', 'EUC-JP', 'ASCII']

        encodings_to_try.each do |encoding|
          begin
            content = File.read(file_path, encoding: encoding)
            if content.valid_encoding?
              return {
                detected_encoding: encoding,
                valid: true,
                content: content,
                analysis: analyze_text(content)
              }
            end
          rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
            next
          end
        end

        # If no encoding works, read as binary and analyze
        {
          detected_encoding: 'BINARY',
          valid: false,
          content: File.read(file_path, encoding: 'BINARY'),
          analysis: { has_invalid_chars: true }
        }
      end

      # Repairs +text+ using one of three strategies:
      #   :conservative - strip null bytes and control characters only
      #   :aggressive   - strip every non-printable character
      #   :replace      - like conservative, but also force-valid UTF-8
      # Any other strategy returns the text unchanged.
      def self.repair_text(text, strategy = :conservative)
        case strategy
        when :conservative
          # Only remove clearly invalid characters
          clean_text(text, remove_null_bytes: true, remove_control_chars: true)
        when :aggressive
          # Remove all non-printable characters
          text.gsub(/[^\p{Print}\s]/, '')
        when :replace
          # Replace invalid characters with safe alternatives
          clean_text(text,
            remove_null_bytes: true,
            remove_control_chars: true,
            control_char_replacement: ' ',
            force_encoding: true
          )
        else
          text
        end
      end

      # Returns the Japanese script families present in +text+ as an Array of
      # strings drawn from: 'hiragana', 'katakana', 'kanji', 'fullwidth'.
      def self.detect_japanese_script(text)
        scripts = []
        scripts << 'hiragana' if text.match?(/[\u{3040}-\u{309F}]/)
        scripts << 'katakana' if text.match?(/[\u{30A0}-\u{30FF}]/)
        scripts << 'kanji' if text.match?(/[\u{4E00}-\u{9FAF}]/)
        scripts << 'fullwidth' if text.match?(/[\u{FF00}-\u{FFEF}]/)
        scripts
      end

      # Heuristic: text is considered Japanese when more than 10% of its
      # non-whitespace characters fall in Japanese ranges.
      def self.is_japanese_text?(text)
        japanese_chars = count_japanese_characters(text)
        total_chars = text.gsub(/\s/, '').length
        return false if total_chars == 0

        (japanese_chars.to_f / total_chars) > 0.1
      end

      # Total count of hiragana + katakana + kanji + fullwidth characters.
      def self.count_japanese_characters(text)
        hiragana = text.count("\u{3040}-\u{309F}")
        katakana = text.count("\u{30A0}-\u{30FF}")
        kanji = text.count("\u{4E00}-\u{9FAF}")
        fullwidth = text.count("\u{FF00}-\u{FFEF}")

        hiragana + katakana + kanji + fullwidth
      end

      # Japanese-specific summary: { japanese: false } for non-Japanese text,
      # otherwise scripts present, character count and whether the text mixes
      # Japanese with Latin script (common and legitimate, e.g. "Hello 世界").
      def self.validate_japanese_text(text)
        return { japanese: false } unless is_japanese_text?(text)

        {
          japanese: true,
          scripts: detect_japanese_script(text),
          character_count: count_japanese_characters(text),
          mixed_with_latin: text.match?(/[\p{Latin}]/) && text.match?(/[\u{3040}-\u{30FF}\u{4E00}-\u{9FAF}]/),
          valid_japanese: true # Japanese characters are always valid
        }
      end

      # Character indices of every occurrence of +char+ in +text+.
      def self.find_character_positions(text, char)
        positions = []
        text.chars.each_with_index do |c, index|
          positions << index if c == char
        end
        positions
      end

      # Heuristic corruption signals: very long runs of one character,
      # excessive whitespace runs, and Latin letters immediately followed by
      # Cyrillic/Arabic/Hebrew (Japanese+Latin is NOT flagged — "Company株式会社"
      # is normal).
      def self.detect_suspicious_patterns(text)
        patterns = []

        # Long sequences of the same character
        if text.match?(/(.)\1{20,}/)
          patterns << 'long_repetition'
        end

        # Excessive whitespace
        if text.match?(/\s{50,}/)
          patterns << 'excessive_whitespace'
        end

        # Mixed scripts that might indicate corruption (but allow common combinations)
        if text.match?(/[\p{Latin}][\p{Cyrillic}\p{Arabic}\p{Hebrew}]/)
          patterns << 'mixed_scripts'
        end

        patterns
      end

      # FIX: the original used a bare `private` marker, which has no effect on
      # `def self.` singleton methods — these helpers were accidentally public.
      # private_class_method actually hides them, matching the author's intent.
      private_class_method :find_character_positions, :detect_suspicious_patterns
    end
  end
end
@@ -0,0 +1,219 @@
1
module UniversalDocumentProcessor
  module Processors
    # Spreadsheet processor built on the Roo gem. Each public method re-opens
    # the workbook via Roo::Spreadsheet.open and iterates its sheets.
    #
    # NOTE(review): `workbook.sheet(sheet_name)` appears to switch Roo's
    # *default* sheet in place — all subsequent first_row/last_row/cell calls
    # implicitly target that sheet, so the call order inside every loop below
    # is load-bearing. Confirm against the Roo API before restructuring.
    #
    # Error handling and base metadata come from BaseProcessor (not visible in
    # this file): every method wraps its work in `with_error_handling`.
    class ExcelProcessor < BaseProcessor
      # Plain-text dump of the whole workbook: a "=== Sheet: name ===" banner
      # per sheet, then each row with data as " | "-joined cell strings.
      # Rows whose cells are all nil/empty are skipped; sheets are separated
      # by a blank line. Returns a single newline-joined String.
      def extract_text
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          text_content = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            text_content << "=== Sheet: #{sheet_name} ==="

            # Get all rows with data (last_row is nil for an empty sheet)
            if workbook.last_row
              (workbook.first_row..workbook.last_row).each do |row|
                row_data = []
                (workbook.first_column..workbook.last_column).each do |col|
                  cell_value = workbook.cell(row, col)
                  row_data << cell_value.to_s if cell_value
                end
                text_content << row_data.join(' | ') unless row_data.all?(&:empty?)
              end
            end

            text_content << "" # Add blank line between sheets
          end

          text_content.join("\n")
        end
      end

      # Merges workbook-level metadata into the BaseProcessor metadata Hash:
      # sheet count/names, per-sheet dimensions, aggregate row/column totals,
      # and formula/chart presence flags.
      def extract_metadata
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)

          sheet_info = {}
          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            # Dimensions default to 0 for empty sheets (Roo returns nil).
            sheet_info[sheet_name] = {
              rows: workbook.last_row || 0,
              columns: workbook.last_column || 0,
              first_row: workbook.first_row || 0,
              first_column: workbook.first_column || 0
            }
          end

          super.merge({
            sheet_count: workbook.sheets.length,
            sheet_names: workbook.sheets,
            sheet_info: sheet_info,
            total_rows: sheet_info.values.sum { |info| info[:rows] },
            # Widest sheet across the workbook, 0 when there are no sheets.
            total_columns: sheet_info.values.map { |info| info[:columns] }.max || 0,
            has_formulas: detect_formulas(workbook),
            has_charts: detect_charts(workbook)
          })
        end
      end

      # One table Hash per non-empty sheet:
      #   { sheet_name:, rows:, columns:, data: [[String,...],...], headers: [...] }
      # Headers come from the sheet's first row (nil cells become "Column N").
      # NOTE: :data includes the header row as well — row extraction starts at
      # first_row, not first_row + 1.
      def extract_tables
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          tables = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            next unless workbook.last_row

            table_data = {
              sheet_name: sheet_name,
              rows: workbook.last_row,
              columns: workbook.last_column,
              data: [],
              headers: []
            }

            # Extract headers (first row)
            if workbook.first_row
              (workbook.first_column..workbook.last_column).each do |col|
                header = workbook.cell(workbook.first_row, col)
                table_data[:headers] << (header ? header.to_s : "Column #{col}")
              end
            end

            # Extract all data (nil cells become empty strings)
            (workbook.first_row..workbook.last_row).each do |row|
              row_data = []
              (workbook.first_column..workbook.last_column).each do |col|
                cell_value = workbook.cell(row, col)
                row_data << (cell_value ? cell_value.to_s : "")
              end
              table_data[:data] << row_data
            end

            tables << table_data
          end

          tables
        end
      end

      # Scans every cell of every sheet and returns an Array of Hashes for
      # cells that have a formula: { sheet:, row:, column:, formula:, value: }.
      # Guarded with respond_to?(:formula) because not every Roo backend
      # (e.g. CSV) exposes formulas. O(rows x columns) per sheet.
      def extract_formulas
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          formulas = []

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            next unless workbook.last_row

            (workbook.first_row..workbook.last_row).each do |row|
              (workbook.first_column..workbook.last_column).each do |col|
                if workbook.respond_to?(:formula) && workbook.formula(row, col)
                  formulas << {
                    sheet: sheet_name,
                    row: row,
                    column: col,
                    formula: workbook.formula(row, col),
                    value: workbook.cell(row, col)
                  }
                end
              end
            end
          end

          formulas
        end
      end

      # Placeholder: always returns []. Chart extraction would require parsing
      # the underlying XLSX parts, which Roo does not expose.
      def extract_charts
        with_error_handling do
          # Chart extraction would require more complex parsing
          # This is a placeholder for future implementation
          []
        end
      end

      # Extends BaseProcessor's operation list with spreadsheet-specific ones.
      # NOTE(review): :extract_pivot_tables is advertised here but no such
      # method is defined in this class — confirm it exists in BaseProcessor.
      def supported_operations
        super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables]
      end

      # CSV export. With +sheet_name+ returns that sheet's CSV String; without
      # it returns a Hash of { sheet_name => csv_string } for every sheet.
      def to_csv(sheet_name = nil)
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)

          if sheet_name
            workbook.sheet(sheet_name)
            workbook.to_csv
          else
            # Convert all sheets to CSV
            csv_data = {}
            workbook.sheets.each do |name|
              workbook.sheet(name)
              csv_data[name] = workbook.to_csv
            end
            csv_data
          end
        end
      end

      # JSON export: { sheet_name => [ {header => value, ...}, ... ] }.
      # The first row supplies the keys; data rows start at first_row + 1.
      # Empty sheets are skipped entirely (no key in the result).
      # NOTE(review): this overrides Object#to_json with zero arity, so
      # embedding an ExcelProcessor inside a structure serialized by the json
      # gem (which passes a generator state argument) would raise — confirm
      # callers only invoke it directly.
      def to_json
        with_error_handling do
          workbook = Roo::Spreadsheet.open(@file_path)
          json_data = {}

          workbook.sheets.each do |sheet_name|
            workbook.sheet(sheet_name)
            sheet_data = []

            next unless workbook.last_row

            # Get headers
            headers = []
            (workbook.first_column..workbook.last_column).each do |col|
              header = workbook.cell(workbook.first_row, col)
              headers << (header ? header.to_s : "Column #{col}")
            end

            # Get data rows (raw cell values, not stringified)
            ((workbook.first_row + 1)..workbook.last_row).each do |row|
              row_hash = {}
              (workbook.first_column..workbook.last_column).each_with_index do |col, index|
                cell_value = workbook.cell(row, col)
                row_hash[headers[index]] = cell_value
              end
              sheet_data << row_hash
            end

            json_data[sheet_name] = sheet_data
          end

          json_data.to_json
        end
      end

      private

      # True when any cell in any sheet carries a formula. Short-circuits on
      # the first hit via #any?. The method-level rescue maps *any*
      # StandardError (unsupported format, Roo internals) to false rather
      # than failing metadata extraction.
      def detect_formulas(workbook)
        workbook.sheets.any? do |sheet_name|
          workbook.sheet(sheet_name)
          next false unless workbook.last_row

          (workbook.first_row..workbook.last_row).any? do |row|
            (workbook.first_column..workbook.last_column).any? do |col|
              workbook.respond_to?(:formula) && workbook.formula(row, col)
            end
          end
        end
      rescue
        false
      end

      # Placeholder: always false. See extract_charts.
      def detect_charts(workbook)
        # Chart detection would require more complex parsing
        # This is a placeholder for future implementation
        false
      end
    end
  end
end
@@ -0,0 +1,172 @@
1
module UniversalDocumentProcessor
  module Processors
    # Image processor built on MiniMagick (ImageMagick bindings). Provides
    # metadata/EXIF extraction, dominant-color analysis, resizing, format
    # conversion and thumbnailing. OCR and face detection are placeholders.
    #
    # Error handling comes from BaseProcessor (not visible in this file):
    # every public method wraps its work in `with_error_handling`.
    class ImageProcessor < BaseProcessor
      # Images carry no extractable text; returns a descriptive label built
      # from the file name. Could be extended with OCR (see extract_text_ocr).
      def extract_text
        with_error_handling do
          # Images don't contain extractable text by default
          # This could be extended with OCR functionality
          "Image file: #{File.basename(@file_path)}"
        end
      end

      # Merges image-level metadata (dimensions, format, colorspace,
      # resolution, compression, quality, EXIF, color profile, transparency)
      # into the BaseProcessor metadata Hash.
      def extract_metadata
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)

          super.merge({
            width: image.width,
            height: image.height,
            format: image.type,
            colorspace: image.colorspace,
            resolution: extract_resolution(image),
            # String-key lookups delegate to ImageMagick's identify -format
            # attribute queries; they return nil when the attribute is absent.
            compression: image['compression'],
            quality: image['quality'],
            exif_data: extract_exif_data(image),
            color_profile: extract_color_profile(image),
            has_transparency: has_transparency?(image)
          })
        end
      end

      # Dominant-color extraction: shells out to ImageMagick's `convert` with
      # a 10-color quantized histogram and parses lines shaped like
      # "count: (r,g,b) #HEX". Returns [{count:, rgb:, hex:}, ...] sorted by
      # descending frequency.
      #
      # NOTE(review): relies on the legacy IM6 `convert` binary and on
      # MiniMagick::Image#run_command, which newer MiniMagick versions have
      # removed — verify against the pinned minimagick version. The
      # method-level rescue swallows every StandardError and returns []
      # (the rescued exception `e` is unused).
      def extract_colors
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)

          # Get dominant colors using ImageMagick's histogram
          colors = []
          histogram_output = image.run_command('convert', @file_path, '-colors', '10', '-depth', '8', '-format', '%c', 'histogram:info:-')

          histogram_output.split("\n").each do |line|
            if line.match(/(\d+):\s+\(([^)]+)\)\s+(#\w+)/)
              count = $1.to_i
              rgb = $2
              hex = $3
              colors << {
                count: count,
                rgb: rgb,
                hex: hex
              }
            end
          end

          colors.sort_by { |c| -c[:count] }
        end
      rescue => e
        []
      end

      # Resizes to fit within width x height (ImageMagick geometry — aspect
      # ratio preserved, never enlarged beyond the box). Writes to
      # +output_path+ and returns that path when given; otherwise returns the
      # image bytes as a binary String.
      def resize(width, height, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.resize "#{width}x#{height}"

          if output_path
            image.write(output_path)
            output_path
          else
            # Return as blob
            image.to_blob
          end
        end
      end

      # Converts the image to +target_format+ (e.g. :png, "JPEG" — downcased
      # before use). Same output convention as #resize: path when
      # +output_path+ is given, binary blob otherwise.
      def convert_format(target_format, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.format(target_format.to_s.downcase)

          if output_path
            image.write(output_path)
            output_path
          else
            # Return as blob
            image.to_blob
          end
        end
      end

      # Square-bounded thumbnail (default 150x150, aspect preserved).
      # Same output convention as #resize.
      def create_thumbnail(size = 150, output_path = nil)
        with_error_handling do
          image = MiniMagick::Image.open(@file_path)
          image.resize "#{size}x#{size}"

          if output_path
            image.write(output_path)
            output_path
          else
            image.to_blob
          end
        end
      end

      # Placeholder: always returns []. Face detection would need OpenCV or
      # an external vision API.
      def extract_faces
        with_error_handling do
          # Placeholder for face detection
          # Would require additional libraries like opencv or face detection APIs
          []
        end
      end

      # Placeholder: returns an explanatory String. Real OCR would require
      # tesseract (e.g. the rtesseract gem).
      def extract_text_ocr
        with_error_handling do
          # Placeholder for OCR functionality
          # Would require tesseract or similar OCR library
          "OCR not implemented - would require tesseract gem"
        end
      end

      # Extends BaseProcessor's operation list with image-specific operations.
      def supported_operations
        super + [:extract_colors, :resize, :convert_format, :create_thumbnail, :extract_faces, :extract_text_ocr]
      end

      private

      # DPI/PPI resolution as { x:, y:, units: }; all-nil Hash when the
      # image carries no resolution metadata (method-level rescue).
      def extract_resolution(image)
        {
          x: image.resolution[0],
          y: image.resolution[1],
          units: image['units']
        }
      rescue
        { x: nil, y: nil, units: nil }
      end

      # Extracts a fixed allowlist of common EXIF tags, returned with the
      # "exif:" prefix stripped. Tags missing from the image are omitted;
      # any lookup failure yields {} (method-level rescue).
      def extract_exif_data(image)
        exif = {}

        # Common EXIF tags
        exif_tags = %w[
          exif:DateTime exif:DateTimeOriginal exif:DateTimeDigitized
          exif:Make exif:Model exif:Software
          exif:ExposureTime exif:FNumber exif:ISO exif:Flash
          exif:FocalLength exif:WhiteBalance
          exif:GPSLatitude exif:GPSLongitude exif:GPSAltitude
        ]

        exif_tags.each do |tag|
          value = image[tag]
          exif[tag.gsub('exif:', '')] = value if value
        end

        exif
      rescue
        {}
      end

      # Colorspace name and ICC profile description (nil when absent);
      # {} on any lookup failure.
      def extract_color_profile(image)
        {
          profile: image['colorspace'],
          icc_profile: image['icc:description']
        }
      rescue
        {}
      end

      # Heuristic transparency check: ImageMagick's matte flag, or the file
      # type containing "png" (PNG supports an alpha channel — this does NOT
      # prove the image actually uses it). false on any failure.
      def has_transparency?(image)
        image['matte'] == 'True' || image.type.downcase.include?('png')
      rescue
        false
      end
    end
  end
end