universal_document_processor 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/README.md +53 -1
- data/lib/universal_document_processor/document.rb +40 -4
- data/lib/universal_document_processor/processors/excel_processor.rb +719 -132
- data/lib/universal_document_processor/processors/word_processor.rb +82 -4
- data/lib/universal_document_processor/utils/file_detector.rb +1 -0
- data/lib/universal_document_processor/version.rb +1 -1
- metadata +15 -3
- data/AI_USAGE_GUIDE.md +0 -404
- data/GEM_RELEASE_GUIDE.md +0 -288
@@ -1,128 +1,67 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'zip'
|
3
|
+
require 'rexml/document'
|
4
|
+
require 'csv'
|
5
|
+
|
1
6
|
module UniversalDocumentProcessor
|
2
7
|
module Processors
|
3
8
|
class ExcelProcessor < BaseProcessor
|
4
9
|
def extract_text
|
5
10
|
with_error_handling do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
row_data = []
|
17
|
-
(workbook.first_column..workbook.last_column).each do |col|
|
18
|
-
cell_value = workbook.cell(row, col)
|
19
|
-
row_data << cell_value.to_s if cell_value
|
20
|
-
end
|
21
|
-
text_content << row_data.join(' | ') unless row_data.all?(&:empty?)
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
text_content << "" # Add blank line between sheets
|
11
|
+
if @file_path.end_with?('.csv')
|
12
|
+
extract_csv_text
|
13
|
+
elsif @file_path.end_with?('.tsv')
|
14
|
+
extract_tsv_text
|
15
|
+
elsif @file_path.end_with?('.xlsx')
|
16
|
+
extract_xlsx_text_builtin
|
17
|
+
elsif @file_path.end_with?('.xls')
|
18
|
+
extract_xls_text_builtin
|
19
|
+
else
|
20
|
+
determine_format_and_extract
|
26
21
|
end
|
27
|
-
|
28
|
-
text_content.join("\n")
|
29
22
|
end
|
30
23
|
end
|
31
24
|
|
32
25
|
def extract_metadata
|
33
26
|
with_error_handling do
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
super.merge({
|
48
|
-
sheet_count: workbook.sheets.length,
|
49
|
-
sheet_names: workbook.sheets,
|
50
|
-
sheet_info: sheet_info,
|
51
|
-
total_rows: sheet_info.values.sum { |info| info[:rows] },
|
52
|
-
total_columns: sheet_info.values.map { |info| info[:columns] }.max || 0,
|
53
|
-
has_formulas: detect_formulas(workbook),
|
54
|
-
has_charts: detect_charts(workbook)
|
55
|
-
})
|
27
|
+
if @file_path.end_with?('.csv')
|
28
|
+
extract_csv_metadata
|
29
|
+
elsif @file_path.end_with?('.tsv')
|
30
|
+
extract_tsv_metadata
|
31
|
+
elsif @file_path.end_with?('.xlsx')
|
32
|
+
extract_xlsx_metadata_builtin
|
33
|
+
elsif @file_path.end_with?('.xls')
|
34
|
+
extract_xls_metadata_builtin
|
35
|
+
else
|
36
|
+
basic_file_metadata
|
37
|
+
end
|
56
38
|
end
|
57
39
|
end
|
58
40
|
|
59
41
|
def extract_tables
|
60
42
|
with_error_handling do
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
columns: workbook.last_column,
|
72
|
-
data: [],
|
73
|
-
headers: []
|
74
|
-
}
|
75
|
-
|
76
|
-
# Extract headers (first row)
|
77
|
-
if workbook.first_row
|
78
|
-
(workbook.first_column..workbook.last_column).each do |col|
|
79
|
-
header = workbook.cell(workbook.first_row, col)
|
80
|
-
table_data[:headers] << (header ? header.to_s : "Column #{col}")
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
# Extract all data
|
85
|
-
(workbook.first_row..workbook.last_row).each do |row|
|
86
|
-
row_data = []
|
87
|
-
(workbook.first_column..workbook.last_column).each do |col|
|
88
|
-
cell_value = workbook.cell(row, col)
|
89
|
-
row_data << (cell_value ? cell_value.to_s : "")
|
90
|
-
end
|
91
|
-
table_data[:data] << row_data
|
92
|
-
end
|
93
|
-
|
94
|
-
tables << table_data
|
43
|
+
if @file_path.end_with?('.csv')
|
44
|
+
extract_csv_tables
|
45
|
+
elsif @file_path.end_with?('.tsv')
|
46
|
+
extract_tsv_tables
|
47
|
+
elsif @file_path.end_with?('.xlsx')
|
48
|
+
extract_xlsx_tables_builtin
|
49
|
+
elsif @file_path.end_with?('.xls')
|
50
|
+
extract_xls_tables_builtin
|
51
|
+
else
|
52
|
+
[]
|
95
53
|
end
|
96
|
-
|
97
|
-
tables
|
98
54
|
end
|
99
55
|
end
|
100
56
|
|
101
57
|
def extract_formulas
|
102
58
|
with_error_handling do
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
next unless workbook.last_row
|
109
|
-
|
110
|
-
(workbook.first_row..workbook.last_row).each do |row|
|
111
|
-
(workbook.first_column..workbook.last_column).each do |col|
|
112
|
-
if workbook.respond_to?(:formula) && workbook.formula(row, col)
|
113
|
-
formulas << {
|
114
|
-
sheet: sheet_name,
|
115
|
-
row: row,
|
116
|
-
column: col,
|
117
|
-
formula: workbook.formula(row, col),
|
118
|
-
value: workbook.cell(row, col)
|
119
|
-
}
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
59
|
+
if @file_path.end_with?('.xlsx')
|
60
|
+
extract_xlsx_formulas_builtin
|
61
|
+
else
|
62
|
+
# .xls, .csv, and .tsv don't support formulas in our built-in implementation
|
63
|
+
[]
|
123
64
|
end
|
124
|
-
|
125
|
-
formulas
|
126
65
|
end
|
127
66
|
end
|
128
67
|
|
@@ -135,65 +74,647 @@ module UniversalDocumentProcessor
|
|
135
74
|
end
|
136
75
|
|
137
76
|
def supported_operations
|
138
|
-
super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables]
|
77
|
+
super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables, :extract_statistics, :extract_cell_formatting, :validate_data, :to_csv, :to_tsv, :to_json]
|
139
78
|
end
|
140
79
|
|
141
80
|
def to_csv(sheet_name = nil)
|
142
81
|
with_error_handling do
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
82
|
+
if @file_path.end_with?('.csv')
|
83
|
+
File.read(@file_path)
|
84
|
+
elsif @file_path.end_with?('.tsv')
|
85
|
+
# Convert TSV to CSV
|
86
|
+
convert_tsv_to_csv(File.read(@file_path))
|
87
|
+
else
|
88
|
+
tables = extract_tables
|
89
|
+
if sheet_name
|
90
|
+
table = tables.find { |t| t[:sheet_name] == sheet_name }
|
91
|
+
return "" unless table
|
92
|
+
convert_table_to_csv(table)
|
93
|
+
else
|
94
|
+
# Convert all sheets to CSV
|
95
|
+
csv_data = {}
|
96
|
+
tables.each do |table|
|
97
|
+
csv_data[table[:sheet_name]] = convert_table_to_csv(table)
|
98
|
+
end
|
99
|
+
csv_data
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def to_tsv(sheet_name = nil)
|
106
|
+
with_error_handling do
|
107
|
+
if @file_path.end_with?('.tsv')
|
108
|
+
File.read(@file_path)
|
109
|
+
elsif @file_path.end_with?('.csv')
|
110
|
+
# Convert CSV to TSV
|
111
|
+
convert_csv_to_tsv(File.read(@file_path))
|
148
112
|
else
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
113
|
+
tables = extract_tables
|
114
|
+
if sheet_name
|
115
|
+
table = tables.find { |t| t[:sheet_name] == sheet_name }
|
116
|
+
return "" unless table
|
117
|
+
convert_table_to_tsv(table)
|
118
|
+
else
|
119
|
+
# Convert all sheets to TSV
|
120
|
+
tsv_data = {}
|
121
|
+
tables.each do |table|
|
122
|
+
tsv_data[table[:sheet_name]] = convert_table_to_tsv(table)
|
123
|
+
end
|
124
|
+
tsv_data
|
154
125
|
end
|
155
|
-
csv_data
|
156
126
|
end
|
157
127
|
end
|
158
128
|
end
|
159
129
|
|
160
130
|
def to_json
|
161
131
|
with_error_handling do
|
162
|
-
|
132
|
+
tables = extract_tables
|
163
133
|
json_data = {}
|
164
134
|
|
165
|
-
|
166
|
-
workbook.sheet(sheet_name)
|
135
|
+
tables.each do |table|
|
167
136
|
sheet_data = []
|
137
|
+
headers = table[:headers] || []
|
168
138
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
headers = []
|
173
|
-
(workbook.first_column..workbook.last_column).each do |col|
|
174
|
-
header = workbook.cell(workbook.first_row, col)
|
175
|
-
headers << (header ? header.to_s : "Column #{col}")
|
176
|
-
end
|
177
|
-
|
178
|
-
# Get data rows
|
179
|
-
((workbook.first_row + 1)..workbook.last_row).each do |row|
|
139
|
+
table[:data].each_with_index do |row, index|
|
140
|
+
next if index == 0 && !headers.empty? # Skip header row if we have headers
|
141
|
+
|
180
142
|
row_hash = {}
|
181
|
-
|
182
|
-
|
183
|
-
row_hash[
|
143
|
+
row.each_with_index do |cell, col_index|
|
144
|
+
header = headers[col_index] || "Column #{col_index + 1}"
|
145
|
+
row_hash[header] = cell
|
184
146
|
end
|
185
147
|
sheet_data << row_hash
|
186
148
|
end
|
187
149
|
|
188
|
-
json_data[sheet_name] = sheet_data
|
150
|
+
json_data[table[:sheet_name]] = sheet_data
|
189
151
|
end
|
190
152
|
|
153
|
+
require 'json'
|
191
154
|
json_data.to_json
|
192
155
|
end
|
193
156
|
end
|
194
157
|
|
158
|
+
# Collects per-sheet statistics for every table returned by extract_tables.
#
# @return [Hash] sheet name => analyze_table_statistics result for that sheet,
#   produced inside with_error_handling
def extract_statistics
  with_error_handling do
    extract_tables.each_with_object({}) do |sheet, by_sheet|
      by_sheet[sheet[:sheet_name]] = analyze_table_statistics(sheet)
    end
  end
end
|
171
|
+
|
172
|
+
# Runs validate_table_data over every table returned by extract_tables.
#
# @return [Hash] sheet name => validation result for that sheet,
#   produced inside with_error_handling
def validate_data
  with_error_handling do
    extract_tables.each_with_object({}) do |sheet, report|
      report[sheet[:sheet_name]] = validate_table_data(sheet)
    end
  end
end
|
185
|
+
|
186
|
+
# Placeholder for cell-formatting extraction.
#
# The built-in parser does not read formatting yet, so this only returns
# a note describing the limitation.
#
# @return [Hash] a single :note entry
def extract_cell_formatting
  with_error_handling do
    message = "Cell formatting extraction requires more detailed Excel parsing - feature planned for future release"
    { note: message }
  end
end
|
195
|
+
|
196
|
+
# Builds a one-shot summary of the document: metadata, statistics,
# validation results, formula count and sheet count.
#
# Metadata is extracted once and reused for :total_sheets.
# Time.current only exists when ActiveSupport is loaded, so plain Ruby
# falls back to Time.now instead of raising NoMethodError.
#
# @return [Hash] with keys :metadata, :statistics, :data_validation,
#   :formulas (count), :total_sheets and :processing_time (String)
def create_summary_report
  with_error_handling do
    metadata = extract_metadata
    timestamp = Time.respond_to?(:current) ? Time.current : Time.now

    {
      metadata: metadata,
      statistics: extract_statistics,
      data_validation: validate_data,
      formulas: extract_formulas.length,
      total_sheets: metadata[:sheet_count],
      processing_time: timestamp.to_s
    }
  end
end
|
208
|
+
|
195
209
|
private
|
196
210
|
|
211
|
+
# CSV Processing Methods
|
212
|
+
# Renders the CSV file at @file_path as plain text, one line per row with
# cells separated by " | ".
#
# The file is read as 'bom|utf-8' so a leading UTF-8 byte-order mark does
# not end up inside the first cell.
#
# @return [String] the rendered text, or an "Error reading CSV: ..." message
#   when the file cannot be read or parsed (best-effort by design)
def extract_csv_text
  content = File.read(@file_path, encoding: 'bom|utf-8')
  CSV.parse(content).map { |row| row.join(' | ') }.join("\n")
rescue => e
  "Error reading CSV: #{e.message}"
end
|
220
|
+
|
221
|
+
# Describes the CSV file at @file_path as a single-sheet workbook.
#
# The file is read as 'bom|utf-8' so a UTF-8 byte-order mark does not
# pollute the first cell. :total_columns is the width of the widest row,
# so ragged files are not under-reported.
#
# @return [Hash] format, size, mtime, sheet info, row/column counts and
#   header detection; on failure, basic_file_metadata plus an :error message
def extract_csv_metadata
  lines = CSV.parse(File.read(@file_path, encoding: 'bom|utf-8'))

  {
    format: 'csv',
    file_size: File.size(@file_path),
    last_modified: File.mtime(@file_path),
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    total_rows: lines.length,
    total_columns: lines.map(&:length).max || 0,
    has_headers: detect_csv_headers(lines),
    encoding: 'UTF-8'
  }
rescue => e
  basic_file_metadata.merge(error: e.message)
end
|
239
|
+
|
240
|
+
# Loads the CSV file at @file_path as a one-element list of table hashes.
#
# The file is read as 'bom|utf-8' to drop a leading byte-order mark.
# :columns is the width of the widest row. :data keeps every parsed row,
# including the header row when one is detected.
#
# @return [Array<Hash>] one table hash (:sheet_name, :rows, :columns,
#   :headers, :data), or [] when the file cannot be read or parsed
def extract_csv_tables
  lines = CSV.parse(File.read(@file_path, encoding: 'bom|utf-8'))

  [{
    sheet_name: 'Sheet1',
    rows: lines.length,
    columns: lines.map(&:length).max || 0,
    headers: detect_csv_headers(lines) ? lines.first : [],
    data: lines
  }]
rescue StandardError
  # Best-effort: callers treat an unreadable file as "no tables".
  []
end
|
256
|
+
|
257
|
+
# Heuristically decides whether the first row of parsed CSV data is a header.
#
# A header is assumed when the first row has at least one cell containing a
# letter and the second row has at least one purely numeric cell. The
# numeric check is anchored to the whole cell (\A...\z) so a multi-line
# text cell containing a numeric line is not mistaken for a number, and it
# accepts a leading minus sign.
#
# @param lines [Array<Array>] parsed rows
# @return [Boolean] true when the first row looks like a header
def detect_csv_headers(lines)
  return false if lines.length < 2

  numeric = /\A-?\d+(\.\d+)?\z/
  first_row, second_row = lines

  first_row.any? { |cell| cell.to_s.match?(/[a-zA-Z]/) } &&
    second_row.any? { |cell| cell.to_s.match?(numeric) }
end
|
267
|
+
|
268
|
+
# TSV Processing Methods
|
269
|
+
# Renders the TSV file at @file_path as plain text, one line per row with
# cells separated by " | ".
#
# The file is read as 'bom|utf-8' so a leading UTF-8 byte-order mark does
# not end up inside the first cell.
#
# @return [String] the rendered text, or an "Error reading TSV: ..." message
#   when the file cannot be read or parsed (best-effort by design)
def extract_tsv_text
  content = File.read(@file_path, encoding: 'bom|utf-8')
  CSV.parse(content, col_sep: "\t").map { |row| row.join(' | ') }.join("\n")
rescue => e
  "Error reading TSV: #{e.message}"
end
|
277
|
+
|
278
|
+
# Describes the TSV file at @file_path as a single-sheet workbook.
#
# The file is read as 'bom|utf-8' so a UTF-8 byte-order mark does not
# pollute the first cell. :total_columns is the width of the widest row.
#
# @return [Hash] format, size, mtime, sheet info, row/column counts, header
#   detection and delimiter; on failure, basic_file_metadata plus :error
def extract_tsv_metadata
  lines = CSV.parse(File.read(@file_path, encoding: 'bom|utf-8'), col_sep: "\t")

  {
    format: 'tsv',
    file_size: File.size(@file_path),
    last_modified: File.mtime(@file_path),
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    total_rows: lines.length,
    total_columns: lines.map(&:length).max || 0,
    has_headers: detect_tsv_headers(lines),
    encoding: 'UTF-8',
    delimiter: 'tab'
  }
rescue => e
  basic_file_metadata.merge(error: e.message)
end
|
297
|
+
|
298
|
+
# Loads the TSV file at @file_path as a one-element list of table hashes.
#
# The file is read as 'bom|utf-8' to drop a leading byte-order mark.
# :columns is the width of the widest row. :data keeps every parsed row,
# including the header row when one is detected.
#
# @return [Array<Hash>] one table hash (:sheet_name, :rows, :columns,
#   :headers, :data), or [] when the file cannot be read or parsed
def extract_tsv_tables
  lines = CSV.parse(File.read(@file_path, encoding: 'bom|utf-8'), col_sep: "\t")

  [{
    sheet_name: 'Sheet1',
    rows: lines.length,
    columns: lines.map(&:length).max || 0,
    headers: detect_tsv_headers(lines) ? lines.first : [],
    data: lines
  }]
rescue StandardError
  # Best-effort: callers treat an unreadable file as "no tables".
  []
end
|
314
|
+
|
315
|
+
# Heuristically decides whether the first row of parsed TSV data is a header.
#
# A header is assumed when the first row has at least one cell containing a
# letter and the second row has at least one purely numeric cell. The
# numeric check is anchored to the whole cell (\A...\z) and accepts a
# leading minus sign.
#
# @param lines [Array<Array>] parsed rows
# @return [Boolean] true when the first row looks like a header
def detect_tsv_headers(lines)
  return false if lines.length < 2

  numeric = /\A-?\d+(\.\d+)?\z/
  first_row, second_row = lines

  first_row.any? { |cell| cell.to_s.match?(/[a-zA-Z]/) } &&
    second_row.any? { |cell| cell.to_s.match?(numeric) }
end
|
325
|
+
|
326
|
+
# XLSX Processing Methods (ZIP-based)
|
327
|
+
# Extracts the text of every worksheet in the XLSX archive at @file_path.
#
# Worksheets are labelled "Sheet1", "Sheet2", ... by position. The entries
# are sorted by the number in their sheetN.xml file name first, so the
# labels do not depend on the order in which the archive stores its entries.
# These labels are positional placeholders, not the workbook's tab names.
#
# @return [String] "=== SheetN ===" sections separated by blank lines, or an
#   "Error reading XLSX file: ..." message on failure (best-effort by design)
def extract_xlsx_text_builtin
  text_content = []

  Zip::File.open(@file_path) do |zip_file|
    shared_strings = extract_shared_strings(zip_file)

    worksheet_files = zip_file.entries
                              .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                              .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheet_files.each_with_index do |worksheet_file, index|
      text_content << "=== Sheet#{index + 1} ==="
      text_content << extract_text_from_worksheet_xml(zip_file.read(worksheet_file), shared_strings)
      text_content << ""
    end
  end

  text_content.join("\n")
rescue => e
  "Error reading XLSX file: #{e.message}"
end
|
352
|
+
|
353
|
+
# Builds metadata for the XLSX archive at @file_path on top of
# basic_file_metadata.
#
# Sheet names are positional placeholders ("Sheet1", "Sheet2", ...), one per
# xl/worksheets/sheetN.xml entry found in the archive.
#
# @return [Hash] basic metadata plus :format, :sheet_count, :sheet_names,
#   :has_formulas and :has_shared_strings; on failure, basic metadata plus
#   an :error message
def extract_xlsx_metadata_builtin
  metadata = basic_file_metadata

  Zip::File.open(@file_path) do |archive|
    sheet_total = archive.entries.count { |item| item.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }

    metadata.update(
      format: 'xlsx',
      sheet_count: sheet_total,
      sheet_names: Array.new(sheet_total) { |position| "Sheet#{position + 1}" },
      has_formulas: detect_xlsx_formulas(archive),
      has_shared_strings: archive.entries.any? { |item| item.name == 'xl/sharedStrings.xml' }
    )
  end

  metadata
rescue => e
  basic_file_metadata.merge(error: e.message)
end
|
372
|
+
|
373
|
+
# Reads every worksheet of the XLSX archive at @file_path into a table hash.
#
# Worksheets are named "Sheet1", "Sheet2", ... by position. The entries are
# sorted by the number in their sheetN.xml file name first, so the names do
# not depend on the order in which the archive stores its entries.
#
# @return [Array<Hash>] one table hash per worksheet (as returned by
#   extract_table_from_worksheet_xml, plus :sheet_name), or [] on failure
def extract_xlsx_tables_builtin
  tables = []

  Zip::File.open(@file_path) do |zip_file|
    shared_strings = extract_shared_strings(zip_file)
    worksheet_files = zip_file.entries
                              .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                              .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheet_files.each_with_index do |worksheet_file, index|
      table_data = extract_table_from_worksheet_xml(zip_file.read(worksheet_file), shared_strings)
      table_data[:sheet_name] = "Sheet#{index + 1}"
      tables << table_data
    end
  end

  tables
rescue StandardError
  # Best-effort: callers treat an unreadable workbook as "no tables".
  []
end
|
394
|
+
|
395
|
+
# Collects the formulas of every worksheet in the XLSX archive at @file_path.
#
# Worksheets are named "Sheet1", "Sheet2", ... by position. The entries are
# sorted by the number in their sheetN.xml file name first, so the names do
# not depend on the order in which the archive stores its entries.
#
# @return [Array<Hash>] formula hashes from extract_formulas_from_worksheet_xml
#   for all sheets, or [] on failure
def extract_xlsx_formulas_builtin
  formulas = []

  Zip::File.open(@file_path) do |zip_file|
    worksheet_files = zip_file.entries
                              .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                              .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheet_files.each_with_index do |worksheet_file, index|
      worksheet_xml = zip_file.read(worksheet_file)
      formulas.concat(extract_formulas_from_worksheet_xml(worksheet_xml, "Sheet#{index + 1}"))
    end
  end

  formulas
rescue StandardError
  # Best-effort: callers treat an unreadable workbook as "no formulas".
  []
end
|
414
|
+
|
415
|
+
# Reads the workbook's shared-string table (xl/sharedStrings.xml).
#
# Each <si> item becomes one String, in document order, so the cell's
# numeric index can be used directly. Plain items use their first <t>;
# rich-text items concatenate all r/t runs. An empty <t/> yields '' rather
# than nil, so every entry is a String.
#
# @param zip_file [#entries, #read] opened archive
# @return [Array<String>] shared strings, or [] when the part is missing or
#   cannot be parsed
def extract_shared_strings(zip_file)
  shared_strings_entry = zip_file.entries.find { |entry| entry.name == 'xl/sharedStrings.xml' }
  return [] unless shared_strings_entry

  doc = REXML::Document.new(zip_file.read(shared_strings_entry))
  shared_strings = []

  doc.elements.each('sst/si') do |si|
    plain = si.get_elements('t')
    shared_strings <<
      if plain.any?
        plain.first.text.to_s
      else
        # Rich text: the string is split across formatted runs.
        si.get_elements('r/t').map(&:text).join
      end
  end

  shared_strings
rescue StandardError
  # Best-effort: cells then fall back to '' for unresolved indexes.
  []
end
|
439
|
+
|
440
|
+
# Renders one worksheet XML part as text: one line per row, cell values
# joined with " | ". Rows whose cells are all empty strings are left out.
#
# @param worksheet_xml [String] raw worksheet XML
# @param shared_strings [Array<String>] shared-string table passed through
#   to extract_cell_value
# @return [String] the rendered rows joined by newlines
def extract_text_from_worksheet_xml(worksheet_xml, shared_strings)
  document = REXML::Document.new(worksheet_xml)
  rendered = []

  document.elements.each('worksheet/sheetData/row') do |row_node|
    values = row_node.get_elements('c').map { |cell_node| extract_cell_value(cell_node, shared_strings) }
    next if values.all?(&:empty?)

    rendered << values.join(' | ')
  end

  rendered.join("\n")
end
|
455
|
+
|
456
|
+
# Converts one worksheet XML part into a rectangular table.
#
# Worksheet XML may omit empty cells, so each cell is placed by the column
# letters of its "r" reference (e.g. "C2" -> third column) instead of by
# its position among the row's <c> children. Cells without a reference are
# appended after the last placed cell. Every row is padded with '' to the
# width of the widest row. Rows absent from the XML are not re-created.
#
# @param worksheet_xml [String] raw worksheet XML
# @param shared_strings [Array<String>] shared-string table passed through
#   to extract_cell_value
# @return [Hash] :rows (count), :columns (width), :headers (first row or []),
#   :data (Array of row Arrays, header row included)
def extract_table_from_worksheet_xml(worksheet_xml, shared_strings)
  doc = REXML::Document.new(worksheet_xml)
  data = []

  doc.elements.each('worksheet/sheetData/row') do |row|
    row_data = []
    row.elements.each('c') do |cell|
      cell_value = extract_cell_value(cell, shared_strings)
      letters = cell.attributes['r'].to_s[/\A[A-Za-z]+/]

      if letters
        # Base-26 column letters ("A" => 0, "Z" => 25, "AA" => 26, ...).
        column_index = letters.upcase.each_char.reduce(0) { |acc, ch| acc * 26 + (ch.ord - 64) } - 1
        row_data[column_index] = cell_value
      else
        row_data << cell_value
      end
    end
    # Gaps left by skipped columns are nil; expose them as empty cells.
    data << row_data.map { |value| value || '' }
  end

  max_columns = data.map(&:length).max || 0
  data.each { |row| row.fill('', row.length...max_columns) }

  {
    rows: data.length,
    columns: max_columns,
    headers: data.first || [],
    data: data
  }
end
|
483
|
+
|
484
|
+
# Lists the formulas found in one worksheet XML part.
#
# Only cells whose <f> element has text are reported; <f> elements without
# text are skipped. The cached value is resolved with an empty shared-string
# table, so a result stored as a shared-string index comes back as ''.
#
# @param worksheet_xml [String] raw worksheet XML
# @param sheet_name [String] label stored in each result's :sheet
# @return [Array<Hash>] hashes with :sheet, :cell (reference such as "B2"),
#   :formula (text of <f>) and :value (cached result)
def extract_formulas_from_worksheet_xml(worksheet_xml, sheet_name)
  doc = REXML::Document.new(worksheet_xml)
  formulas = []

  doc.elements.each('worksheet/sheetData/row') do |row|
    row.elements.each('c') do |cell|
      formula_text = cell.elements['f']&.text
      next unless formula_text

      formulas << {
        sheet: sheet_name,
        cell: cell.attributes['r'],
        formula: formula_text,
        value: extract_cell_value(cell, [])
      }
    end
  end

  formulas
end
|
508
|
+
|
509
|
+
# Returns the display value of a worksheet <c> element as a String.
#
# The cell's "t" attribute selects the interpretation:
#   "s"         - <v> is an index into shared_strings ('' when out of range)
#   "inlineStr" - the text is in <is><t> (or <is><r><t> runs); no <v> exists
#   "str"       - <v> is the string itself
#   "b"         - <v> of "1" is 'TRUE', anything else 'FALSE'
#   otherwise   - <v> is returned unchanged (numbers, dates, errors)
#
# @param cell [REXML::Element] the <c> element
# @param shared_strings [Array<String>] shared-string table
# @return [String] the value, or '' when the cell carries no value
def extract_cell_value(cell, shared_strings)
  cell_type = cell.attributes['t']

  if cell_type == 'inlineStr'
    inline = cell.elements['is']
    return '' unless inline

    runs = inline.get_elements('t')
    runs = inline.get_elements('r/t') if runs.empty?
    return runs.map { |run| run.text.to_s }.join
  end

  raw = cell.elements['v']&.text
  return '' unless raw

  case cell_type
  when 's' then shared_strings[raw.to_i] || ''
  when 'b' then raw == '1' ? 'TRUE' : 'FALSE'
  else raw # 'str', numbers, dates and anything else are passed through
  end
end
|
527
|
+
|
528
|
+
# Reports whether any worksheet part in the archive contains a formula.
#
# Looks for an <f> element in the raw XML of each xl/worksheets/sheetN.xml
# entry. The match accepts attributes and self-closing tags, so
# <f t="shared" ...> is detected as well as a bare <f>.
#
# @param zip_file [#entries, #read] opened archive
# @return [Boolean] true when at least one worksheet has an <f> element
def detect_xlsx_formulas(zip_file)
  formula_tag = %r{<f[\s>/]}

  zip_file.entries.any? do |entry|
    entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) && zip_file.read(entry).match?(formula_tag)
  end
end
|
536
|
+
|
537
|
+
# XLS Processing Methods (Binary format - basic implementation)
|
538
|
+
# Simplified text extraction for legacy binary .xls files.
#
# The file is not parsed as a workbook: it is read as raw bytes and every
# distinct run of three or more printable ASCII characters is returned,
# one per line, under a fixed heading.
#
# @return [String] heading plus extracted runs, a fixed notice when no run
#   is found, or an "Error reading XLS file: ..." message on failure
def extract_xls_text_builtin
  raw_bytes = File.binread(@file_path)
  fragments = raw_bytes.scan(/[\x20-\x7E]{3,}/).uniq

  unless fragments.any?
    return "XLS file detected but no readable text extracted. Consider converting to XLSX format for better support."
  end

  "=== XLS Content (Basic Extraction) ===\n" + fragments.join("\n")
rescue => e
  "Error reading XLS file: #{e.message}"
end
|
554
|
+
|
555
|
+
# Metadata for legacy .xls files: basic_file_metadata plus fixed
# placeholder values, since the binary format is not parsed.
#
# @return [Hash] basic metadata with :format 'xls', one placeholder sheet
#   and a :note about the limited support
def extract_xls_metadata_builtin
  placeholder = {
    format: 'xls',
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    note: 'XLS format has limited built-in support. Consider converting to XLSX for full functionality.'
  }

  basic_file_metadata.merge(placeholder)
end
|
563
|
+
|
564
|
+
# Table placeholder for legacy .xls files. No data is extracted; the single
# empty table carries a :note explaining the limitation.
#
# @return [Array<Hash>] one empty table hash named 'Sheet1'
def extract_xls_tables_builtin
  empty_sheet = {
    sheet_name: 'Sheet1',
    rows: 0,
    columns: 0,
    headers: [],
    data: [],
    note: 'XLS format has limited built-in support. Consider converting to XLSX for full functionality.'
  }

  [empty_sheet]
end
|
574
|
+
|
575
|
+
# Helper Methods
|
576
|
+
# Fallback text extraction for files whose extension is not recognised.
#
# Sniffs the first four bytes: the ZIP local-file signature ("PK\x03\x04")
# selects the XLSX path, anything else goes to the basic XLS extractor.
#
# @return [String] result of the chosen extractor
def determine_format_and_extract
  signature = File.binread(@file_path, 4)
  return extract_xlsx_text_builtin if signature == "PK\x03\x04"

  extract_xls_text_builtin
end
|
584
|
+
|
585
|
+
# File-system level metadata for @file_path, used as the base (and the
# fallback) for the format-specific metadata methods.
#
# NOTE(review): :created is File.ctime, which is not the creation time on
# every platform - confirm against the Ruby File.ctime documentation.
#
# @return [Hash] :file_size, :last_modified, :created, :format (lower-case
#   extension without the dot) and :encoding ('Unknown')
def basic_file_metadata
  path = @file_path

  {
    file_size: File.size(path),
    last_modified: File.mtime(path),
    created: File.ctime(path),
    format: File.extname(path).downcase.delete('.'),
    encoding: 'Unknown'
  }
end
|
594
|
+
|
595
|
+
# Serialises a table hash's rows as comma-separated text.
#
# @param table [Hash] table hash; only table[:data] (Array of row Arrays)
#   is read
# @return [String] CSV text, one line per row
def convert_table_to_csv(table)
  CSV.generate do |out|
    table[:data].each { |fields| out << fields }
  end
end
|
604
|
+
|
605
|
+
# Serialises a table hash's rows as tab-separated text.
#
# @param table [Hash] table hash; only table[:data] (Array of row Arrays)
#   is read
# @return [String] TSV text, one line per row
def convert_table_to_tsv(table)
  CSV.generate(col_sep: "\t") do |out|
    table[:data].each { |fields| out << fields }
  end
end
|
614
|
+
|
615
|
+
# Re-encodes comma-separated text as tab-separated text.
#
# @param csv_content [String] CSV text
# @return [String] the same rows written with a tab delimiter
def convert_csv_to_tsv(csv_content)
  parsed_rows = CSV.parse(csv_content)

  CSV.generate(col_sep: "\t") do |out|
    parsed_rows.each { |fields| out << fields }
  end
end
|
625
|
+
|
626
|
+
# Re-encodes tab-separated text as comma-separated text.
#
# @param tsv_content [String] TSV text
# @return [String] the same rows written with a comma delimiter
def convert_tsv_to_csv(tsv_content)
  parsed_rows = CSV.parse(tsv_content, col_sep: "\t")

  CSV.generate do |out|
    parsed_rows.each { |fields| out << fields }
  end
end
|
636
|
+
|
637
|
+
# Computes cell-level statistics for one table hash.
#
# Every cell in table[:data] is classified as empty (nil or blank), numeric
# (optionally signed integer or decimal, matched against the whole cell) or
# text. When numeric cells exist, min/max/average/median are added.
# :total_cells is table[:rows] * table[:columns], so it can exceed the
# number of classified cells when rows are ragged.
#
# @param table [Hash] table hash with :data, :rows and :columns
# @return [Hash] {} for an empty table; otherwise :total_cells,
#   :empty_cells, :numeric_cells, :text_cells, :numeric_values (Floats) and,
#   when applicable, :min_value, :max_value, :average_value, :median_value
def analyze_table_statistics(table)
  return {} if table[:data].empty?

  # \A...\z anchors the whole cell; ^...$ would match any single line.
  numeric_pattern = /\A-?\d+(\.\d+)?\z/

  stats = {
    total_cells: table[:rows] * table[:columns],
    empty_cells: 0,
    numeric_cells: 0,
    text_cells: 0,
    numeric_values: []
  }

  table[:data].each do |row|
    row.each do |cell|
      text = cell.to_s
      if text.strip.empty?
        stats[:empty_cells] += 1
      elsif text.match?(numeric_pattern)
        stats[:numeric_cells] += 1
        stats[:numeric_values] << text.to_f
      else
        stats[:text_cells] += 1
      end
    end
  end

  values = stats[:numeric_values]
  if values.any?
    stats[:min_value] = values.min
    stats[:max_value] = values.max
    stats[:average_value] = values.sum / values.length.to_f
    stats[:median_value] = calculate_median(values)
  end

  stats
end
|
671
|
+
|
672
|
+
# Computes simple data-quality figures for one table hash.
#
# A row is "empty" when every cell is nil or blank. A row is a "duplicate"
# when an earlier non-empty row has the same cells (compared as strings,
# cell by cell, so ["a|b", "c"] and ["a", "b|c"] are different rows).
# Empty rows are counted once, as empty, and are not also counted as
# duplicates of each other.
#
# @param table [Hash] table hash with :data and :rows
# @return [Hash] {} for an empty table; otherwise :total_rows, :empty_rows,
#   :duplicate_rows and :data_quality_score (0-100, share of rows that are
#   neither empty nor duplicates)
def validate_table_data(table)
  return {} if table[:data].empty?

  validation = {
    total_rows: table[:rows],
    empty_rows: 0,
    duplicate_rows: 0,
    data_quality_score: 0
  }

  seen_rows = Set.new

  table[:data].each do |row|
    if row.all? { |cell| cell.to_s.strip.empty? }
      validation[:empty_rows] += 1
      next
    end

    # Set#add? returns nil when the key is already present.
    validation[:duplicate_rows] += 1 unless seen_rows.add?(row.map(&:to_s))
  end

  total_rows = table[:rows]
  if total_rows > 0
    good_rows = total_rows - validation[:empty_rows] - validation[:duplicate_rows]
    quality_score = (good_rows / total_rows.to_f) * 100
    validation[:data_quality_score] = [quality_score.round(2), 0].max
  end

  validation
end
|
706
|
+
|
707
|
+
# Returns the median of a list of numbers.
#
# For an even count the two middle values are averaged. An empty list has
# no median and returns nil instead of raising.
#
# @param values [Array<Numeric>] numbers in any order (not modified)
# @return [Numeric, nil] the median, or nil for an empty list
def calculate_median(values)
  return nil if values.empty?

  sorted = values.sort
  mid = sorted.length / 2

  sorted.length.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2.0
end
|
717
|
+
|
197
718
|
def detect_formulas(workbook)
|
198
719
|
workbook.sheets.any? do |sheet_name|
|
199
720
|
workbook.sheet(sheet_name)
|
@@ -214,6 +735,72 @@ module UniversalDocumentProcessor
|
|
214
735
|
# This is a placeholder for future implementation
|
215
736
|
false
|
216
737
|
end
|
738
|
+
|
739
|
+
# Returns the median of a list of numbers.
#
# NOTE: calculate_median is defined twice in this class; this later
# definition is the one in effect once the class body has been evaluated.
#
# For an even count the two middle values are averaged. An empty list has
# no median and returns nil instead of raising.
#
# @param values [Array<Numeric>] numbers in any order (not modified)
# @return [Numeric, nil] the median, or nil for an empty list
def calculate_median(values)
  return nil if values.empty?

  sorted = values.sort
  length = sorted.length

  if length.odd?
    sorted[length / 2]
  else
    (sorted[length / 2 - 1] + sorted[length / 2]) / 2.0
  end
end
|
748
|
+
|
749
|
+
# Guesses whether the first row of the workbook's current sheet is a header.
#
# The first row must be mostly String cells (more than 50%) and the row
# below it must contain Numeric cells in more than 30% of the columns.
#
# @param workbook [#first_row, #last_row, #first_column, #last_column, #cell]
#   workbook-like object exposing those readers
# @return [Boolean] true when a header row is likely
def detect_headers(workbook)
  return false unless workbook.last_row && workbook.last_row > 1

  columns = (workbook.first_column..workbook.last_column).to_a
  text_hits = 0
  numeric_hits = 0

  columns.each do |col|
    text_hits += 1 if workbook.cell(workbook.first_row, col).is_a?(String)
    numeric_hits += 1 if workbook.cell(workbook.first_row + 1, col).is_a?(Numeric)
  end

  # With zero columns both ratios are NaN and every comparison is false.
  text_ratio = text_hits.to_f / columns.length
  numeric_ratio = numeric_hits.to_f / columns.length

  text_ratio > 0.5 && numeric_ratio > 0.3
end
|
770
|
+
|
771
|
+
# Classifies every column of the workbook's current sheet by cell type.
#
# Each cell is counted as :empty (nil or blank), :numeric, :date (Date or
# Time) or :text. Per column the result holds the most frequent type and
# the percentage share of each type, rounded to one decimal.
#
# @param workbook [#first_row, #last_row, #first_column, #last_column, #cell]
#   workbook-like object exposing those readers
# @return [Hash] "Column N" => { predominant_type:, type_distribution: },
#   or {} when the sheet has no last row
def analyze_column_types(workbook)
  return {} unless workbook.last_row

  (workbook.first_column..workbook.last_column).each_with_object({}) do |col, summary|
    counts = { numeric: 0, text: 0, date: 0, empty: 0 }
    row_span = workbook.last_row - workbook.first_row + 1

    (workbook.first_row..workbook.last_row).each do |row|
      value = workbook.cell(row, col)
      bucket =
        if value.nil? || value.to_s.strip.empty?
          :empty
        elsif value.is_a?(Numeric)
          :numeric
        elsif value.is_a?(Date) || value.is_a?(Time)
          :date
        else
          :text
        end
      counts[bucket] += 1
    end

    dominant, = counts.max_by { |_kind, tally| tally }
    summary["Column #{col}"] = {
      predominant_type: dominant,
      type_distribution: counts.transform_values { |tally| (tally.to_f / row_span * 100).round(1) }
    }
  end
end
|
217
804
|
end
|
218
805
|
end
|
219
806
|
end
|