universal_document_processor 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,128 +1,67 @@
1
+ require 'set'
2
+ require 'zip'
3
+ require 'rexml/document'
4
+ require 'csv'
5
+
1
6
  module UniversalDocumentProcessor
2
7
  module Processors
3
8
  class ExcelProcessor < BaseProcessor
4
9
  def extract_text
5
10
  with_error_handling do
6
- workbook = Roo::Spreadsheet.open(@file_path)
7
- text_content = []
8
-
9
- workbook.sheets.each do |sheet_name|
10
- workbook.sheet(sheet_name)
11
- text_content << "=== Sheet: #{sheet_name} ==="
12
-
13
- # Get all rows with data
14
- if workbook.last_row
15
- (workbook.first_row..workbook.last_row).each do |row|
16
- row_data = []
17
- (workbook.first_column..workbook.last_column).each do |col|
18
- cell_value = workbook.cell(row, col)
19
- row_data << cell_value.to_s if cell_value
20
- end
21
- text_content << row_data.join(' | ') unless row_data.all?(&:empty?)
22
- end
23
- end
24
-
25
- text_content << "" # Add blank line between sheets
11
+ if @file_path.end_with?('.csv')
12
+ extract_csv_text
13
+ elsif @file_path.end_with?('.tsv')
14
+ extract_tsv_text
15
+ elsif @file_path.end_with?('.xlsx')
16
+ extract_xlsx_text_builtin
17
+ elsif @file_path.end_with?('.xls')
18
+ extract_xls_text_builtin
19
+ else
20
+ determine_format_and_extract
26
21
  end
27
-
28
- text_content.join("\n")
29
22
  end
30
23
  end
31
24
 
32
25
  def extract_metadata
33
26
  with_error_handling do
34
- workbook = Roo::Spreadsheet.open(@file_path)
35
-
36
- sheet_info = {}
37
- workbook.sheets.each do |sheet_name|
38
- workbook.sheet(sheet_name)
39
- sheet_info[sheet_name] = {
40
- rows: workbook.last_row || 0,
41
- columns: workbook.last_column || 0,
42
- first_row: workbook.first_row || 0,
43
- first_column: workbook.first_column || 0
44
- }
45
- end
46
-
47
- super.merge({
48
- sheet_count: workbook.sheets.length,
49
- sheet_names: workbook.sheets,
50
- sheet_info: sheet_info,
51
- total_rows: sheet_info.values.sum { |info| info[:rows] },
52
- total_columns: sheet_info.values.map { |info| info[:columns] }.max || 0,
53
- has_formulas: detect_formulas(workbook),
54
- has_charts: detect_charts(workbook)
55
- })
27
+ if @file_path.end_with?('.csv')
28
+ extract_csv_metadata
29
+ elsif @file_path.end_with?('.tsv')
30
+ extract_tsv_metadata
31
+ elsif @file_path.end_with?('.xlsx')
32
+ extract_xlsx_metadata_builtin
33
+ elsif @file_path.end_with?('.xls')
34
+ extract_xls_metadata_builtin
35
+ else
36
+ basic_file_metadata
37
+ end
56
38
  end
57
39
  end
58
40
 
59
41
  def extract_tables
60
42
  with_error_handling do
61
- workbook = Roo::Spreadsheet.open(@file_path)
62
- tables = []
63
-
64
- workbook.sheets.each do |sheet_name|
65
- workbook.sheet(sheet_name)
66
- next unless workbook.last_row
67
-
68
- table_data = {
69
- sheet_name: sheet_name,
70
- rows: workbook.last_row,
71
- columns: workbook.last_column,
72
- data: [],
73
- headers: []
74
- }
75
-
76
- # Extract headers (first row)
77
- if workbook.first_row
78
- (workbook.first_column..workbook.last_column).each do |col|
79
- header = workbook.cell(workbook.first_row, col)
80
- table_data[:headers] << (header ? header.to_s : "Column #{col}")
81
- end
82
- end
83
-
84
- # Extract all data
85
- (workbook.first_row..workbook.last_row).each do |row|
86
- row_data = []
87
- (workbook.first_column..workbook.last_column).each do |col|
88
- cell_value = workbook.cell(row, col)
89
- row_data << (cell_value ? cell_value.to_s : "")
90
- end
91
- table_data[:data] << row_data
92
- end
93
-
94
- tables << table_data
43
+ if @file_path.end_with?('.csv')
44
+ extract_csv_tables
45
+ elsif @file_path.end_with?('.tsv')
46
+ extract_tsv_tables
47
+ elsif @file_path.end_with?('.xlsx')
48
+ extract_xlsx_tables_builtin
49
+ elsif @file_path.end_with?('.xls')
50
+ extract_xls_tables_builtin
51
+ else
52
+ []
95
53
  end
96
-
97
- tables
98
54
  end
99
55
  end
100
56
 
101
57
  def extract_formulas
102
58
  with_error_handling do
103
- workbook = Roo::Spreadsheet.open(@file_path)
104
- formulas = []
105
-
106
- workbook.sheets.each do |sheet_name|
107
- workbook.sheet(sheet_name)
108
- next unless workbook.last_row
109
-
110
- (workbook.first_row..workbook.last_row).each do |row|
111
- (workbook.first_column..workbook.last_column).each do |col|
112
- if workbook.respond_to?(:formula) && workbook.formula(row, col)
113
- formulas << {
114
- sheet: sheet_name,
115
- row: row,
116
- column: col,
117
- formula: workbook.formula(row, col),
118
- value: workbook.cell(row, col)
119
- }
120
- end
121
- end
122
- end
59
+ if @file_path.end_with?('.xlsx')
60
+ extract_xlsx_formulas_builtin
61
+ else
62
+ # .xls, .csv, and .tsv don't support formulas in our built-in implementation
63
+ []
123
64
  end
124
-
125
- formulas
126
65
  end
127
66
  end
128
67
 
@@ -135,65 +74,647 @@ module UniversalDocumentProcessor
135
74
  end
136
75
 
137
76
  def supported_operations
138
- super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables]
77
+ super + [:extract_tables, :extract_formulas, :extract_charts, :extract_pivot_tables, :extract_statistics, :extract_cell_formatting, :validate_data, :to_csv, :to_tsv, :to_json]
139
78
  end
140
79
 
141
80
  def to_csv(sheet_name = nil)
142
81
  with_error_handling do
143
- workbook = Roo::Spreadsheet.open(@file_path)
144
-
145
- if sheet_name
146
- workbook.sheet(sheet_name)
147
- workbook.to_csv
82
+ if @file_path.end_with?('.csv')
83
+ File.read(@file_path)
84
+ elsif @file_path.end_with?('.tsv')
85
+ # Convert TSV to CSV
86
+ convert_tsv_to_csv(File.read(@file_path))
87
+ else
88
+ tables = extract_tables
89
+ if sheet_name
90
+ table = tables.find { |t| t[:sheet_name] == sheet_name }
91
+ return "" unless table
92
+ convert_table_to_csv(table)
93
+ else
94
+ # Convert all sheets to CSV
95
+ csv_data = {}
96
+ tables.each do |table|
97
+ csv_data[table[:sheet_name]] = convert_table_to_csv(table)
98
+ end
99
+ csv_data
100
+ end
101
+ end
102
+ end
103
+ end
104
+
105
# Renders the file at @file_path as tab-separated text.
#
# * .tsv source: the file content is returned unchanged.
# * .csv source: the content is re-encoded via convert_csv_to_tsv.
# * anything else: tables come from extract_tables; with a sheet_name the
#   matching table is converted ("" when no table has that name),
#   otherwise a Hash of sheet name => TSV text is built.
#
# Extensions are compared case-insensitively. Runs inside
# with_error_handling (defined outside this view).
#
# @param sheet_name [String, nil] sheet to export, or nil for all sheets
def to_tsv(sheet_name = nil)
  with_error_handling do
    case File.extname(@file_path).downcase
    when '.tsv'
      File.read(@file_path)
    when '.csv'
      convert_csv_to_tsv(File.read(@file_path))
    else
      tables = extract_tables
      if sheet_name
        table = tables.find { |candidate| candidate[:sheet_name] == sheet_name }
        return '' unless table

        convert_table_to_tsv(table)
      else
        tables.each_with_object({}) do |sheet, tsv_by_sheet|
          tsv_by_sheet[sheet[:sheet_name]] = convert_table_to_tsv(sheet)
        end
      end
    end
  end
end
159
129
 
160
130
  def to_json
161
131
  with_error_handling do
162
- workbook = Roo::Spreadsheet.open(@file_path)
132
+ tables = extract_tables
163
133
  json_data = {}
164
134
 
165
- workbook.sheets.each do |sheet_name|
166
- workbook.sheet(sheet_name)
135
+ tables.each do |table|
167
136
  sheet_data = []
137
+ headers = table[:headers] || []
168
138
 
169
- next unless workbook.last_row
170
-
171
- # Get headers
172
- headers = []
173
- (workbook.first_column..workbook.last_column).each do |col|
174
- header = workbook.cell(workbook.first_row, col)
175
- headers << (header ? header.to_s : "Column #{col}")
176
- end
177
-
178
- # Get data rows
179
- ((workbook.first_row + 1)..workbook.last_row).each do |row|
139
+ table[:data].each_with_index do |row, index|
140
+ next if index == 0 && !headers.empty? # Skip header row if we have headers
141
+
180
142
  row_hash = {}
181
- (workbook.first_column..workbook.last_column).each_with_index do |col, index|
182
- cell_value = workbook.cell(row, col)
183
- row_hash[headers[index]] = cell_value
143
+ row.each_with_index do |cell, col_index|
144
+ header = headers[col_index] || "Column #{col_index + 1}"
145
+ row_hash[header] = cell
184
146
  end
185
147
  sheet_data << row_hash
186
148
  end
187
149
 
188
- json_data[sheet_name] = sheet_data
150
+ json_data[table[:sheet_name]] = sheet_data
189
151
  end
190
152
 
153
+ require 'json'
191
154
  json_data.to_json
192
155
  end
193
156
  end
194
157
 
158
# Builds a Hash mapping each sheet name to the result of
# analyze_table_statistics for that sheet's table. Runs inside
# with_error_handling (defined outside this view).
def extract_statistics
  with_error_handling do
    extract_tables.each_with_object({}) do |table, per_sheet|
      per_sheet[table[:sheet_name]] = analyze_table_statistics(table)
    end
  end
end
171
+
172
# Builds a Hash mapping each sheet name to the result of
# validate_table_data for that sheet's table. Runs inside
# with_error_handling (defined outside this view).
def validate_data
  with_error_handling do
    results_by_sheet = {}
    extract_tables.each do |sheet|
      report = validate_table_data(sheet)
      results_by_sheet.store(sheet[:sheet_name], report)
    end
    results_by_sheet
  end
end
185
+
186
# Placeholder: no cell formatting is parsed yet. Returns a single-key Hash
# whose :note explains that the feature is planned. Runs inside
# with_error_handling (defined outside this view).
def extract_cell_formatting
  with_error_handling do
    explanation = "Cell formatting extraction requires more detailed Excel parsing - feature planned for future release"
    { note: explanation }
  end
end
195
+
196
# Assembles a one-shot summary of the file: metadata, per-sheet statistics,
# per-sheet validation results, the number of formulas, the sheet count
# taken from the metadata, and a timestamp string.
#
# extract_metadata is evaluated once and reused for :total_sheets.
# Time.current exists only when ActiveSupport is loaded, so the timestamp
# falls back to Time.now in plain Ruby. Runs inside with_error_handling
# (defined outside this view).
def create_summary_report
  with_error_handling do
    metadata = extract_metadata
    generated_at = Time.respond_to?(:current) ? Time.current : Time.now

    {
      metadata: metadata,
      statistics: extract_statistics,
      data_validation: validate_data,
      formulas: extract_formulas.length,
      total_sheets: metadata[:sheet_count],
      processing_time: generated_at.to_s
    }
  end
end
208
+
195
209
  private
196
210
 
211
+ # CSV Processing Methods
212
# Renders the CSV file at @file_path as text: one line per row, cells
# joined with " | ".
#
# The file is read as UTF-8 with a leading byte-order mark stripped;
# otherwise the BOM stays glued to the first field and a quoted first
# field makes CSV.parse fail. Any error is reported as an
# "Error reading CSV: ..." string instead of being raised.
def extract_csv_text
  content = File.read(@file_path, encoding: 'bom|utf-8')
  CSV.parse(content).map { |row| row.join(' | ') }.join("\n")
rescue StandardError => e
  "Error reading CSV: #{e.message}"
end
220
+
221
# Describes the CSV file at @file_path: size, mtime, a single synthetic
# sheet named "Sheet1", row count, column count of the first row, and the
# header guess from detect_csv_headers.
#
# The file is read as UTF-8 with a leading byte-order mark stripped so a
# BOM-prefixed, quoted first field still parses. On any error the result
# is basic_file_metadata plus an :error message.
def extract_csv_metadata
  content = File.read(@file_path, encoding: 'bom|utf-8')
  lines = CSV.parse(content)

  {
    format: 'csv',
    file_size: File.size(@file_path),
    last_modified: File.mtime(@file_path),
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    total_rows: lines.length,
    total_columns: lines.first&.length || 0,
    has_headers: detect_csv_headers(lines),
    encoding: 'UTF-8'
  }
rescue StandardError => e
  basic_file_metadata.merge(error: e.message)
end
239
+
240
# Returns the CSV file at @file_path as a one-element array holding a
# single table Hash (sheet "Sheet1"). :data contains every parsed row,
# including the header row; :headers is the first row when
# detect_csv_headers says so, otherwise empty.
#
# The file is read as UTF-8 with a leading byte-order mark stripped so the
# BOM neither corrupts the first cell nor breaks a quoted first field.
# Any error results in an empty array.
def extract_csv_tables
  content = File.read(@file_path, encoding: 'bom|utf-8')
  lines = CSV.parse(content)
  headers = detect_csv_headers(lines) ? lines.first : []

  [{
    sheet_name: 'Sheet1',
    rows: lines.length,
    columns: lines.first&.length || 0,
    headers: headers,
    data: lines
  }]
rescue StandardError
  []
end
256
+
257
# Heuristically decides whether the first parsed row is a header row.
#
# True when the first row has at least one cell containing a letter and
# the second row has at least one cell that is entirely an unsigned
# integer or decimal. The numeric test is anchored to the whole cell
# (\A...\z); line anchors would let a multi-line cell such as "note\n42"
# count as numeric.
#
# @param lines [Array<Array>] parsed rows
# @return [Boolean] false when there are fewer than two rows
def detect_csv_headers(lines)
  return false if lines.length < 2

  first_row, second_row = lines

  first_row.any? { |cell| cell.to_s.match?(/[a-zA-Z]/) } &&
    second_row.any? { |cell| cell.to_s.match?(/\A\d+(\.\d+)?\z/) }
end
267
+
268
+ # TSV Processing Methods
269
# Renders the tab-separated file at @file_path as text: one line per row,
# cells joined with " | ".
#
# The file is read as UTF-8 with a leading byte-order mark stripped;
# otherwise the BOM stays glued to the first field and a quoted first
# field makes parsing fail. Any error is reported as an
# "Error reading TSV: ..." string instead of being raised.
def extract_tsv_text
  content = File.read(@file_path, encoding: 'bom|utf-8')
  CSV.parse(content, col_sep: "\t").map { |row| row.join(' | ') }.join("\n")
rescue StandardError => e
  "Error reading TSV: #{e.message}"
end
277
+
278
# Describes the tab-separated file at @file_path: size, mtime, a single
# synthetic sheet named "Sheet1", row count, column count of the first
# row, the header guess from detect_tsv_headers, and delimiter 'tab'.
#
# The file is read as UTF-8 with a leading byte-order mark stripped so a
# BOM-prefixed, quoted first field still parses. On any error the result
# is basic_file_metadata plus an :error message.
def extract_tsv_metadata
  content = File.read(@file_path, encoding: 'bom|utf-8')
  lines = CSV.parse(content, col_sep: "\t")

  {
    format: 'tsv',
    file_size: File.size(@file_path),
    last_modified: File.mtime(@file_path),
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    total_rows: lines.length,
    total_columns: lines.first&.length || 0,
    has_headers: detect_tsv_headers(lines),
    encoding: 'UTF-8',
    delimiter: 'tab'
  }
rescue StandardError => e
  basic_file_metadata.merge(error: e.message)
end
297
+
298
# Returns the tab-separated file at @file_path as a one-element array
# holding a single table Hash (sheet "Sheet1"). :data contains every
# parsed row, including the header row; :headers is the first row when
# detect_tsv_headers says so, otherwise empty.
#
# The file is read as UTF-8 with a leading byte-order mark stripped so the
# BOM neither corrupts the first cell nor breaks a quoted first field.
# Any error results in an empty array.
def extract_tsv_tables
  content = File.read(@file_path, encoding: 'bom|utf-8')
  lines = CSV.parse(content, col_sep: "\t")
  headers = detect_tsv_headers(lines) ? lines.first : []

  [{
    sheet_name: 'Sheet1',
    rows: lines.length,
    columns: lines.first&.length || 0,
    headers: headers,
    data: lines
  }]
rescue StandardError
  []
end
314
+
315
# Heuristically decides whether the first parsed row is a header row.
#
# True when the first row has at least one cell containing a letter and
# the second row has at least one cell that is entirely an unsigned
# integer or decimal. The numeric test is anchored to the whole cell
# (\A...\z); line anchors would let a multi-line cell such as "note\n42"
# count as numeric.
#
# @param lines [Array<Array>] parsed rows
# @return [Boolean] false when there are fewer than two rows
def detect_tsv_headers(lines)
  return false if lines.length < 2

  first_row, second_row = lines

  first_row.any? { |cell| cell.to_s.match?(/[a-zA-Z]/) } &&
    second_row.any? { |cell| cell.to_s.match?(/\A\d+(\.\d+)?\z/) }
end
325
+
326
+ # XLSX Processing Methods (ZIP-based)
327
# Renders every worksheet of the XLSX archive at @file_path as text.
#
# Each worksheet contributes a "=== SheetN ===" heading, the text produced
# by extract_text_from_worksheet_xml, and a blank separator line. Sheet
# names are synthesized from position, so worksheet entries are first
# sorted by the number in their file name; archive order alone could put
# sheet10.xml ahead of sheet2.xml and mislabel the output.
#
# Any error is reported as an "Error reading XLSX file: ..." string.
def extract_xlsx_text_builtin
  sections = []

  Zip::File.open(@file_path) do |zip_file|
    shared_strings = extract_shared_strings(zip_file)
    worksheets = zip_file.entries
                         .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                         .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheets.each_with_index do |worksheet, index|
      sections << "=== Sheet#{index + 1} ==="
      sections << extract_text_from_worksheet_xml(zip_file.read(worksheet), shared_strings)
      sections << ''
    end
  end

  sections.join("\n")
rescue StandardError => e
  "Error reading XLSX file: #{e.message}"
end
352
+
353
# Describes the XLSX archive at @file_path. Starts from
# basic_file_metadata and adds the format, the number of worksheet
# entries, synthesized "SheetN" names, whether detect_xlsx_formulas finds
# formulas, and whether xl/sharedStrings.xml is present. On any error the
# result is a fresh basic_file_metadata plus an :error message.
def extract_xlsx_metadata_builtin
  details = basic_file_metadata

  Zip::File.open(@file_path) do |archive|
    entry_names = archive.entries.map(&:name)
    sheet_total = entry_names.count { |name| name.match?(/xl\/worksheets\/sheet\d+\.xml/) }

    details[:format] = 'xlsx'
    details[:sheet_count] = sheet_total
    details[:sheet_names] = Array.new(sheet_total) { |position| "Sheet#{position + 1}" }
    details[:has_formulas] = detect_xlsx_formulas(archive)
    details[:has_shared_strings] = entry_names.include?('xl/sharedStrings.xml')
  end

  details
rescue => e
  basic_file_metadata.merge(error: e.message)
end
372
+
373
# Returns one table Hash per worksheet of the XLSX archive at @file_path.
#
# Each table comes from extract_table_from_worksheet_xml and is tagged
# with a synthesized :sheet_name ("Sheet1", "Sheet2", ...). Because names
# are positional, worksheet entries are sorted by the number in their file
# name first; archive order alone could put sheet10.xml ahead of
# sheet2.xml. Any error results in an empty array.
def extract_xlsx_tables_builtin
  tables = []

  Zip::File.open(@file_path) do |zip_file|
    shared_strings = extract_shared_strings(zip_file)
    worksheets = zip_file.entries
                         .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                         .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheets.each_with_index do |worksheet, index|
      table = extract_table_from_worksheet_xml(zip_file.read(worksheet), shared_strings)
      table[:sheet_name] = "Sheet#{index + 1}"
      tables << table
    end
  end

  tables
rescue StandardError
  []
end
394
+
395
# Collects the formulas of every worksheet in the XLSX archive at
# @file_path, using extract_formulas_from_worksheet_xml per sheet.
#
# Sheets are labelled "Sheet1", "Sheet2", ... by position, so worksheet
# entries are sorted by the number in their file name first; archive order
# alone could put sheet10.xml ahead of sheet2.xml and attribute formulas
# to the wrong sheet. Any error results in an empty array.
def extract_xlsx_formulas_builtin
  formulas = []

  Zip::File.open(@file_path) do |zip_file|
    worksheets = zip_file.entries
                         .select { |entry| entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) }
                         .sort_by { |entry| entry.name[/sheet(\d+)\.xml/, 1].to_i }

    worksheets.each_with_index do |worksheet, index|
      worksheet_xml = zip_file.read(worksheet)
      formulas.concat(extract_formulas_from_worksheet_xml(worksheet_xml, "Sheet#{index + 1}"))
    end
  end

  formulas
rescue StandardError
  []
end
414
+
415
# Reads the shared-string table (xl/sharedStrings.xml) of an opened XLSX
# archive.
#
# Each <si> item yields one String: the text of its first direct <t>
# child, or, for rich text, the concatenated text of its <r>/<t> runs.
# An empty <t/> yields "" rather than nil, so every element of the result
# is a String.
#
# @param zip_file [#entries, #read] opened archive
# @return [Array<String>] strings in table order; [] when the archive has
#   no shared-string part or when reading/parsing fails
def extract_shared_strings(zip_file)
  entry = zip_file.entries.find { |candidate| candidate.name == 'xl/sharedStrings.xml' }
  return [] unless entry

  doc = REXML::Document.new(zip_file.read(entry))
  strings = []

  doc.elements.each('sst/si') do |si|
    plain = si.get_elements('t')
    text_nodes = plain.any? ? plain.first(1) : si.get_elements('r/t')
    # Element#text is nil for an empty element; join renders nil as "".
    strings << text_nodes.map(&:text).join
  end

  strings
rescue StandardError
  []
end
439
+
440
# Turns one worksheet XML document into text: one line per row, cell
# values (from extract_cell_value) joined with " | ". Rows whose cells are
# all empty strings, including rows without cells, are left out.
#
# @param worksheet_xml [String] worksheet XML
# @param shared_strings [Array<String>] shared-string table
# @return [String] newline-joined row lines
def extract_text_from_worksheet_xml(worksheet_xml, shared_strings)
  document = REXML::Document.new(worksheet_xml)
  rendered_rows = []

  document.elements.each('worksheet/sheetData/row') do |row_node|
    values = []
    row_node.elements.each('c') { |cell_node| values.push(extract_cell_value(cell_node, shared_strings)) }
    next if values.all?(&:empty?)

    rendered_rows.push(values.join(' | '))
  end

  rendered_rows.join("\n")
end
455
+
456
# Converts one worksheet XML document into a table Hash with :rows,
# :columns, :headers (the first row, or []) and :data (all rows, padded
# with '' to equal width).
#
# Worksheets can omit empty cells, so each value is placed by the column
# letters of its cell reference (the "r" attribute, e.g. "C5" -> third
# column). Appending in document order would shift values left whenever a
# row has gaps. Cells without a usable reference fall back to the next
# free position.
#
# @param worksheet_xml [String] worksheet XML
# @param shared_strings [Array<String>] shared-string table
# @return [Hash] table description
def extract_table_from_worksheet_xml(worksheet_xml, shared_strings)
  doc = REXML::Document.new(worksheet_xml)
  data = []

  doc.elements.each('worksheet/sheetData/row') do |row|
    row_data = []

    row.elements.each('c') do |cell|
      # At most three letters are accepted; a longer run is treated as
      # unusable so a bogus reference cannot force a huge array.
      letters = cell.attributes['r'].to_s[/\A[A-Za-z]{1,3}(?=\d)/]
      column_index =
        if letters
          letters.upcase.each_char.reduce(0) { |acc, ch| (acc * 26) + (ch.ord - 64) } - 1
        else
          row_data.length
        end
      row_data[column_index] = extract_cell_value(cell, shared_strings)
    end

    # Positions skipped by the references are nil at this point.
    data << row_data.map { |value| value || '' }
  end

  max_columns = data.map(&:length).max || 0
  data.each { |row| row.fill('', row.length...max_columns) }

  {
    rows: data.length,
    columns: max_columns,
    headers: data.first || [],
    data: data
  }
end
483
+
484
# Lists the formulas found in one worksheet XML document.
#
# Every cell with a non-empty <f> child yields a Hash with the sheet name,
# the cell reference (the "r" attribute), the formula text, and the cell's
# value. The value is resolved with an empty shared-string table, so a
# value stored as a shared-string index comes back as ''.
#
# @param worksheet_xml [String] worksheet XML
# @param sheet_name [String] label stored under :sheet
# @return [Array<Hash>] formulas in document order
def extract_formulas_from_worksheet_xml(worksheet_xml, sheet_name)
  doc = REXML::Document.new(worksheet_xml)
  formulas = []

  doc.elements.each('worksheet/sheetData/row') do |row|
    row.elements.each('c') do |cell|
      formula_text = cell.elements['f']&.text
      next unless formula_text

      formulas << {
        sheet: sheet_name,
        cell: cell.attributes['r'],
        formula: formula_text,
        value: extract_cell_value(cell, [])
      }
    end
  end

  formulas
end
508
+
509
# Resolves the display value of one worksheet <c> element.
#
# * t="inlineStr": the text lives in <is><t> (or <is><r><t> runs), not in
#   <v>; such cells used to be reported as ''.
# * t="s": <v> is an index into shared_strings ('' when out of range).
# * t="b": "1" becomes 'TRUE', anything else 'FALSE'.
# * anything else (including t="str" and untyped numbers/dates): the raw
#   <v> text.
#
# @param cell [REXML::Element] the <c> element
# @param shared_strings [Array<String>] shared-string table
# @return [String] the value, or '' when the cell carries none
def extract_cell_value(cell, shared_strings)
  cell_type = cell.attributes['t']

  if cell_type == 'inlineStr'
    text_nodes = cell.get_elements('is/t')
    text_nodes = cell.get_elements('is/r/t') if text_nodes.empty?
    return text_nodes.map(&:text).join
  end

  raw = cell.elements['v']&.text
  return '' unless raw

  case cell_type
  when 's' then shared_strings[raw.to_i] || ''
  when 'b' then raw == '1' ? 'TRUE' : 'FALSE'
  else raw
  end
end
527
+
528
# Reports whether any worksheet in an opened XLSX archive contains a
# formula element.
#
# The check looks for an opening <f tag followed by whitespace, ">" or
# "/", optionally namespace-prefixed. A literal "<f>" search misses
# formula elements that carry attributes, e.g. <f t="shared" ...>.
#
# @param zip_file [#entries, #read] opened archive
# @return [Boolean]
def detect_xlsx_formulas(zip_file)
  zip_file.entries.any? do |entry|
    entry.name.match?(/xl\/worksheets\/sheet\d+\.xml/) &&
      zip_file.read(entry).match?(%r{<(?:\w+:)?f[\s>/]})
  end
end
536
+
537
+ # XLS Processing Methods (Binary format - basic implementation)
538
# Best-effort text extraction for legacy .xls files: the file is read as
# raw bytes and every distinct run of three or more printable ASCII
# characters (0x20-0x7E) is kept, in first-seen order. This is not a real
# XLS parser. Returns a headed listing of the runs, a hint message when
# nothing readable was found, or an "Error reading XLS file: ..." string
# on failure.
def extract_xls_text_builtin
  raw_bytes = File.binread(@file_path)
  printable_runs = raw_bytes.scan(/[\x20-\x7E]{3,}/).uniq

  if printable_runs.empty?
    return "XLS file detected but no readable text extracted. Consider converting to XLSX format for better support."
  end

  "=== XLS Content (Basic Extraction) ===\n" + printable_runs.join("\n")
rescue => e
  "Error reading XLS file: #{e.message}"
end
554
+
555
# Metadata for legacy .xls files: basic_file_metadata overlaid with a
# fixed description (format 'xls', one sheet named "Sheet1") and a note
# that built-in XLS support is limited.
def extract_xls_metadata_builtin
  xls_overrides = {
    format: 'xls',
    sheet_count: 1,
    sheet_names: ['Sheet1'],
    note: 'XLS format has limited built-in support. Consider converting to XLSX for full functionality.'
  }

  basic_file_metadata.merge(xls_overrides)
end
563
+
564
# Placeholder table list for legacy .xls files: a single empty table named
# "Sheet1" carrying a note that built-in XLS support is limited.
def extract_xls_tables_builtin
  placeholder_table = {
    sheet_name: 'Sheet1',
    rows: 0,
    columns: 0,
    headers: [],
    data: [],
    note: 'XLS format has limited built-in support. Consider converting to XLSX for full functionality.'
  }

  [placeholder_table]
end
574
+
575
+ # Helper Methods
576
# Chooses a text extractor by sniffing the first four bytes of @file_path:
# the ZIP local-file signature "PK\x03\x04" selects the XLSX extractor,
# anything else is handed to the XLS extractor.
def determine_format_and_extract
  signature = File.binread(@file_path, 4)
  return extract_xlsx_text_builtin if signature == "PK\x03\x04"

  extract_xls_text_builtin
end
584
+
585
# File-system level metadata for @file_path: size in bytes, modification
# time, creation time, the lower-cased extension without dots as :format,
# and a fixed 'Unknown' encoding.
#
# :created uses File.birthtime where it is available. File.ctime, which
# was used before, is the inode-change time on Unix-like systems rather
# than the creation time; it is kept only as the fallback when birth time
# is not available.
def basic_file_metadata
  created_at =
    begin
      File.birthtime(@file_path)
    rescue NotImplementedError, SystemCallError
      File.ctime(@file_path)
    end

  {
    file_size: File.size(@file_path),
    last_modified: File.mtime(@file_path),
    created: created_at,
    format: File.extname(@file_path).downcase.delete('.'),
    encoding: 'Unknown'
  }
end
594
+
595
# Serializes the rows under table[:data] as comma-separated text, one CSV
# line per row. The csv library is already required at the top of this
# file.
#
# @param table [Hash] table Hash with a :data array of rows
# @return [String] CSV text
def convert_table_to_csv(table)
  rows = table[:data]
  CSV.generate { |out| rows.each { |cells| out << cells } }
end
604
+
605
# Serializes the rows under table[:data] as tab-separated text, one line
# per row. The csv library is already required at the top of this file.
#
# @param table [Hash] table Hash with a :data array of rows
# @return [String] TSV text
def convert_table_to_tsv(table)
  rows = table[:data]
  CSV.generate(col_sep: "\t") { |out| rows.each { |cells| out << cells } }
end
614
+
615
# Re-encodes comma-separated text as tab-separated text by parsing it and
# writing every row back out with a tab column separator. The csv library
# is already required at the top of this file.
#
# @param csv_content [String] CSV text
# @return [String] TSV text
def convert_csv_to_tsv(csv_content)
  parsed_rows = CSV.parse(csv_content)
  CSV.generate(col_sep: "\t") { |out| parsed_rows.each { |cells| out << cells } }
end
625
+
626
# Re-encodes tab-separated text as comma-separated text by parsing it with
# a tab column separator and writing every row back out as CSV. The csv
# library is already required at the top of this file.
#
# @param tsv_content [String] TSV text
# @return [String] CSV text
def convert_tsv_to_csv(tsv_content)
  parsed_rows = CSV.parse(tsv_content, col_sep: "\t")
  CSV.generate { |out| parsed_rows.each { |cells| out << cells } }
end
636
+
637
# Computes simple cell statistics for one table Hash.
#
# Cells are classified as empty (nil or blank), numeric, or text. A cell
# is numeric when, ignoring surrounding whitespace, its whole content is
# an optionally signed integer or decimal. The match is anchored to the
# whole cell (\A...\z), so multi-line text containing a numeric line is
# not counted, and negative values such as "-5" are counted as numbers
# instead of text. When numeric cells exist, min/max/average/median are
# added (median via calculate_median).
#
# @param table [Hash] table Hash with :rows, :columns and :data
# @return [Hash] statistics, or {} when the table has no data rows
def analyze_table_statistics(table)
  return {} if table[:data].empty?

  stats = {
    total_cells: table[:rows] * table[:columns],
    empty_cells: 0,
    numeric_cells: 0,
    text_cells: 0,
    numeric_values: []
  }

  table[:data].each do |row|
    row.each do |cell|
      text = cell.to_s.strip

      if text.empty?
        stats[:empty_cells] += 1
      elsif text.match?(/\A[-+]?\d+(\.\d+)?\z/)
        stats[:numeric_cells] += 1
        stats[:numeric_values] << text.to_f
      else
        stats[:text_cells] += 1
      end
    end
  end

  values = stats[:numeric_values]
  if values.any?
    stats[:min_value] = values.min
    stats[:max_value] = values.max
    stats[:average_value] = values.sum / values.length.to_f
    stats[:median_value] = calculate_median(values)
  end

  stats
end
671
+
672
# Produces basic data-quality figures for one table Hash: total rows,
# rows whose cells are all nil/blank, repeated rows, and a 0-100 score
# computed as (total - empty - duplicates) / total * 100, floored at 0.
#
# Rows are compared cell by cell (each cell as its string form). Joining
# the cells with "|" would treat ["a|b", "c"] and ["a", "b|c"] as the
# same row. A row that is both empty and a repeat counts in both tallies.
#
# @param table [Hash] table Hash with :rows and :data
# @return [Hash] validation figures, or {} when the table has no data rows
def validate_table_data(table)
  return {} if table[:data].empty?

  validation = {
    total_rows: table[:rows],
    empty_rows: 0,
    duplicate_rows: 0,
    data_quality_score: 0
  }

  seen_rows = Set.new

  table[:data].each do |row|
    validation[:empty_rows] += 1 if row.all? { |cell| cell.nil? || cell.to_s.strip.empty? }

    # Set#add? returns nil when the key was already present.
    validation[:duplicate_rows] += 1 unless seen_rows.add?(row.map(&:to_s))
  end

  total_rows = table[:rows]
  if total_rows > 0
    clean_rows = total_rows - validation[:empty_rows] - validation[:duplicate_rows]
    quality_score = (clean_rows / total_rows.to_f) * 100
    validation[:data_quality_score] = [quality_score.round(2), 0].max
  end

  validation
end
706
+
707
# Median of a list of numbers. The input is not modified.
#
# @param values [Array<Numeric>] numbers in any order
# @return [Numeric, nil] the middle value for an odd count, the mean of
#   the two middle values (as a Float) for an even count, or nil for an
#   empty list (which previously raised NoMethodError)
def calculate_median(values)
  return nil if values.empty?

  sorted = values.sort
  mid = sorted.length / 2

  sorted.length.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2.0
end
717
+
197
718
  def detect_formulas(workbook)
198
719
  workbook.sheets.any? do |sheet_name|
199
720
  workbook.sheet(sheet_name)
@@ -214,6 +735,72 @@ module UniversalDocumentProcessor
214
735
  # This is a placeholder for future implementation
215
736
  false
216
737
  end
738
+
739
# Median of a list of numbers. The input is not modified.
#
# NOTE(review): this method is defined twice in this class; being the
# later definition, this one is the one Ruby keeps. The two should be
# merged into a single definition.
#
# @param values [Array<Numeric>] numbers in any order
# @return [Numeric, nil] the middle value for an odd count, the mean of
#   the two middle values (as a Float) for an even count, or nil for an
#   empty list (which previously raised NoMethodError)
def calculate_median(values)
  return nil if values.empty?

  sorted = values.sort
  length = sorted.length

  if length.odd?
    sorted[length / 2]
  else
    (sorted[length / 2 - 1] + sorted[length / 2]) / 2.0
  end
end
748
+
749
# Guesses whether the first row of the workbook's current sheet is a
# header row: more than half of the first-row cells must be Strings and
# more than 30% of the second-row cells must be Numeric.
#
# @param workbook [#first_row, #last_row, #first_column, #last_column, #cell]
#   spreadsheet object exposing row/column bounds and cell(row, col)
# @return [Boolean] false when there is no second row
def detect_headers(workbook)
  final_row = workbook.last_row
  return false unless final_row && workbook.last_row > 1

  text_hits = 0
  numeric_hits = 0
  column_total = 0

  (workbook.first_column..workbook.last_column).each do |col|
    column_total += 1
    text_hits += 1 if workbook.cell(workbook.first_row, col).is_a?(String)
    numeric_hits += 1 if workbook.cell(workbook.first_row + 1, col).is_a?(Numeric)
  end

  text_share = text_hits.to_f / column_total
  numeric_share = numeric_hits.to_f / column_total

  text_share > 0.5 && numeric_share > 0.3
end
770
+
771
# Classifies the cells of every column on the workbook's current sheet.
#
# Each cell counts as :empty (nil or blank), :numeric (Numeric), :date
# (Date or Time) or :text (anything else). Per column the result holds the
# most frequent kind (ties go to the first of numeric, text, date, empty)
# and the share of each kind as a percentage rounded to one decimal.
#
# @param workbook [#first_row, #last_row, #first_column, #last_column, #cell]
#   spreadsheet object exposing row/column bounds and cell(row, col)
# @return [Hash{String=>Hash}] keyed "Column N"; {} when there are no rows
def analyze_column_types(workbook)
  return {} unless workbook.last_row

  (workbook.first_column..workbook.last_column).each_with_object({}) do |col, summary|
    tally = { numeric: 0, text: 0, date: 0, empty: 0 }
    row_total = workbook.last_row - workbook.first_row + 1

    (workbook.first_row..workbook.last_row).each do |row|
      value = workbook.cell(row, col)
      kind =
        if value.nil? || value.to_s.strip.empty?
          :empty
        elsif value.is_a?(Numeric)
          :numeric
        elsif value.is_a?(Date) || value.is_a?(Time)
          :date
        else
          :text
        end
      tally[kind] += 1
    end

    dominant_kind, = tally.max_by { |_kind, hits| hits }
    summary["Column #{col}"] = {
      predominant_type: dominant_kind,
      type_distribution: tally.transform_values { |hits| (hits.to_f / row_total * 100).round(1) }
    }
  end
end
217
804
  end
218
805
  end
219
806
  end