universal_document_processor 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/README.md +53 -1
- data/lib/universal_document_processor/document.rb +40 -4
- data/lib/universal_document_processor/processors/excel_processor.rb +719 -132
- data/lib/universal_document_processor/processors/word_processor.rb +82 -4
- data/lib/universal_document_processor/utils/file_detector.rb +1 -0
- data/lib/universal_document_processor/version.rb +1 -1
- metadata +15 -3
- data/AI_USAGE_GUIDE.md +0 -404
- data/GEM_RELEASE_GUIDE.md +0 -288
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '06539a78d5cc253518f84b242dd7a8dcb71ce575614d7e8853bb9de70b031f75'
|
4
|
+
data.tar.gz: c58e957dfe6940c0cb16fb4c50f9ce0ee69aa7cf9a7587b2496a80da78bbdda8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 81940e620b3dcff668493ae459e1287161ccbc56b8677acab3477913685d4f68580f0d4062ae0e777e83a000aa58a42619f3a96d0f19b2146e38fa2c3418587a
|
7
|
+
data.tar.gz: 61b98794dc95f0489a806d1dbb31b172f75570ac2fa140d0ff334347f58afdbfab370ffd72b003b948b7a4ec80656e6f29b196ca833e0b219b41316ebb729843
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [1.2.0] - 2024-01-15
|
11
|
+
### Added
|
12
|
+
- **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
|
13
|
+
- Native TSV parsing using Ruby CSV library with tab delimiter
|
14
|
+
- Text extraction with proper formatting
|
15
|
+
- Comprehensive metadata detection (format, delimiter, encoding)
|
16
|
+
- Table structure analysis and header detection
|
17
|
+
- Statistical analysis and data validation
|
18
|
+
- Format conversions: TSV ↔ CSV, TSV → JSON
|
19
|
+
- Cross-format compatibility with existing CSV and Excel features
|
20
|
+
- New `to_tsv()` method for converting other formats to TSV
|
21
|
+
- Enhanced file detector with TSV MIME type mapping
|
22
|
+
- Full integration with existing Document class API
|
23
|
+
|
24
|
+
### Enhanced
|
25
|
+
- **ExcelProcessor**: Extended to handle TSV files alongside CSV and Excel formats
|
26
|
+
- **File Detection**: Added TSV MIME type support (`text/tab-separated-values`)
|
27
|
+
- **Document Class**: Added `to_tsv()` method and TSV format support
|
28
|
+
- **Supported Formats**: Updated to include TSV in format list
|
29
|
+
|
10
30
|
## [1.0.1] - 2025-06-23
|
11
31
|
|
12
32
|
### Fixed
|
data/README.md
CHANGED
@@ -16,7 +16,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
|
|
16
16
|
|
17
17
|
### **Supported File Formats**
|
18
18
|
- **📄 Documents**: PDF, DOC, DOCX, RTF
|
19
|
-
- **📊 Spreadsheets**: XLS, XLSX, CSV
|
19
|
+
- **📊 Spreadsheets**: XLS, XLSX, CSV, TSV
|
20
20
|
- **📺 Presentations**: PPT, PPTX
|
21
21
|
- **🖼️ Images**: JPG, PNG, GIF, BMP, TIFF
|
22
22
|
- **📁 Archives**: ZIP, RAR, 7Z
|
@@ -236,6 +236,58 @@ tables.each_with_index do |table, index|
|
|
236
236
|
end
|
237
237
|
```
|
238
238
|
|
239
|
+
### Processing TSV (Tab-Separated Values) Files
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
# Process TSV files with built-in support
|
243
|
+
result = UniversalDocumentProcessor.process('data.tsv')
|
244
|
+
|
245
|
+
# TSV-specific metadata
|
246
|
+
metadata = result[:metadata]
|
247
|
+
puts "Format: #{metadata[:format]}" # => "tsv"
|
248
|
+
puts "Delimiter: #{metadata[:delimiter]}" # => "tab"
|
249
|
+
puts "Rows: #{metadata[:total_rows]}"
|
250
|
+
puts "Columns: #{metadata[:total_columns]}"
|
251
|
+
puts "Has headers: #{metadata[:has_headers]}"
|
252
|
+
|
253
|
+
# Extract structured data
|
254
|
+
tables = result[:tables]
|
255
|
+
table = tables.first
|
256
|
+
puts "Headers: #{table[:headers].join(', ')}"
|
257
|
+
puts "Sample row: #{table[:data][1].join(' | ')}"
|
258
|
+
|
259
|
+
# Format conversions
|
260
|
+
document = UniversalDocumentProcessor::Document.new('data.tsv')
|
261
|
+
|
262
|
+
# Convert TSV to CSV
|
263
|
+
csv_output = document.to_csv
|
264
|
+
puts "CSV conversion: #{csv_output.length} characters"
|
265
|
+
|
266
|
+
# Convert TSV to JSON
|
267
|
+
json_output = document.to_json
|
268
|
+
puts "JSON conversion: #{json_output.length} characters"
|
269
|
+
|
270
|
+
# Convert CSV to TSV
|
271
|
+
csv_document = UniversalDocumentProcessor::Document.new('data.csv')
|
272
|
+
tsv_output = csv_document.to_tsv
|
273
|
+
puts "TSV conversion: #{tsv_output.length} characters"
|
274
|
+
|
275
|
+
# Statistical analysis
|
276
|
+
stats = document.extract_statistics
|
277
|
+
sheet_stats = stats['Sheet1']
|
278
|
+
puts "Total cells: #{sheet_stats[:total_cells]}"
|
279
|
+
puts "Numeric cells: #{sheet_stats[:numeric_cells]}"
|
280
|
+
puts "Text cells: #{sheet_stats[:text_cells]}"
|
281
|
+
puts "Average value: #{sheet_stats[:average_value]}"
|
282
|
+
|
283
|
+
# Data validation
|
284
|
+
validation = document.validate_data
|
285
|
+
sheet_validation = validation['Sheet1']
|
286
|
+
puts "Data quality score: #{sheet_validation[:data_quality_score]}%"
|
287
|
+
puts "Empty rows: #{sheet_validation[:empty_rows]}"
|
288
|
+
puts "Duplicate rows: #{sheet_validation[:duplicate_rows]}"
|
289
|
+
```
|
290
|
+
|
239
291
|
### Processing Word Documents
|
240
292
|
|
241
293
|
```ruby
|
data/lib/universal_document_processor/document.rb
CHANGED
@@ -48,6 +48,42 @@ module UniversalDocumentProcessor
|
|
48
48
|
[]
|
49
49
|
end
|
50
50
|
|
51
|
+
def extract_statistics
|
52
|
+
processor.respond_to?(:extract_statistics) ? processor.extract_statistics : {}
|
53
|
+
rescue => e
|
54
|
+
{}
|
55
|
+
end
|
56
|
+
|
57
|
+
def validate_data
|
58
|
+
processor.respond_to?(:validate_data) ? processor.validate_data : {}
|
59
|
+
rescue => e
|
60
|
+
{}
|
61
|
+
end
|
62
|
+
|
63
|
+
def extract_formulas
|
64
|
+
processor.respond_to?(:extract_formulas) ? processor.extract_formulas : []
|
65
|
+
rescue => e
|
66
|
+
[]
|
67
|
+
end
|
68
|
+
|
69
|
+
def to_json
|
70
|
+
processor.respond_to?(:to_json) ? processor.to_json : process.to_json
|
71
|
+
rescue => e
|
72
|
+
process.to_json
|
73
|
+
end
|
74
|
+
|
75
|
+
def to_csv(sheet_name = nil)
|
76
|
+
processor.respond_to?(:to_csv) ? processor.to_csv(sheet_name) : ""
|
77
|
+
rescue => e
|
78
|
+
""
|
79
|
+
end
|
80
|
+
|
81
|
+
def to_tsv(sheet_name = nil)
|
82
|
+
processor.respond_to?(:to_tsv) ? processor.to_tsv(sheet_name) : ""
|
83
|
+
rescue => e
|
84
|
+
""
|
85
|
+
end
|
86
|
+
|
51
87
|
def convert_to(target_format)
|
52
88
|
case target_format.to_sym
|
53
89
|
when :pdf
|
@@ -64,7 +100,7 @@ module UniversalDocumentProcessor
|
|
64
100
|
end
|
65
101
|
|
66
102
|
def supported_formats
|
67
|
-
%w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv jpg jpeg png gif bmp tiff zip rar 7z]
|
103
|
+
%w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv tsv jpg jpeg png gif bmp tiff zip rar 7z]
|
68
104
|
end
|
69
105
|
|
70
106
|
def supported?
|
@@ -139,11 +175,11 @@ module UniversalDocumentProcessor
|
|
139
175
|
case @content_type
|
140
176
|
when /pdf/
|
141
177
|
Processors::PdfProcessor.new(@file_path, @options)
|
142
|
-
when /
|
178
|
+
when /wordprocessingml/, /msword/
|
143
179
|
Processors::WordProcessor.new(@file_path, @options)
|
144
|
-
when /excel/, /
|
180
|
+
when /spreadsheetml/, /ms-excel/, /csv/, /tab-separated/
|
145
181
|
Processors::ExcelProcessor.new(@file_path, @options)
|
146
|
-
when /
|
182
|
+
when /presentationml/, /ms-powerpoint/
|
147
183
|
Processors::PowerpointProcessor.new(@file_path, @options)
|
148
184
|
when /image/
|
149
185
|
Processors::ImageProcessor.new(@file_path, @options)
|