universal_document_processor 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c33ef9db6830ddb62e98966d0bf3dd85106833e260303651df297bb3d46b9529
4
- data.tar.gz: 280fa75bf3d842fc1af11dd95ca49e0526ed22512b5732aed0a7c25d4a57a7d9
3
+ metadata.gz: '06539a78d5cc253518f84b242dd7a8dcb71ce575614d7e8853bb9de70b031f75'
4
+ data.tar.gz: c58e957dfe6940c0cb16fb4c50f9ce0ee69aa7cf9a7587b2496a80da78bbdda8
5
5
  SHA512:
6
- metadata.gz: 01d6c5129dc7ec1a7911a77ba69b8dbafd32bd9397e0374175a4cf09cf24f8c06225a0bc927b1e3bba8250e3d1e93d40bbbc1dbf3aa1a6ff0fbc92cb7f5f24a4
7
- data.tar.gz: b445b8d773e2865e6e2c5593c123cc7d0da580d4d16822e4b9e45c851568f077f4c949d921f2f602c4ed3ff941ccf8a1d002eb14401c554b0f8355dc293bff62
6
+ metadata.gz: 81940e620b3dcff668493ae459e1287161ccbc56b8677acab3477913685d4f68580f0d4062ae0e777e83a000aa58a42619f3a96d0f19b2146e38fa2c3418587a
7
+ data.tar.gz: 61b98794dc95f0489a806d1dbb31b172f75570ac2fa140d0ff334347f58afdbfab370ffd72b003b948b7a4ec80656e6f29b196ca833e0b219b41316ebb729843
data/CHANGELOG.md CHANGED
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [1.0.2] - 2024-01-15
11
+ ### Added
12
+ - **TSV (Tab-Separated Values) File Support**: Complete built-in TSV processing capabilities
13
+ - Native TSV parsing using Ruby CSV library with tab delimiter
14
+ - Text extraction with proper formatting
15
+ - Comprehensive metadata detection (format, delimiter, encoding)
16
+ - Table structure analysis and header detection
17
+ - Statistical analysis and data validation
18
+ - Format conversions: TSV ↔ CSV, TSV → JSON
19
+ - Cross-format compatibility with existing CSV and Excel features
20
+ - New `to_tsv()` method for converting other formats to TSV
21
+ - Enhanced file detector with TSV MIME type mapping
22
+ - Full integration with existing Document class API
23
+
24
+ ### Enhanced
25
+ - **ExcelProcessor**: Extended to handle TSV files alongside CSV and Excel formats
26
+ - **File Detection**: Added TSV MIME type support (`text/tab-separated-values`)
27
+ - **Document Class**: Added `to_tsv()` method and TSV format support
28
+ - **Supported Formats**: Updated to include TSV in format list
29
+
10
30
  ## [1.0.1] - 2025-06-23
11
31
 
12
32
  ### Fixed
data/README.md CHANGED
@@ -16,7 +16,7 @@ A comprehensive Ruby gem that provides unified document processing capabilities
16
16
 
17
17
  ### **Supported File Formats**
18
18
  - **📄 Documents**: PDF, DOC, DOCX, RTF
19
- - **📊 Spreadsheets**: XLS, XLSX, CSV
19
+ - **📊 Spreadsheets**: XLS, XLSX, CSV, TSV
20
20
  - **📺 Presentations**: PPT, PPTX
21
21
  - **🖼️ Images**: JPG, PNG, GIF, BMP, TIFF
22
22
  - **📁 Archives**: ZIP, RAR, 7Z
@@ -236,6 +236,58 @@ tables.each_with_index do |table, index|
236
236
  end
237
237
  ```
238
238
 
239
+ ### Processing TSV (Tab-Separated Values) Files
240
+
241
+ ```ruby
242
+ # Process TSV files with built-in support
243
+ result = UniversalDocumentProcessor.process('data.tsv')
244
+
245
+ # TSV-specific metadata
246
+ metadata = result[:metadata]
247
+ puts "Format: #{metadata[:format]}" # => "tsv"
248
+ puts "Delimiter: #{metadata[:delimiter]}" # => "tab"
249
+ puts "Rows: #{metadata[:total_rows]}"
250
+ puts "Columns: #{metadata[:total_columns]}"
251
+ puts "Has headers: #{metadata[:has_headers]}"
252
+
253
+ # Extract structured data
254
+ tables = result[:tables]
255
+ table = tables.first
256
+ puts "Headers: #{table[:headers].join(', ')}"
257
+ puts "Sample row: #{table[:data][1].join(' | ')}"
258
+
259
+ # Format conversions
260
+ document = UniversalDocumentProcessor::Document.new('data.tsv')
261
+
262
+ # Convert TSV to CSV
263
+ csv_output = document.to_csv
264
+ puts "CSV conversion: #{csv_output.length} characters"
265
+
266
+ # Convert TSV to JSON
267
+ json_output = document.to_json
268
+ puts "JSON conversion: #{json_output.length} characters"
269
+
270
+ # Convert CSV to TSV
271
+ csv_document = UniversalDocumentProcessor::Document.new('data.csv')
272
+ tsv_output = csv_document.to_tsv
273
+ puts "TSV conversion: #{tsv_output.length} characters"
274
+
275
+ # Statistical analysis
276
+ stats = document.extract_statistics
277
+ sheet_stats = stats['Sheet1']
278
+ puts "Total cells: #{sheet_stats[:total_cells]}"
279
+ puts "Numeric cells: #{sheet_stats[:numeric_cells]}"
280
+ puts "Text cells: #{sheet_stats[:text_cells]}"
281
+ puts "Average value: #{sheet_stats[:average_value]}"
282
+
283
+ # Data validation
284
+ validation = document.validate_data
285
+ sheet_validation = validation['Sheet1']
286
+ puts "Data quality score: #{sheet_validation[:data_quality_score]}%"
287
+ puts "Empty rows: #{sheet_validation[:empty_rows]}"
288
+ puts "Duplicate rows: #{sheet_validation[:duplicate_rows]}"
289
+ ```
290
+
239
291
  ### Processing Word Documents
240
292
 
241
293
  ```ruby
@@ -48,6 +48,42 @@ module UniversalDocumentProcessor
48
48
  []
49
49
  end
50
50
 
51
+ def extract_statistics
52
+ processor.respond_to?(:extract_statistics) ? processor.extract_statistics : {}
53
+ rescue => e
54
+ {}
55
+ end
56
+
57
+ def validate_data
58
+ processor.respond_to?(:validate_data) ? processor.validate_data : {}
59
+ rescue => e
60
+ {}
61
+ end
62
+
63
+ def extract_formulas
64
+ processor.respond_to?(:extract_formulas) ? processor.extract_formulas : []
65
+ rescue => e
66
+ []
67
+ end
68
+
69
+ def to_json
70
+ processor.respond_to?(:to_json) ? processor.to_json : process.to_json
71
+ rescue => e
72
+ process.to_json
73
+ end
74
+
75
+ def to_csv(sheet_name = nil)
76
+ processor.respond_to?(:to_csv) ? processor.to_csv(sheet_name) : ""
77
+ rescue => e
78
+ ""
79
+ end
80
+
81
+ def to_tsv(sheet_name = nil)
82
+ processor.respond_to?(:to_tsv) ? processor.to_tsv(sheet_name) : ""
83
+ rescue => e
84
+ ""
85
+ end
86
+
51
87
  def convert_to(target_format)
52
88
  case target_format.to_sym
53
89
  when :pdf
@@ -64,7 +100,7 @@ module UniversalDocumentProcessor
64
100
  end
65
101
 
66
102
  def supported_formats
67
- %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv jpg jpeg png gif bmp tiff zip rar 7z]
103
+ %w[pdf docx doc xlsx xls pptx ppt txt rtf html xml csv tsv jpg jpeg png gif bmp tiff zip rar 7z]
68
104
  end
69
105
 
70
106
  def supported?
@@ -139,11 +175,11 @@ module UniversalDocumentProcessor
139
175
  case @content_type
140
176
  when /pdf/
141
177
  Processors::PdfProcessor.new(@file_path, @options)
142
- when /word/, /document/
178
+ when /wordprocessingml/, /msword/
143
179
  Processors::WordProcessor.new(@file_path, @options)
144
- when /excel/, /spreadsheet/
180
+ when /spreadsheetml/, /ms-excel/, /csv/, /tab-separated/
145
181
  Processors::ExcelProcessor.new(@file_path, @options)
146
- when /powerpoint/, /presentation/
182
+ when /presentationml/, /ms-powerpoint/
147
183
  Processors::PowerpointProcessor.new(@file_path, @options)
148
184
  when /image/
149
185
  Processors::ImageProcessor.new(@file_path, @options)