csv-utils 0.3.25 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +53 -0
- data/.rubocop.yml +81 -0
- data/ARCHITECTURE.md +154 -0
- data/CLAUDE.md +63 -0
- data/Gemfile +2 -1
- data/Gemfile.lock +5 -0
- data/README.md +238 -16
- data/bin/csv-diff +3 -3
- data/bin/csv-duplicate-finder +1 -1
- data/bin/csv-grep +3 -3
- data/bin/csv-readline +4 -5
- data/bin/csv-splitter +1 -1
- data/bin/csv-validator +38 -36
- data/csv-utils.gemspec +6 -5
- data/lib/csv-utils.rb +3 -0
- data/lib/csv_utils/csv_compare.rb +77 -71
- data/lib/csv_utils/csv_extender.rb +45 -41
- data/lib/csv_utils/csv_iterator.rb +90 -75
- data/lib/csv_utils/csv_options.rb +11 -11
- data/lib/csv_utils/csv_report.rb +5 -2
- data/lib/csv_utils/csv_row.rb +3 -1
- data/lib/csv_utils/csv_row_matcher.rb +34 -0
- data/lib/csv_utils/csv_sort.rb +110 -96
- data/lib/csv_utils/csv_transformer.rb +95 -92
- data/lib/csv_utils/csv_wrapper.rb +40 -36
- metadata +13 -6
- data/docs/ARCHITECTURE.md +0 -134
|
data/lib/csv_utils/csv_transformer.rb
CHANGED
|
@@ -1,119 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
# Transforms a CSV given a series of steps
|
|
2
|
-
|
|
3
|
-
|
|
4
|
+
module CSVUtils
|
|
5
|
+
class CSVTransformer
|
|
6
|
+
attr_reader :headers
|
|
4
7
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
8
|
+
def initialize(src_csv, dest_csv, csv_options = {})
|
|
9
|
+
@src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
|
|
10
|
+
@dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
|
|
11
|
+
end
|
|
9
12
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
def read_headers
|
|
14
|
+
@headers = @src_csv.shift
|
|
15
|
+
self
|
|
16
|
+
end
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
def additional_data(&block)
|
|
19
|
+
steps << [:additional_data, @headers, block]
|
|
20
|
+
self
|
|
21
|
+
end
|
|
19
22
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
def select(&block)
|
|
24
|
+
steps << [:select, @headers, block]
|
|
25
|
+
self
|
|
26
|
+
end
|
|
24
27
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
def reject(&block)
|
|
29
|
+
steps << [:reject, @headers, block]
|
|
30
|
+
self
|
|
31
|
+
end
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
def map(new_headers, &block)
|
|
34
|
+
steps << [:map, @headers, block]
|
|
35
|
+
@headers = new_headers
|
|
36
|
+
self
|
|
37
|
+
end
|
|
35
38
|
|
|
36
|
-
|
|
37
|
-
|
|
39
|
+
def append(additional_headers, &block)
|
|
40
|
+
steps << [:append, @headers, block]
|
|
38
41
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
if additional_headers
|
|
43
|
+
@headers += additional_headers
|
|
44
|
+
else
|
|
45
|
+
@headers = nil
|
|
46
|
+
end
|
|
44
47
|
|
|
45
|
-
|
|
46
|
-
|
|
48
|
+
self
|
|
49
|
+
end
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
def each(&block)
|
|
52
|
+
steps << [:each, @headers, block]
|
|
53
|
+
self
|
|
54
|
+
end
|
|
52
55
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
56
|
+
def set_headers(headers)
|
|
57
|
+
@headers = headers
|
|
58
|
+
self
|
|
59
|
+
end
|
|
57
60
|
|
|
58
|
-
|
|
59
|
-
|
|
61
|
+
def process(batch_size = 10_000)
|
|
62
|
+
batch = []
|
|
60
63
|
|
|
61
|
-
|
|
64
|
+
@dest_csv << @headers if @headers
|
|
62
65
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
steps_proc = proc do
|
|
67
|
+
steps.each do |step_type, current_headers, proc|
|
|
68
|
+
batch = process_step(step_type, current_headers, batch, &proc)
|
|
69
|
+
end
|
|
67
70
|
|
|
68
|
-
|
|
71
|
+
batch.each { |row| @dest_csv << row }
|
|
69
72
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
while (row = @src_csv.shift)
|
|
74
|
-
batch << row
|
|
75
|
-
steps_proc.call if batch.size >= batch_size
|
|
76
|
-
end
|
|
73
|
+
batch = []
|
|
74
|
+
end
|
|
77
75
|
|
|
78
|
-
|
|
76
|
+
while (row = @src_csv.shift)
|
|
77
|
+
batch << row
|
|
78
|
+
steps_proc.call if batch.size >= batch_size
|
|
79
|
+
end
|
|
79
80
|
|
|
80
|
-
|
|
81
|
-
@dest_csv.close
|
|
82
|
-
end
|
|
81
|
+
steps_proc.call if batch.size.positive?
|
|
83
82
|
|
|
84
|
-
|
|
83
|
+
@src_csv.close
|
|
84
|
+
@dest_csv.close
|
|
85
|
+
end
|
|
85
86
|
|
|
86
|
-
|
|
87
|
-
@steps ||= []
|
|
88
|
-
end
|
|
87
|
+
private
|
|
89
88
|
|
|
89
|
+
def steps
|
|
90
|
+
@steps ||= []
|
|
91
|
+
end
|
|
90
92
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
93
|
+
def process_step(step_type, current_headers, batch, &block)
|
|
94
|
+
case step_type
|
|
95
|
+
when :select
|
|
96
|
+
batch.select! do |row|
|
|
97
|
+
block.call row, current_headers, @additional_data
|
|
98
|
+
end
|
|
99
|
+
when :reject
|
|
100
|
+
batch.reject! do |row|
|
|
101
|
+
block.call row, current_headers, @additional_data
|
|
102
|
+
end
|
|
103
|
+
when :map
|
|
104
|
+
batch.map! do |row|
|
|
105
|
+
block.call row, current_headers, @additional_data
|
|
106
|
+
end
|
|
107
|
+
when :append
|
|
108
|
+
batch.map! do |row|
|
|
109
|
+
row + block.call(row, current_headers, @additional_data)
|
|
110
|
+
end
|
|
111
|
+
when :additional_data
|
|
112
|
+
@additional_data = block.call(batch, current_headers)
|
|
113
|
+
when :each
|
|
114
|
+
batch.each do |row|
|
|
115
|
+
block.call(row, current_headers, @additional_data)
|
|
116
|
+
end
|
|
104
117
|
end
|
|
105
|
-
when :append
|
|
106
|
-
batch.map! do |row|
|
|
107
|
-
row + block.call(row, current_headers, @additional_data)
|
|
108
|
-
end
|
|
109
|
-
when :additional_data
|
|
110
|
-
@additional_data = block.call(batch, current_headers)
|
|
111
|
-
when :each
|
|
112
|
-
batch.each do |row|
|
|
113
|
-
block.call(row, current_headers, @additional_data)
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
118
|
|
|
117
|
-
|
|
119
|
+
batch
|
|
120
|
+
end
|
|
118
121
|
end
|
|
119
122
|
end
|
|
data/lib/csv_utils/csv_wrapper.rb
CHANGED
|
@@ -1,51 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
# Wraps a CSV object, if wrapper opens the csv file it will close it
|
|
2
|
-
|
|
3
|
-
|
|
4
|
+
module CSVUtils
|
|
5
|
+
class CSVWrapper
|
|
6
|
+
attr_reader :csv
|
|
4
7
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
+
def initialize(csv, mode, csv_options)
|
|
9
|
+
open(csv, mode, csv_options)
|
|
10
|
+
end
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
|
|
12
|
+
def self.open(file, mode, csv_options = {})
|
|
13
|
+
csv = new(file, mode, csv_options)
|
|
11
14
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
if block_given?
|
|
16
|
+
yield csv
|
|
17
|
+
csv.close
|
|
18
|
+
else
|
|
19
|
+
csv
|
|
20
|
+
end
|
|
17
21
|
end
|
|
18
|
-
end
|
|
19
22
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
23
|
+
def open(csv, mode, csv_options)
|
|
24
|
+
if csv.is_a?(String)
|
|
25
|
+
@close_when_done = true
|
|
26
|
+
@csv = CSV.open(csv, mode, **csv_options)
|
|
27
|
+
else
|
|
28
|
+
@close_when_done = false
|
|
29
|
+
@csv = csv
|
|
30
|
+
end
|
|
27
31
|
end
|
|
28
|
-
end
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
+
def <<(row)
|
|
34
|
+
csv << row
|
|
35
|
+
end
|
|
33
36
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
+
def shift
|
|
38
|
+
csv.shift
|
|
39
|
+
end
|
|
37
40
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
+
def rewind
|
|
42
|
+
csv.rewind
|
|
43
|
+
end
|
|
41
44
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
+
def close
|
|
46
|
+
csv.close if close_when_done?
|
|
47
|
+
end
|
|
45
48
|
|
|
46
|
-
|
|
49
|
+
private
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
def close_when_done?
|
|
52
|
+
@close_when_done
|
|
53
|
+
end
|
|
50
54
|
end
|
|
51
55
|
end
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: csv-utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.25
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Doug Youch
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date:
|
|
10
|
+
date: 2026-01-31 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: csv
|
|
@@ -37,7 +37,9 @@ dependencies:
|
|
|
37
37
|
- - ">="
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
39
|
version: '0'
|
|
40
|
-
description:
|
|
40
|
+
description: A Ruby library for CSV file processing featuring comparison, transformation,
|
|
41
|
+
sorting, and validation. Includes CLI tools for debugging malformed CSVs, auto-detection
|
|
42
|
+
of encodings and separators, and efficient handling of large files.
|
|
41
43
|
email: dougyouch@gmail.com
|
|
42
44
|
executables:
|
|
43
45
|
- csv-change-eol
|
|
@@ -52,9 +54,13 @@ executables:
|
|
|
52
54
|
extensions: []
|
|
53
55
|
extra_rdoc_files: []
|
|
54
56
|
files:
|
|
57
|
+
- ".github/workflows/ci.yml"
|
|
55
58
|
- ".gitignore"
|
|
59
|
+
- ".rubocop.yml"
|
|
56
60
|
- ".ruby-gemset"
|
|
57
61
|
- ".ruby-version"
|
|
62
|
+
- ARCHITECTURE.md
|
|
63
|
+
- CLAUDE.md
|
|
58
64
|
- Gemfile
|
|
59
65
|
- Gemfile.lock
|
|
60
66
|
- LICENSE
|
|
@@ -69,7 +75,6 @@ files:
|
|
|
69
75
|
- bin/csv-splitter
|
|
70
76
|
- bin/csv-validator
|
|
71
77
|
- csv-utils.gemspec
|
|
72
|
-
- docs/ARCHITECTURE.md
|
|
73
78
|
- lib/csv-utils.rb
|
|
74
79
|
- lib/csv_utils/csv_compare.rb
|
|
75
80
|
- lib/csv_utils/csv_extender.rb
|
|
@@ -77,6 +82,7 @@ files:
|
|
|
77
82
|
- lib/csv_utils/csv_options.rb
|
|
78
83
|
- lib/csv_utils/csv_report.rb
|
|
79
84
|
- lib/csv_utils/csv_row.rb
|
|
85
|
+
- lib/csv_utils/csv_row_matcher.rb
|
|
80
86
|
- lib/csv_utils/csv_sort.rb
|
|
81
87
|
- lib/csv_utils/csv_transformer.rb
|
|
82
88
|
- lib/csv_utils/csv_wrapper.rb
|
|
@@ -84,7 +90,8 @@ files:
|
|
|
84
90
|
homepage: https://github.com/dougyouch/csv-utils
|
|
85
91
|
licenses:
|
|
86
92
|
- MIT
|
|
87
|
-
metadata:
|
|
93
|
+
metadata:
|
|
94
|
+
rubygems_mfa_required: 'true'
|
|
88
95
|
rdoc_options: []
|
|
89
96
|
require_paths:
|
|
90
97
|
- lib
|
|
@@ -101,5 +108,5 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
101
108
|
requirements: []
|
|
102
109
|
rubygems_version: 3.6.2
|
|
103
110
|
specification_version: 4
|
|
104
|
-
summary: CSV
|
|
111
|
+
summary: Comprehensive CSV manipulation and debugging utilities for Ruby
|
|
105
112
|
test_files: []
|
data/docs/ARCHITECTURE.md
DELETED
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
# CSV Utils Architecture
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
CSV Utils is a Ruby library designed to provide a comprehensive set of tools for CSV file manipulation. The architecture follows a modular design pattern, with each component handling a specific aspect of CSV processing.
|
|
6
|
-
|
|
7
|
-
## Core Components
|
|
8
|
-
|
|
9
|
-
### 1. CSVCompare
|
|
10
|
-
- **Purpose**: Compares two CSV files to identify differences
|
|
11
|
-
- **Key Features**:
|
|
12
|
-
- Identifies creates, updates, and deletes between files
|
|
13
|
-
- Supports custom comparison logic
|
|
14
|
-
- Handles BOM (Byte Order Mark) stripping
|
|
15
|
-
- Memory-efficient streaming comparison
|
|
16
|
-
- **Dependencies**: None (uses standard Ruby CSV library)
|
|
17
|
-
|
|
18
|
-
### 2. CSVTransformer
|
|
19
|
-
- **Purpose**: Transforms CSV data according to custom rules
|
|
20
|
-
- **Key Features**:
|
|
21
|
-
- Row-by-row transformation
|
|
22
|
-
- Custom transformation blocks
|
|
23
|
-
- Maintains header structure
|
|
24
|
-
- **Dependencies**: None
|
|
25
|
-
|
|
26
|
-
### 3. CSVSort
|
|
27
|
-
- **Purpose**: Sorts CSV files based on specified columns
|
|
28
|
-
- **Key Features**:
|
|
29
|
-
- Multi-column sorting
|
|
30
|
-
- Memory-efficient sorting
|
|
31
|
-
- Preserves header row
|
|
32
|
-
- **Dependencies**: None
|
|
33
|
-
|
|
34
|
-
### 4. CSVReport
|
|
35
|
-
- **Purpose**: Generates reports from CSV data
|
|
36
|
-
- **Key Features**:
|
|
37
|
-
- Custom report formatting
|
|
38
|
-
- Data aggregation
|
|
39
|
-
- Summary statistics
|
|
40
|
-
- **Dependencies**: None
|
|
41
|
-
|
|
42
|
-
### 5. CSVIterator
|
|
43
|
-
- **Purpose**: Provides efficient iteration over CSV files
|
|
44
|
-
- **Key Features**:
|
|
45
|
-
- Memory-efficient streaming
|
|
46
|
-
- Custom iteration blocks
|
|
47
|
-
- Header handling
|
|
48
|
-
- **Dependencies**: None
|
|
49
|
-
|
|
50
|
-
### 6. CSVExtender
|
|
51
|
-
- **Purpose**: Extends CSV files with additional data
|
|
52
|
-
- **Key Features**:
|
|
53
|
-
- Column addition
|
|
54
|
-
- Data enrichment
|
|
55
|
-
- Custom extension logic
|
|
56
|
-
- **Dependencies**: None
|
|
57
|
-
|
|
58
|
-
### 7. CSVWrapper
|
|
59
|
-
- **Purpose**: Provides a convenient wrapper for CSV operations
|
|
60
|
-
- **Key Features**:
|
|
61
|
-
- Simplified CSV access
|
|
62
|
-
- Common operation shortcuts
|
|
63
|
-
- Error handling
|
|
64
|
-
- **Dependencies**: None
|
|
65
|
-
|
|
66
|
-
## Design Principles
|
|
67
|
-
|
|
68
|
-
1. **Modularity**: Each component is self-contained and focused on a single responsibility
|
|
69
|
-
2. **Memory Efficiency**: Components are designed to handle large files through streaming
|
|
70
|
-
3. **Extensibility**: Custom logic can be injected through blocks and callbacks
|
|
71
|
-
4. **Error Handling**: Robust error handling and validation
|
|
72
|
-
5. **Performance**: Optimized for large file processing
|
|
73
|
-
|
|
74
|
-
## Data Flow
|
|
75
|
-
|
|
76
|
-
1. **Input Processing**:
|
|
77
|
-
- Files are read using Ruby's CSV library
|
|
78
|
-
- BOM stripping is handled automatically
|
|
79
|
-
- Headers are preserved and validated
|
|
80
|
-
|
|
81
|
-
2. **Processing**:
|
|
82
|
-
- Each component processes data in a streaming fashion
|
|
83
|
-
- Custom logic can be injected at various points
|
|
84
|
-
- Memory usage is optimized for large files
|
|
85
|
-
|
|
86
|
-
3. **Output Generation**:
|
|
87
|
-
- Results are written to new files or returned as data structures
|
|
88
|
-
- Headers are preserved in output files
|
|
89
|
-
- Error states are properly handled
|
|
90
|
-
|
|
91
|
-
## Error Handling
|
|
92
|
-
|
|
93
|
-
- File not found errors
|
|
94
|
-
- Invalid CSV format
|
|
95
|
-
- Missing required columns
|
|
96
|
-
- Permission issues
|
|
97
|
-
- Memory constraints
|
|
98
|
-
|
|
99
|
-
## Performance Considerations
|
|
100
|
-
|
|
101
|
-
1. **Memory Usage**:
|
|
102
|
-
- Streaming processing for large files
|
|
103
|
-
- Minimal in-memory data storage
|
|
104
|
-
- Efficient data structures
|
|
105
|
-
|
|
106
|
-
2. **Processing Speed**:
|
|
107
|
-
- Optimized comparison algorithms
|
|
108
|
-
- Efficient sorting mechanisms
|
|
109
|
-
- Minimal file I/O operations
|
|
110
|
-
|
|
111
|
-
## Future Considerations
|
|
112
|
-
|
|
113
|
-
1. **Potential Enhancements**:
|
|
114
|
-
- Parallel processing support
|
|
115
|
-
- Additional data format support
|
|
116
|
-
- Enhanced reporting capabilities
|
|
117
|
-
- Caching mechanisms
|
|
118
|
-
|
|
119
|
-
2. **Scalability**:
|
|
120
|
-
- Support for distributed processing
|
|
121
|
-
- Cloud storage integration
|
|
122
|
-
- Batch processing capabilities
|
|
123
|
-
|
|
124
|
-
## Testing Strategy
|
|
125
|
-
|
|
126
|
-
1. **Unit Tests**:
|
|
127
|
-
- Individual component testing
|
|
128
|
-
- Edge case coverage
|
|
129
|
-
- Performance benchmarks
|
|
130
|
-
|
|
131
|
-
2. **Integration Tests**:
|
|
132
|
-
- Component interaction testing
|
|
133
|
-
- End-to-end workflows
|
|
134
|
-
- Error scenario coverage
|