csv-utils 0.3.25 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +53 -0
- data/.rubocop.yml +81 -0
- data/ARCHITECTURE.md +154 -0
- data/CLAUDE.md +63 -0
- data/Gemfile +2 -1
- data/Gemfile.lock +5 -0
- data/README.md +238 -16
- data/bin/csv-diff +3 -3
- data/bin/csv-duplicate-finder +1 -1
- data/bin/csv-grep +3 -3
- data/bin/csv-readline +4 -5
- data/bin/csv-splitter +1 -1
- data/bin/csv-validator +38 -36
- data/csv-utils.gemspec +6 -5
- data/lib/csv-utils.rb +3 -0
- data/lib/csv_utils/csv_compare.rb +77 -71
- data/lib/csv_utils/csv_extender.rb +45 -41
- data/lib/csv_utils/csv_iterator.rb +90 -75
- data/lib/csv_utils/csv_options.rb +11 -11
- data/lib/csv_utils/csv_report.rb +5 -2
- data/lib/csv_utils/csv_row.rb +3 -1
- data/lib/csv_utils/csv_row_matcher.rb +34 -0
- data/lib/csv_utils/csv_sort.rb +110 -96
- data/lib/csv_utils/csv_transformer.rb +95 -92
- data/lib/csv_utils/csv_wrapper.rb +40 -36
- metadata +13 -6
- data/docs/ARCHITECTURE.md +0 -134
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 151a25f6d4d171b169ac194665f217151a96ef43aede166c6a62ec9f2b259765
|
|
4
|
+
data.tar.gz: b223b26ea97a29f58f532e6b737d481c4f144ce2e202592f8057300790156ed5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7afee881db16b4afce9cceb812d4994ee66bcf172db147eaf73911961983cd0d73f9c553ca549880793e1c9a2bb2904e8444f2618ff30d040808d1b4e5383cf7
|
|
7
|
+
data.tar.gz: 4811b6ff72153107bf029f36feef65ec68646e0365d3b3987bd3231b9842b20c8c27c72216ca25350bd469887315b139737eedeba3b7c7fa17d240c729ae9a5a
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: RuboCop
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Ruby
|
|
17
|
+
uses: ruby/setup-ruby@v1
|
|
18
|
+
with:
|
|
19
|
+
ruby-version: '3.4'
|
|
20
|
+
bundler-cache: true
|
|
21
|
+
|
|
22
|
+
- name: Run RuboCop
|
|
23
|
+
run: bundle exec rubocop --format github
|
|
24
|
+
|
|
25
|
+
test:
|
|
26
|
+
name: Tests (Ruby ${{ matrix.ruby }})
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
strategy:
|
|
29
|
+
fail-fast: false
|
|
30
|
+
matrix:
|
|
31
|
+
ruby: ['3.2', '3.3', '3.4']
|
|
32
|
+
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/checkout@v4
|
|
35
|
+
|
|
36
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
|
37
|
+
uses: ruby/setup-ruby@v1
|
|
38
|
+
with:
|
|
39
|
+
ruby-version: ${{ matrix.ruby }}
|
|
40
|
+
bundler-cache: true
|
|
41
|
+
|
|
42
|
+
- name: Run tests
|
|
43
|
+
run: bundle exec rspec
|
|
44
|
+
|
|
45
|
+
- name: Upload coverage to Codecov
|
|
46
|
+
if: matrix.ruby == '3.4'
|
|
47
|
+
uses: codecov/codecov-action@v4
|
|
48
|
+
with:
|
|
49
|
+
files: coverage/coverage.xml
|
|
50
|
+
fail_ci_if_error: false
|
|
51
|
+
verbose: true
|
|
52
|
+
env:
|
|
53
|
+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.2
|
|
3
|
+
NewCops: enable
|
|
4
|
+
SuggestExtensions: false
|
|
5
|
+
Exclude:
|
|
6
|
+
- 'bin/**/*'
|
|
7
|
+
- 'vendor/**/*'
|
|
8
|
+
- 'coverage/**/*'
|
|
9
|
+
|
|
10
|
+
# Relaxed metrics for existing codebase
|
|
11
|
+
Metrics/MethodLength:
|
|
12
|
+
Max: 35
|
|
13
|
+
|
|
14
|
+
Metrics/AbcSize:
|
|
15
|
+
Max: 40
|
|
16
|
+
|
|
17
|
+
Metrics/ClassLength:
|
|
18
|
+
Max: 200
|
|
19
|
+
|
|
20
|
+
Metrics/CyclomaticComplexity:
|
|
21
|
+
Max: 15
|
|
22
|
+
|
|
23
|
+
Metrics/PerceivedComplexity:
|
|
24
|
+
Max: 15
|
|
25
|
+
|
|
26
|
+
Metrics/BlockLength:
|
|
27
|
+
Exclude:
|
|
28
|
+
- 'spec/**/*'
|
|
29
|
+
- '*.gemspec'
|
|
30
|
+
|
|
31
|
+
# Style preferences
|
|
32
|
+
Style/Documentation:
|
|
33
|
+
Enabled: false
|
|
34
|
+
|
|
35
|
+
Style/FrozenStringLiteralComment:
|
|
36
|
+
EnforcedStyle: always
|
|
37
|
+
|
|
38
|
+
Layout/LineLength:
|
|
39
|
+
Max: 130
|
|
40
|
+
Exclude:
|
|
41
|
+
- 'spec/**/*'
|
|
42
|
+
|
|
43
|
+
# File naming - allow hyphenated gem name
|
|
44
|
+
Naming/FileName:
|
|
45
|
+
Exclude:
|
|
46
|
+
- 'lib/csv-utils.rb'
|
|
47
|
+
|
|
48
|
+
# Allow set_ prefix for transformer DSL methods
|
|
49
|
+
Naming/AccessorMethodName:
|
|
50
|
+
Exclude:
|
|
51
|
+
- 'lib/csv_utils/csv_transformer.rb'
|
|
52
|
+
|
|
53
|
+
# Allow Kernel#open for pipe support (intentional design)
|
|
54
|
+
Security/Open:
|
|
55
|
+
Exclude:
|
|
56
|
+
- 'lib/csv_utils/csv_wrapper.rb'
|
|
57
|
+
|
|
58
|
+
# Duplicate branches are intentional in comparison logic
|
|
59
|
+
Lint/DuplicateBranch:
|
|
60
|
+
Exclude:
|
|
61
|
+
- 'lib/csv_utils/csv_compare.rb'
|
|
62
|
+
- 'lib/csv_utils/csv_sort.rb'
|
|
63
|
+
|
|
64
|
+
# Allow empty blocks in specs (used for testing iteration behavior)
|
|
65
|
+
Lint/EmptyBlock:
|
|
66
|
+
Exclude:
|
|
67
|
+
- 'spec/**/*'
|
|
68
|
+
|
|
69
|
+
# Style relaxations for existing code patterns
|
|
70
|
+
Style/OptionalBooleanParameter:
|
|
71
|
+
Enabled: false
|
|
72
|
+
|
|
73
|
+
Style/StringConcatenation:
|
|
74
|
+
Enabled: false
|
|
75
|
+
|
|
76
|
+
Style/FormatStringToken:
|
|
77
|
+
Enabled: false
|
|
78
|
+
|
|
79
|
+
# Gemspec settings
|
|
80
|
+
Gemspec/RequiredRubyVersion:
|
|
81
|
+
Enabled: false
|
data/ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
This document describes the internal architecture of the csv-utils gem.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
csv-utils is a Ruby gem providing utilities for manipulating, debugging, and processing CSV files. The library emphasizes handling malformed CSVs and large file processing through streaming and batch operations.
|
|
8
|
+
|
|
9
|
+
## Core Design Principles
|
|
10
|
+
|
|
11
|
+
1. **Streaming Over Loading** - Files are processed row-by-row rather than loading entire files into memory
|
|
12
|
+
2. **Resource Management** - CSVWrapper ensures proper file handle lifecycle management
|
|
13
|
+
3. **Batch Processing** - Large operations support configurable batch sizes to balance memory and performance
|
|
14
|
+
4. **BOM Handling** - All readers strip UTF-8/16/32 byte order marks from headers
|
|
15
|
+
|
|
16
|
+
## Component Architecture
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
20
|
+
│ CSVUtils Module │
|
|
21
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
22
|
+
│ Detection Layer │
|
|
23
|
+
│ ┌─────────────┐ │
|
|
24
|
+
│ │ CSVOptions │ Auto-detects separators, encoding, BOM │
|
|
25
|
+
│ └─────────────┘ │
|
|
26
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
27
|
+
│ I/O Layer │
|
|
28
|
+
│ ┌─────────────┐ ┌──────────────┐ │
|
|
29
|
+
│ │ CSVWrapper │ │ CSVIterator │ Enumerable, RowWrapper │
|
|
30
|
+
│ └─────────────┘ └──────────────┘ │
|
|
31
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
32
|
+
│ Processing Layer │
|
|
33
|
+
│ ┌───────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
34
|
+
│ │ CSVTransformer│ │ CSVExtender │ │ CSVSort │ │
|
|
35
|
+
│ │ (pipeline) │ │ (append) │ │ (merge sort)│ │
|
|
36
|
+
│ └───────────────┘ └─────────────┘ └─────────────┘ │
|
|
37
|
+
├─────────────────────────────────────────────────────────────────┤
|
|
38
|
+
│ Analysis Layer │
|
|
39
|
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
40
|
+
│ │ CSVCompare │ │ CSVReport │ │ CSVRow │ │
|
|
41
|
+
│ │ (diff) │ │ (generate) │ │ (mixin) │ │
|
|
42
|
+
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
|
43
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Key Components
|
|
47
|
+
|
|
48
|
+
### CSVOptions (Detection)
|
|
49
|
+
|
|
50
|
+
Auto-detects CSV file properties by reading the first line:
|
|
51
|
+
- **Column separators**: `\x02`, `\t`, `|`, `,` (checked in order)
|
|
52
|
+
- **Row separators**: `\r\n`, `\n`, `\r`
|
|
53
|
+
- **Byte order marks**: UTF-8, UTF-16, UTF-32
|
|
54
|
+
- **Encoding**: Derived from BOM or defaults to UTF-8
|
|
55
|
+
|
|
56
|
+
### CSVWrapper (I/O)
|
|
57
|
+
|
|
58
|
+
Resource-safe wrapper around Ruby's CSV class:
|
|
59
|
+
- Tracks whether it opened the file (vs receiving an existing handle)
|
|
60
|
+
- Only closes files it opened (`@close_when_done`)
|
|
61
|
+
- Provides uniform interface for both file paths and CSV objects
|
|
62
|
+
|
|
63
|
+
### CSVIterator (I/O)
|
|
64
|
+
|
|
65
|
+
Enumerable wrapper for CSV reading:
|
|
66
|
+
- **RowWrapper**: Hash subclass that preserves line numbers for error reporting
|
|
67
|
+
- `each_batch(size)`: Yields rows in configurable batches
|
|
68
|
+
- `to_hash(key, value)`: Builds lookup hash from CSV columns
|
|
69
|
+
- Tracks `prev_row` for error context
|
|
70
|
+
|
|
71
|
+
### CSVSort (Processing)
|
|
72
|
+
|
|
73
|
+
External merge sort for large files:
|
|
74
|
+
1. **Chunking**: Reads file in batches (default 100,000 rows)
|
|
75
|
+
2. **Sort chunks**: Each batch sorted in memory, written to `.part.N` temp files
|
|
76
|
+
3. **Merge**: Temp files merged pairwise into `.merge.N` files until one remains
|
|
77
|
+
4. **Cleanup**: Temp files deleted, final file moved to destination
|
|
78
|
+
|
|
79
|
+
### CSVTransformer (Processing)
|
|
80
|
+
|
|
81
|
+
Chainable transformation pipeline:
|
|
82
|
+
- `select(&block)` / `reject(&block)` - Filter rows
|
|
83
|
+
- `map(new_headers, &block)` - Transform rows
|
|
84
|
+
- `append(headers, &block)` - Add columns
|
|
85
|
+
- `additional_data(&block)` - Compute batch-level data accessible to other steps
|
|
86
|
+
- `each(&block)` - Side effects without modification
|
|
87
|
+
- Processes in batches (default 10,000 rows)
|
|
88
|
+
|
|
89
|
+
### CSVExtender (Processing)
|
|
90
|
+
|
|
91
|
+
Appends columns to existing CSV:
|
|
92
|
+
- `append(headers)` - Row-by-row column addition
|
|
93
|
+
- `append_in_batches(headers, size)` - Batch processing for external lookups
|
|
94
|
+
|
|
95
|
+
### CSVCompare (Analysis)
|
|
96
|
+
|
|
97
|
+
Compares two **pre-sorted** CSV files:
|
|
98
|
+
- Yields `:create`, `:update`, `:delete` actions
|
|
99
|
+
- Requires a comparison proc for row identity
|
|
100
|
+
- Optional `update_comparison_columns` to detect changes (e.g., `updated_at`)
|
|
101
|
+
- Both files must be sorted by the same key columns
|
|
102
|
+
|
|
103
|
+
### CSVReport (Analysis)
|
|
104
|
+
|
|
105
|
+
Builds CSV output from objects:
|
|
106
|
+
- Accepts file path or existing CSV object
|
|
107
|
+
- Block-based generation with automatic close
|
|
108
|
+
- Works with CSVRow-enabled objects
|
|
109
|
+
|
|
110
|
+
### CSVRow (Mixin)
|
|
111
|
+
|
|
112
|
+
Module for defining CSV-serializable objects:
|
|
113
|
+
- `csv_column(name, options, &block)` - Define columns declaratively
|
|
114
|
+
- Uses `inheritance-helper` for inherited column definitions
|
|
115
|
+
- Columns can reference methods or use custom procs
|
|
116
|
+
|
|
117
|
+
## CLI Tools
|
|
118
|
+
|
|
119
|
+
Standalone executables for CSV debugging:
|
|
120
|
+
|
|
121
|
+
| Tool | Purpose |
|
|
122
|
+
|------|---------|
|
|
123
|
+
| `csv-find-error` | Locates malformed CSV errors, shows context |
|
|
124
|
+
| `csv-readline` | Reads specific line numbers |
|
|
125
|
+
| `csv-validator` | Validates CSV structure |
|
|
126
|
+
| `csv-diff` | Compares two CSV files |
|
|
127
|
+
| `csv-grep` | Searches within CSV content |
|
|
128
|
+
| `csv-splitter` | Splits large files into parts |
|
|
129
|
+
| `csv-explorer` | Interactive CSV exploration |
|
|
130
|
+
| `csv-duplicate-finder` | Identifies duplicate rows |
|
|
131
|
+
| `csv-change-eol` | Converts line endings |
|
|
132
|
+
|
|
133
|
+
## Data Flow Patterns
|
|
134
|
+
|
|
135
|
+
### Comparison Flow (requires pre-sorting)
|
|
136
|
+
```
|
|
137
|
+
primary.csv ──┐
|
|
138
|
+
├── CSVCompare ──> :create/:update/:delete actions
|
|
139
|
+
secondary.csv─┘
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Transformation Flow
|
|
143
|
+
```
|
|
144
|
+
input.csv ──> CSVTransformer ──[select]──[map]──[append]──> output.csv
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Sort Flow (external merge sort)
|
|
148
|
+
```
|
|
149
|
+
large.csv ──> [chunk & sort] ──> .part.0, .part.1, ...
|
|
150
|
+
│
|
|
151
|
+
[pairwise merge] ──┘
|
|
152
|
+
│
|
|
153
|
+
sorted.csv
|
|
154
|
+
```
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Build and Test Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
bundle install # Install dependencies
|
|
9
|
+
bundle exec rspec # Run all tests
|
|
10
|
+
bundle exec rspec spec/csv_utils/csv_compare_spec.rb # Run single test file
|
|
11
|
+
bundle exec rubocop # Run linter
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Architecture
|
|
15
|
+
|
|
16
|
+
This is a Ruby gem (`csv-utils`) providing utilities for manipulating and debugging CSV files, particularly malformed ones.
|
|
17
|
+
|
|
18
|
+
### Core Classes (lib/csv_utils/)
|
|
19
|
+
|
|
20
|
+
- **CSVOptions** - Auto-detects CSV file properties: column separator, row separator, byte order marks, and encoding. Handles various separators (`\x02`, `\t`, `|`, `,`) and BOMs (UTF-8, UTF-16, UTF-32).
|
|
21
|
+
- **CSVWrapper** - Resource-safe wrapper around Ruby's CSV class that manages file handle lifecycle.
|
|
22
|
+
- **CSVCompare** - Compares two sorted CSV files, yielding `:create`, `:update`, or `:delete` actions.
|
|
23
|
+
- **CSVSort** - Sorts large CSV files using an external merge sort; row order is determined by a caller-supplied comparison block.
|
|
24
|
+
- **CSVTransformer** - Applies row transformations with block-based processing.
|
|
25
|
+
- **CSVExtender** - Extends CSV files with additional columns/data.
|
|
26
|
+
- **CSVReport** - Generates CSV reports from Ruby objects (works with classes that include CSVRow).
|
|
27
|
+
- **CSVIterator** - Efficient CSV iteration.
|
|
28
|
+
- **CSVRow** - Mixin for defining CSV-serializable objects with declarative `csv_column` definitions.
|
|
29
|
+
|
|
30
|
+
### CLI Tools (bin/)
|
|
31
|
+
|
|
32
|
+
Standalone utilities for CSV debugging and manipulation:
|
|
33
|
+
- `csv-find-error` - Locates malformed CSV errors
|
|
34
|
+
- `csv-readline` - Reads specific lines from CSV
|
|
35
|
+
- `csv-validator` - Validates CSV structure
|
|
36
|
+
- `csv-diff` - Compares CSV files
|
|
37
|
+
- `csv-grep` - Searches within CSV files
|
|
38
|
+
- `csv-splitter` - Splits large CSV files
|
|
39
|
+
- `csv-explorer` - Interactive CSV exploration
|
|
40
|
+
- `csv-duplicate-finder` - Finds duplicate rows
|
|
41
|
+
- `csv-change-eol` - Converts line endings
|
|
42
|
+
|
|
43
|
+
### Dependencies
|
|
44
|
+
|
|
45
|
+
- `csv` - Ruby's standard CSV library
|
|
46
|
+
- `inheritance-helper` - Class inheritance utilities
|
|
47
|
+
|
|
48
|
+
## Code Commits
|
|
49
|
+
|
|
50
|
+
Format commit messages using the Angular commit convention:
|
|
51
|
+
```
|
|
52
|
+
<type>(<scope>): <short summary>
|
|
53
|
+
```
|
|
54
|
+
- **type**: build|ci|docs|feat|fix|perf|refactor|test
|
|
55
|
+
- **scope**: The feature or component of the service we're working on
|
|
56
|
+
- **summary**: Summary in present tense. Not capitalized. No period at the end.
|
|
57
|
+
|
|
58
|
+
## Documentation Maintenance
|
|
59
|
+
|
|
60
|
+
When modifying the codebase, keep documentation in sync:
|
|
61
|
+
- **ARCHITECTURE.md** - Update when adding/removing classes, changing component relationships, or altering data flow patterns
|
|
62
|
+
- **README.md** - Update when adding new features, changing public APIs, or modifying usage examples
|
|
63
|
+
- **Code comments** - Update inline documentation when changing method signatures or behavior
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -18,6 +18,7 @@ GEM
|
|
|
18
18
|
rainbow (3.1.1)
|
|
19
19
|
rake (13.2.1)
|
|
20
20
|
regexp_parser (2.10.0)
|
|
21
|
+
rexml (3.4.4)
|
|
21
22
|
rspec (3.13.0)
|
|
22
23
|
rspec-core (~> 3.13.0)
|
|
23
24
|
rspec-expectations (~> 3.13.0)
|
|
@@ -50,6 +51,9 @@ GEM
|
|
|
50
51
|
docile (~> 1.1)
|
|
51
52
|
simplecov-html (~> 0.11)
|
|
52
53
|
simplecov_json_formatter (~> 0.1)
|
|
54
|
+
simplecov-cobertura (3.1.0)
|
|
55
|
+
rexml
|
|
56
|
+
simplecov (~> 0.19)
|
|
53
57
|
simplecov-html (0.13.1)
|
|
54
58
|
simplecov_json_formatter (0.1.4)
|
|
55
59
|
unicode-display_width (3.1.4)
|
|
@@ -67,6 +71,7 @@ DEPENDENCIES
|
|
|
67
71
|
rspec
|
|
68
72
|
rubocop
|
|
69
73
|
simplecov
|
|
74
|
+
simplecov-cobertura
|
|
70
75
|
|
|
71
76
|
BUNDLED WITH
|
|
72
77
|
2.6.2
|
data/README.md
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
1
|
# CSV Utils
|
|
2
2
|
|
|
3
|
+
[![CI](https://github.com/dougyouch/csv-utils/actions/workflows/ci.yml/badge.svg)](https://github.com/dougyouch/csv-utils/actions/workflows/ci.yml)
|
|
4
|
+
[![codecov](https://codecov.io/gh/dougyouch/csv-utils/graph/badge.svg)](https://codecov.io/gh/dougyouch/csv-utils)
|
|
5
|
+
|
|
3
6
|
A Ruby library providing a comprehensive set of utilities for manipulating and processing CSV files. This library offers a robust set of tools for comparing, transforming, sorting, and managing CSV data efficiently.
|
|
4
7
|
|
|
5
8
|
## Features
|
|
6
9
|
|
|
7
10
|
- **CSV Comparison**: Compare two CSV files and identify differences (creates, updates, and deletes)
|
|
8
|
-
- **CSV Transformation**: Transform CSV data with
|
|
9
|
-
- **CSV Sorting**: Sort CSV files
|
|
10
|
-
- **CSV Reporting**: Generate reports from
|
|
11
|
-
- **CSV
|
|
12
|
-
- **CSV
|
|
13
|
-
- **CSV
|
|
11
|
+
- **CSV Transformation**: Transform CSV data with a chainable pipeline
|
|
12
|
+
- **CSV Sorting**: Sort large CSV files using external merge sort
|
|
13
|
+
- **CSV Reporting**: Generate CSV reports from Ruby objects
|
|
14
|
+
- **CSV Row**: Mixin for defining CSV-serializable objects
|
|
15
|
+
- **CSV Row Matcher**: Filter CSV rows using regex patterns across columns
|
|
16
|
+
- **CSV Iteration**: Efficient iteration over CSV files with batch support
|
|
17
|
+
- **CSV Extension**: Extend CSV files with additional columns
|
|
18
|
+
- **CSV Options**: Auto-detect CSV file properties (separators, encoding, BOM)
|
|
19
|
+
- **CLI Tools**: Command-line utilities for CSV debugging and manipulation
|
|
14
20
|
|
|
15
21
|
## Installation
|
|
16
22
|
|
|
@@ -36,8 +42,10 @@ $ gem install csv-utils
|
|
|
36
42
|
|
|
37
43
|
### Comparing CSV Files
|
|
38
44
|
|
|
45
|
+
Compare two sorted CSV files to identify creates, updates, and deletes:
|
|
46
|
+
|
|
39
47
|
```ruby
|
|
40
|
-
require '
|
|
48
|
+
require 'csv-utils'
|
|
41
49
|
|
|
42
50
|
comparator = CSVUtils::CSVCompare.new('primary.csv', ['updated_at']) do |src, dest|
|
|
43
51
|
src['id'] <=> dest['id']
|
|
@@ -55,31 +63,245 @@ comparator.compare('secondary.csv') do |action, record|
|
|
|
55
63
|
end
|
|
56
64
|
```
|
|
57
65
|
|
|
66
|
+
**Note**: Both CSV files must be sorted by the same key columns for comparison to work correctly.
|
|
67
|
+
|
|
58
68
|
### Sorting CSV Files
|
|
59
69
|
|
|
70
|
+
Sort large CSV files using external merge sort:
|
|
71
|
+
|
|
60
72
|
```ruby
|
|
61
|
-
|
|
62
|
-
|
|
73
|
+
require 'csv-utils'
|
|
74
|
+
|
|
75
|
+
sorter = CSVUtils::CSVSort.new('input.csv', 'output.csv', true) # true = has headers
|
|
76
|
+
sorter.sort(100_000) { |a, b| a.first.to_i <=> b.first.to_i } # batch size, comparison block
|
|
63
77
|
```
|
|
64
78
|
|
|
65
79
|
### Transforming CSV Data
|
|
66
80
|
|
|
81
|
+
Transform CSV data using a chainable pipeline:
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
require 'csv-utils'
|
|
85
|
+
|
|
86
|
+
CSVUtils::CSVTransformer.new('input.csv', 'output.csv')
|
|
87
|
+
.read_headers
|
|
88
|
+
.select { |row, headers, _| row[0].to_i > 100 } # filter rows
|
|
89
|
+
.map(['ID', 'Name']) { |row, headers, _| [row[0], row[1].upcase] } # transform rows
|
|
90
|
+
.append(['Email']) { |row, headers, _| ["#{row[1].downcase}@example.com"] }
|
|
91
|
+
.process(10_000) # batch size
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Available pipeline methods:
|
|
95
|
+
- `select { |row, headers, additional_data| }` - Keep rows where block returns true
|
|
96
|
+
- `reject { |row, headers, additional_data| }` - Remove rows where block returns true
|
|
97
|
+
- `map(new_headers) { |row, headers, additional_data| }` - Transform rows
|
|
98
|
+
- `append(additional_headers) { |row, headers, additional_data| }` - Add columns
|
|
99
|
+
- `additional_data { |batch, headers| }` - Compute batch-level data for use in other steps
|
|
100
|
+
- `each { |row, headers, additional_data| }` - Side effects without modification
|
|
101
|
+
- `set_headers(headers)` - Override output headers
|
|
102
|
+
|
|
103
|
+
### CSV Row and Report
|
|
104
|
+
|
|
105
|
+
Define CSV-serializable objects and generate reports:
|
|
106
|
+
|
|
67
107
|
```ruby
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
108
|
+
require 'csv-utils'
|
|
109
|
+
|
|
110
|
+
class User
|
|
111
|
+
include CSVUtils::CSVRow
|
|
112
|
+
|
|
113
|
+
attr_accessor :id, :name, :email
|
|
114
|
+
|
|
115
|
+
csv_column :id, header: 'ID'
|
|
116
|
+
csv_column :name
|
|
117
|
+
csv_column(:email) { email.downcase }
|
|
118
|
+
|
|
119
|
+
def initialize(id, name, email)
|
|
120
|
+
@id = id
|
|
121
|
+
@name = name
|
|
122
|
+
@email = email
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
users = [
|
|
127
|
+
User.new(1, 'Alice', 'ALICE@example.com'),
|
|
128
|
+
User.new(2, 'Bob', 'BOB@example.com')
|
|
129
|
+
]
|
|
130
|
+
|
|
131
|
+
# Generate CSV report
|
|
132
|
+
CSVUtils::CSVReport.new('users.csv', User) do |report|
|
|
133
|
+
users.each { |user| report << user }
|
|
73
134
|
end
|
|
74
135
|
```
|
|
75
136
|
|
|
137
|
+
The `csv_column` method accepts:
|
|
138
|
+
- A symbol referencing a method: `csv_column :name`
|
|
139
|
+
- A custom header: `csv_column :id, header: 'ID'`
|
|
140
|
+
- A block for computed values: `csv_column(:email) { email.downcase }`
|
|
141
|
+
- A proc: `csv_column :count, proc: Proc.new { data[:count] }`
|
|
142
|
+
|
|
143
|
+
#### Generating Reports from ActiveRecord Models
|
|
144
|
+
|
|
145
|
+
A powerful pattern is to subclass an ActiveRecord model and include `CSVRow` in the subclass, so that reports can be generated directly from database records:
|
|
146
|
+
|
|
147
|
+
```ruby
|
|
148
|
+
require 'csv-utils'
|
|
149
|
+
|
|
150
|
+
class UserCSVRow < User
|
|
151
|
+
include CSVUtils::CSVRow
|
|
152
|
+
|
|
153
|
+
csv_column :id
|
|
154
|
+
csv_column :name
|
|
155
|
+
csv_column :email
|
|
156
|
+
csv_column :num_orders # computed column
|
|
157
|
+
csv_column :total_revenue # computed column
|
|
158
|
+
|
|
159
|
+
def num_orders
|
|
160
|
+
orders.count
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def total_revenue
|
|
164
|
+
orders.sum(:amount)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# free up memory during large iterations
|
|
168
|
+
def clear!
|
|
169
|
+
@association_cache = {}
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Generate report using ActiveRecord scopes
|
|
174
|
+
CSVUtils::CSVReport.new('user_report.csv', UserCSVRow) do |report|
|
|
175
|
+
UserCSVRow.where(active: true).find_each do |user|
|
|
176
|
+
report << user
|
|
177
|
+
user.clear!
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
This pattern provides:
|
|
183
|
+
- **Inherited attributes**: All model columns available without redefinition
|
|
184
|
+
- **Association access**: Query related tables for computed columns
|
|
185
|
+
- **ActiveRecord scopes**: Use `.where`, `.includes`, `.find_each` directly
|
|
186
|
+
- **Memory efficiency**: The `clear!` method frees the association cache during iteration
|
|
187
|
+
|
|
188
|
+
### Iterating CSV Files
|
|
189
|
+
|
|
190
|
+
Efficiently iterate over CSV files:
|
|
191
|
+
|
|
192
|
+
```ruby
|
|
193
|
+
require 'csv-utils'
|
|
194
|
+
|
|
195
|
+
iterator = CSVUtils::CSVIterator.new('data.csv')
|
|
196
|
+
|
|
197
|
+
# Iterate row by row
|
|
198
|
+
iterator.each do |row|
|
|
199
|
+
puts "Line #{row.lineno}: #{row['name']}"
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Process in batches
|
|
203
|
+
iterator.each_batch(1_000) do |batch|
|
|
204
|
+
# Process batch of rows
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
# Build a lookup hash
|
|
208
|
+
lookup = iterator.to_hash('id', 'name') # { 'id_value' => 'name_value', ... }
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Matching CSV Rows
|
|
212
|
+
|
|
213
|
+
Filter CSV rows using regex patterns:
|
|
214
|
+
|
|
215
|
+
```ruby
|
|
216
|
+
require 'csv-utils'
|
|
217
|
+
|
|
218
|
+
# Match against all columns
|
|
219
|
+
matcher = CSVUtils::CSVRowMatcher.new(/error/i)
|
|
220
|
+
|
|
221
|
+
# Or match only specific columns
|
|
222
|
+
matcher = CSVUtils::CSVRowMatcher.new(/error/i, ['status', 'message'])
|
|
223
|
+
|
|
224
|
+
# Use with iteration
|
|
225
|
+
iterator = CSVUtils::CSVIterator.new('logs.csv')
|
|
226
|
+
error_rows = iterator.select(&matcher)
|
|
227
|
+
|
|
228
|
+
# Use directly
|
|
229
|
+
row = { 'id' => '123', 'status' => 'Error', 'message' => 'Connection failed' }
|
|
230
|
+
matcher.match?(row) # => true
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
The matcher can be used with any Enumerable method via `to_proc`:
|
|
234
|
+
|
|
235
|
+
```ruby
|
|
236
|
+
rows.select(&matcher) # rows matching the pattern
|
|
237
|
+
rows.reject(&matcher) # rows not matching the pattern
|
|
238
|
+
rows.find(&matcher) # first matching row
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Extending CSV Files
|
|
242
|
+
|
|
243
|
+
Add columns to an existing CSV:
|
|
244
|
+
|
|
245
|
+
```ruby
|
|
246
|
+
require 'csv-utils'
|
|
247
|
+
|
|
248
|
+
extender = CSVUtils::CSVExtender.new('input.csv', 'output.csv')
|
|
249
|
+
|
|
250
|
+
# Row by row
|
|
251
|
+
extender.append(['new_column']) do |row, headers|
|
|
252
|
+
[row[0].upcase] # return array of new column values
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
# Or in batches (useful for external lookups)
|
|
256
|
+
extender.append_in_batches(['status'], 1_000) do |batch, headers|
|
|
257
|
+
# Return array of arrays, one per row in batch
|
|
258
|
+
batch.map { |row| ['active'] }
|
|
259
|
+
end
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### Auto-detecting CSV Options
|
|
263
|
+
|
|
264
|
+
Detect CSV file properties automatically:
|
|
265
|
+
|
|
266
|
+
```ruby
|
|
267
|
+
require 'csv-utils'
|
|
268
|
+
|
|
269
|
+
options = CSVUtils::CSVOptions.new('data.csv')
|
|
270
|
+
|
|
271
|
+
options.valid? # true if separators detected
|
|
272
|
+
options.col_separator # detected column separator
|
|
273
|
+
options.row_separator # detected row separator
|
|
274
|
+
options.encoding # detected encoding (UTF-8, UTF-16, UTF-32)
|
|
275
|
+
options.columns # number of columns
|
|
276
|
+
options.byte_order_mark # BOM if present
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
Supported column separators: `\x02`, `\t`, `|`, `,`
|
|
280
|
+
Supported row separators: `\r\n`, `\n`, `\r`
|
|
281
|
+
|
|
282
|
+
## CLI Tools
|
|
283
|
+
|
|
284
|
+
The gem includes command-line utilities for CSV debugging:
|
|
285
|
+
|
|
286
|
+
| Command | Description |
|
|
287
|
+
|---------|-------------|
|
|
288
|
+
| `csv-find-error` | Locate malformed CSV errors with context |
|
|
289
|
+
| `csv-readline` | Read specific lines from a CSV file |
|
|
290
|
+
| `csv-validator` | Validate CSV structure |
|
|
291
|
+
| `csv-diff` | Compare two CSV files |
|
|
292
|
+
| `csv-grep` | Search within CSV content |
|
|
293
|
+
| `csv-splitter` | Split large CSV files into parts |
|
|
294
|
+
| `csv-explorer` | Interactive CSV exploration |
|
|
295
|
+
| `csv-duplicate-finder` | Find duplicate rows |
|
|
296
|
+
| `csv-change-eol` | Convert line endings |
|
|
297
|
+
|
|
76
298
|
## Development
|
|
77
299
|
|
|
78
|
-
After checking out the repo, run `
|
|
300
|
+
After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rspec` to run the tests.
|
|
79
301
|
|
|
80
302
|
## Contributing
|
|
81
303
|
|
|
82
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
304
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/dougyouch/csv-utils.
|
|
83
305
|
|
|
84
306
|
## License
|
|
85
307
|
|