csv-utils 0.3.24 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 56d81ad0c7a53bc4a738cc4f9643884fa301b6c2add83f9b961263dfdae1e896
4
- data.tar.gz: 5db7f72a3df30a2933ffdafa8599a7ce46ae763124b4c0d6d88b775f2238422b
3
+ metadata.gz: 151a25f6d4d171b169ac194665f217151a96ef43aede166c6a62ec9f2b259765
4
+ data.tar.gz: b223b26ea97a29f58f532e6b737d481c4f144ce2e202592f8057300790156ed5
5
5
  SHA512:
6
- metadata.gz: bf3878abd3a7f0c5dfbdcd9cdcee6773dc20d43308464c3d0a984ee2fe147c055ba64627043478f8099604f8824b3e8630eabfe8f91ffc562d54fabed98b0535
7
- data.tar.gz: 68feef79f89b7b415c40b37f9ac03b2feb6c800552af70c22006fa56a92d987a0753e0fc00b6449b8edca04463dfeaaefc4b647fbf46ce14227ccdde927a95dd
6
+ metadata.gz: 7afee881db16b4afce9cceb812d4994ee66bcf172db147eaf73911961983cd0d73f9c553ca549880793e1c9a2bb2904e8444f2618ff30d040808d1b4e5383cf7
7
+ data.tar.gz: 4811b6ff72153107bf029f36feef65ec68646e0365d3b3987bd3231b9842b20c8c27c72216ca25350bd469887315b139737eedeba3b7c7fa17d240c729ae9a5a
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [master]
6
+ pull_request:
7
+ branches: [master]
8
+
9
+ jobs:
10
+ lint:
11
+ name: RuboCop
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Ruby
17
+ uses: ruby/setup-ruby@v1
18
+ with:
19
+ ruby-version: '3.4'
20
+ bundler-cache: true
21
+
22
+ - name: Run RuboCop
23
+ run: bundle exec rubocop --format github
24
+
25
+ test:
26
+ name: Tests (Ruby ${{ matrix.ruby }})
27
+ runs-on: ubuntu-latest
28
+ strategy:
29
+ fail-fast: false
30
+ matrix:
31
+ ruby: ['3.2', '3.3', '3.4']
32
+
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+
36
+ - name: Set up Ruby ${{ matrix.ruby }}
37
+ uses: ruby/setup-ruby@v1
38
+ with:
39
+ ruby-version: ${{ matrix.ruby }}
40
+ bundler-cache: true
41
+
42
+ - name: Run tests
43
+ run: bundle exec rspec
44
+
45
+ - name: Upload coverage to Codecov
46
+ if: matrix.ruby == '3.4'
47
+ uses: codecov/codecov-action@v4
48
+ with:
49
+ files: coverage/coverage.xml
50
+ fail_ci_if_error: false
51
+ verbose: true
52
+ env:
53
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
data/.rubocop.yml ADDED
@@ -0,0 +1,81 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.2
3
+ NewCops: enable
4
+ SuggestExtensions: false
5
+ Exclude:
6
+ - 'bin/**/*'
7
+ - 'vendor/**/*'
8
+ - 'coverage/**/*'
9
+
10
+ # Relaxed metrics for existing codebase
11
+ Metrics/MethodLength:
12
+ Max: 35
13
+
14
+ Metrics/AbcSize:
15
+ Max: 40
16
+
17
+ Metrics/ClassLength:
18
+ Max: 200
19
+
20
+ Metrics/CyclomaticComplexity:
21
+ Max: 15
22
+
23
+ Metrics/PerceivedComplexity:
24
+ Max: 15
25
+
26
+ Metrics/BlockLength:
27
+ Exclude:
28
+ - 'spec/**/*'
29
+ - '*.gemspec'
30
+
31
+ # Style preferences
32
+ Style/Documentation:
33
+ Enabled: false
34
+
35
+ Style/FrozenStringLiteralComment:
36
+ EnforcedStyle: always
37
+
38
+ Layout/LineLength:
39
+ Max: 130
40
+ Exclude:
41
+ - 'spec/**/*'
42
+
43
+ # File naming - allow hyphenated gem name
44
+ Naming/FileName:
45
+ Exclude:
46
+ - 'lib/csv-utils.rb'
47
+
48
+ # Allow set_ prefix for transformer DSL methods
49
+ Naming/AccessorMethodName:
50
+ Exclude:
51
+ - 'lib/csv_utils/csv_transformer.rb'
52
+
53
+ # Allow Kernel#open for pipe support (intentional design)
54
+ Security/Open:
55
+ Exclude:
56
+ - 'lib/csv_utils/csv_wrapper.rb'
57
+
58
+ # Duplicate branches are intentional in comparison logic
59
+ Lint/DuplicateBranch:
60
+ Exclude:
61
+ - 'lib/csv_utils/csv_compare.rb'
62
+ - 'lib/csv_utils/csv_sort.rb'
63
+
64
+ # Allow empty blocks in specs (used for testing iteration behavior)
65
+ Lint/EmptyBlock:
66
+ Exclude:
67
+ - 'spec/**/*'
68
+
69
+ # Style relaxations for existing code patterns
70
+ Style/OptionalBooleanParameter:
71
+ Enabled: false
72
+
73
+ Style/StringConcatenation:
74
+ Enabled: false
75
+
76
+ Style/FormatStringToken:
77
+ Enabled: false
78
+
79
+ # Gemspec settings
80
+ Gemspec/RequiredRubyVersion:
81
+ Enabled: false
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.1.0
1
+ 3.4.2
data/ARCHITECTURE.md ADDED
@@ -0,0 +1,154 @@
1
+ # Architecture
2
+
3
+ This document describes the internal architecture of the csv-utils gem.
4
+
5
+ ## Overview
6
+
7
+ csv-utils is a Ruby gem providing utilities for manipulating, debugging, and processing CSV files. The library emphasizes handling malformed CSVs and large file processing through streaming and batch operations.
8
+
9
+ ## Core Design Principles
10
+
11
+ 1. **Streaming Over Loading** - Files are processed row-by-row rather than loading entire files into memory
12
+ 2. **Resource Management** - CSVWrapper ensures proper file handle lifecycle management
13
+ 3. **Batch Processing** - Large operations support configurable batch sizes to balance memory and performance
14
+ 4. **BOM Handling** - All readers strip UTF-8/16/32 byte order marks from headers
15
+
16
+ ## Component Architecture
17
+
18
+ ```
19
+ ┌─────────────────────────────────────────────────────────────────┐
20
+ │ CSVUtils Module │
21
+ ├─────────────────────────────────────────────────────────────────┤
22
+ │ Detection Layer │
23
+ │ ┌─────────────┐ │
24
+ │ │ CSVOptions │ Auto-detects separators, encoding, BOM │
25
+ │ └─────────────┘ │
26
+ ├─────────────────────────────────────────────────────────────────┤
27
+ │ I/O Layer │
28
+ │ ┌─────────────┐ ┌──────────────┐ │
29
+ │ │ CSVWrapper │ │ CSVIterator │ Enumerable, RowWrapper │
30
+ │ └─────────────┘ └──────────────┘ │
31
+ ├─────────────────────────────────────────────────────────────────┤
32
+ │ Processing Layer │
33
+ │ ┌───────────────┐ ┌─────────────┐ ┌─────────────┐ │
34
+ │ │ CSVTransformer│ │ CSVExtender │ │ CSVSort │ │
35
+ │ │ (pipeline) │ │ (append) │ │ (merge sort)│ │
36
+ │ └───────────────┘ └─────────────┘ └─────────────┘ │
37
+ ├─────────────────────────────────────────────────────────────────┤
38
+ │ Analysis Layer │
39
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
40
+ │ │ CSVCompare │ │ CSVReport │ │ CSVRow │ │
41
+ │ │ (diff) │ │ (generate) │ │ (mixin) │ │
42
+ │ └─────────────┘ └─────────────┘ └─────────────┘ │
43
+ └─────────────────────────────────────────────────────────────────┘
44
+ ```
45
+
46
+ ## Key Components
47
+
48
+ ### CSVOptions (Detection)
49
+
50
+ Auto-detects CSV file properties by reading the first line:
51
+ - **Column separators**: `\x02`, `\t`, `|`, `,` (checked in order)
52
+ - **Row separators**: `\r\n`, `\n`, `\r`
53
+ - **Byte order marks**: UTF-8, UTF-16, UTF-32
54
+ - **Encoding**: Derived from BOM or defaults to UTF-8
55
+
56
+ ### CSVWrapper (I/O)
57
+
58
+ Resource-safe wrapper around Ruby's CSV class:
59
+ - Tracks whether it opened the file (vs receiving an existing handle)
60
+ - Only closes files it opened (`@close_when_done`)
61
+ - Provides uniform interface for both file paths and CSV objects
62
+
63
+ ### CSVIterator (I/O)
64
+
65
+ Enumerable wrapper for CSV reading:
66
+ - **RowWrapper**: Hash subclass that preserves line numbers for error reporting
67
+ - `each_batch(size)`: Yields rows in configurable batches
68
+ - `to_hash(key, value)`: Builds lookup hash from CSV columns
69
+ - Tracks `prev_row` for error context
70
+
71
+ ### CSVSort (Processing)
72
+
73
+ External merge sort for large files:
74
+ 1. **Chunking**: Reads file in batches (default 100,000 rows)
75
+ 2. **Sort chunks**: Each batch sorted in memory, written to `.part.N` temp files
76
+ 3. **Merge**: Temp files merged pairwise into `.merge.N` files until one remains
77
+ 4. **Cleanup**: Temp files deleted, final file moved to destination
78
+
79
+ ### CSVTransformer (Processing)
80
+
81
+ Chainable transformation pipeline:
82
+ - `select(&block)` / `reject(&block)` - Filter rows
83
+ - `map(new_headers, &block)` - Transform rows
84
+ - `append(headers, &block)` - Add columns
85
+ - `additional_data(&block)` - Compute batch-level data accessible to other steps
86
+ - `each(&block)` - Side effects without modification
87
+ - Processes in batches (default 10,000 rows)
88
+
89
+ ### CSVExtender (Processing)
90
+
91
+ Appends columns to existing CSV:
92
+ - `append(headers)` - Row-by-row column addition
93
+ - `append_in_batches(headers, size)` - Batch processing for external lookups
94
+
95
+ ### CSVCompare (Analysis)
96
+
97
+ Compares two **pre-sorted** CSV files:
98
+ - Yields `:create`, `:update`, `:delete` actions
99
+ - Requires a comparison proc for row identity
100
+ - Optional `update_comparison_columns` to detect changes (e.g., `updated_at`)
101
+ - Both files must be sorted by the same key columns
102
+
103
+ ### CSVReport (Analysis)
104
+
105
+ Builds CSV output from objects:
106
+ - Accepts file path or existing CSV object
107
+ - Block-based generation with automatic close
108
+ - Works with CSVRow-enabled objects
109
+
110
+ ### CSVRow (Mixin)
111
+
112
+ Module for defining CSV-serializable objects:
113
+ - `csv_column(name, options, &block)` - Define columns declaratively
114
+ - Uses `inheritance-helper` for inherited column definitions
115
+ - Columns can reference methods or use custom procs
116
+
117
+ ## CLI Tools
118
+
119
+ Standalone executables for CSV debugging:
120
+
121
+ | Tool | Purpose |
122
+ |------|---------|
123
+ | `csv-find-error` | Locates malformed CSV errors, shows context |
124
+ | `csv-readline` | Reads specific line numbers |
125
+ | `csv-validator` | Validates CSV structure |
126
+ | `csv-diff` | Compares two CSV files |
127
+ | `csv-grep` | Searches within CSV content |
128
+ | `csv-splitter` | Splits large files into parts |
129
+ | `csv-explorer` | Interactive CSV exploration |
130
+ | `csv-duplicate-finder` | Identifies duplicate rows |
131
+ | `csv-change-eol` | Converts line endings |
132
+
133
+ ## Data Flow Patterns
134
+
135
+ ### Comparison Flow (requires pre-sorting)
136
+ ```
137
+ primary.csv ──┐
138
+ ├── CSVCompare ──> :create/:update/:delete actions
139
+ secondary.csv─┘
140
+ ```
141
+
142
+ ### Transformation Flow
143
+ ```
144
+ input.csv ──> CSVTransformer ──[select]──[map]──[append]──> output.csv
145
+ ```
146
+
147
+ ### Sort Flow (external merge sort)
148
+ ```
149
+ large.csv ──> [chunk & sort] ──> .part.0, .part.1, ...
150
+
151
+ [pairwise merge] ──┘
152
+
153
+ sorted.csv
154
+ ```
data/CLAUDE.md ADDED
@@ -0,0 +1,63 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Build and Test Commands
6
+
7
+ ```bash
8
+ bundle install # Install dependencies
9
+ bundle exec rspec # Run all tests
10
+ bundle exec rspec spec/csv_utils/csv_compare_spec.rb # Run single test file
11
+ bundle exec rubocop # Run linter
12
+ ```
13
+
14
+ ## Architecture
15
+
16
+ This is a Ruby gem (`csv-utils`) providing utilities for manipulating and debugging CSV files, particularly malformed ones.
17
+
18
+ ### Core Classes (lib/csv_utils/)
19
+
20
+ - **CSVOptions** - Auto-detects CSV file properties: column separator, row separator, byte order marks, and encoding. Handles various separators (`\x02`, `\t`, `|`, `,`) and BOMs (UTF-8, UTF-16, UTF-32).
21
+ - **CSVWrapper** - Resource-safe wrapper around Ruby's CSV class that manages file handle lifecycle.
22
+ - **CSVCompare** - Compares two sorted CSV files, yielding `:create`, `:update`, or `:delete` actions.
23
+ - **CSVSort** - Sorts CSV files by specified columns, using an external merge sort so large files are processed in batches.
24
+ - **CSVTransformer** - Applies row transformations with block-based processing.
25
+ - **CSVExtender** - Extends CSV files with additional columns/data.
26
+ - **CSVReport** - Builds CSV output from objects; accepts a file path or an existing CSV object.
27
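+ - **CSVIterator** - Enumerable wrapper for CSV reading; supports batch iteration (`each_batch`) and rows that preserve line numbers for error reporting.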
28
+ - **CSVRow** - Mixin for defining CSV-serializable objects with declarative `csv_column` definitions.
29
+
30
+ ### CLI Tools (bin/)
31
+
32
+ Standalone utilities for CSV debugging and manipulation:
33
+ - `csv-find-error` - Locates malformed CSV errors
34
+ - `csv-readline` - Reads specific lines from CSV
35
+ - `csv-validator` - Validates CSV structure
36
+ - `csv-diff` - Compares CSV files
37
+ - `csv-grep` - Searches within CSV files
38
+ - `csv-splitter` - Splits large CSV files
39
+ - `csv-explorer` - Interactive CSV exploration
40
+ - `csv-duplicate-finder` - Finds duplicate rows
41
+ - `csv-change-eol` - Converts line endings
42
+
43
+ ### Dependencies
44
+
45
+ - `csv` - Ruby's standard CSV library
46
+ - `inheritance-helper` - Class inheritance utilities
47
+
48
+ ## Code Commits
49
+
50
+ Format commit messages using the Angular commit message convention:
51
+ ```
52
+ <type>(<scope>): <short summary>
53
+ ```
54
+ - **type**: build|ci|docs|feat|fix|perf|refactor|test
55
+ - **scope**: The feature or component of the gem being changed
56
+ - **summary**: Summary in present tense. Not capitalized. No period at the end.
57
+
58
+ ## Documentation Maintenance
59
+
60
+ When modifying the codebase, keep documentation in sync:
61
+ - **ARCHITECTURE.md** - Update when adding/removing classes, changing component relationships, or altering data flow patterns
62
+ - **README.md** - Update when adding new features, changing public APIs, or modifying usage examples
63
+ - **Code comments** - Update inline documentation when changing method signatures or behavior
data/Gemfile CHANGED
@@ -2,14 +2,13 @@
2
2
 
3
3
  source 'http://rubygems.org'
4
4
 
5
+ gem 'csv'
5
6
  gem 'inheritance-helper'
6
7
 
7
8
  group :development do
8
9
  gem 'rake'
9
- gem 'rubocop'
10
- end
11
-
12
- group :spec do
13
10
  gem 'rspec'
11
+ gem 'rubocop'
14
12
  gem 'simplecov'
13
+ gem 'simplecov-cobertura'
15
14
  end
data/Gemfile.lock CHANGED
@@ -1,59 +1,77 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- ast (2.4.2)
5
- diff-lcs (1.5.0)
6
- docile (1.4.0)
4
+ ast (2.4.3)
5
+ csv (3.3.4)
6
+ diff-lcs (1.6.2)
7
+ docile (1.4.1)
7
8
  inheritance-helper (0.2.5)
8
- parallel (1.22.1)
9
- parser (3.1.1.0)
9
+ json (2.12.0)
10
+ language_server-protocol (3.17.0.5)
11
+ lint_roller (1.1.0)
12
+ parallel (1.27.0)
13
+ parser (3.3.8.0)
10
14
  ast (~> 2.4.1)
15
+ racc
16
+ prism (1.4.0)
17
+ racc (1.8.1)
11
18
  rainbow (3.1.1)
12
- rake (13.0.6)
13
- regexp_parser (2.2.1)
14
- rexml (3.2.5)
15
- rspec (3.11.0)
16
- rspec-core (~> 3.11.0)
17
- rspec-expectations (~> 3.11.0)
18
- rspec-mocks (~> 3.11.0)
19
- rspec-core (3.11.0)
20
- rspec-support (~> 3.11.0)
21
- rspec-expectations (3.11.0)
19
+ rake (13.2.1)
20
+ regexp_parser (2.10.0)
21
+ rexml (3.4.4)
22
+ rspec (3.13.0)
23
+ rspec-core (~> 3.13.0)
24
+ rspec-expectations (~> 3.13.0)
25
+ rspec-mocks (~> 3.13.0)
26
+ rspec-core (3.13.3)
27
+ rspec-support (~> 3.13.0)
28
+ rspec-expectations (3.13.4)
22
29
  diff-lcs (>= 1.2.0, < 2.0)
23
- rspec-support (~> 3.11.0)
24
- rspec-mocks (3.11.0)
30
+ rspec-support (~> 3.13.0)
31
+ rspec-mocks (3.13.4)
25
32
  diff-lcs (>= 1.2.0, < 2.0)
26
- rspec-support (~> 3.11.0)
27
- rspec-support (3.11.0)
28
- rubocop (1.26.1)
33
+ rspec-support (~> 3.13.0)
34
+ rspec-support (3.13.3)
35
+ rubocop (1.75.6)
36
+ json (~> 2.3)
37
+ language_server-protocol (~> 3.17.0.2)
38
+ lint_roller (~> 1.1.0)
29
39
  parallel (~> 1.10)
30
- parser (>= 3.1.0.0)
40
+ parser (>= 3.3.0.2)
31
41
  rainbow (>= 2.2.2, < 4.0)
32
- regexp_parser (>= 1.8, < 3.0)
33
- rexml
34
- rubocop-ast (>= 1.16.0, < 2.0)
42
+ regexp_parser (>= 2.9.3, < 3.0)
43
+ rubocop-ast (>= 1.44.0, < 2.0)
35
44
  ruby-progressbar (~> 1.7)
36
- unicode-display_width (>= 1.4.0, < 3.0)
37
- rubocop-ast (1.16.0)
38
- parser (>= 3.1.1.0)
39
- ruby-progressbar (1.11.0)
40
- simplecov (0.21.2)
45
+ unicode-display_width (>= 2.4.0, < 4.0)
46
+ rubocop-ast (1.44.1)
47
+ parser (>= 3.3.7.2)
48
+ prism (~> 1.4)
49
+ ruby-progressbar (1.13.0)
50
+ simplecov (0.22.0)
41
51
  docile (~> 1.1)
42
52
  simplecov-html (~> 0.11)
43
53
  simplecov_json_formatter (~> 0.1)
44
- simplecov-html (0.12.3)
54
+ simplecov-cobertura (3.1.0)
55
+ rexml
56
+ simplecov (~> 0.19)
57
+ simplecov-html (0.13.1)
45
58
  simplecov_json_formatter (0.1.4)
46
- unicode-display_width (2.1.0)
59
+ unicode-display_width (3.1.4)
60
+ unicode-emoji (~> 4.0, >= 4.0.4)
61
+ unicode-emoji (4.0.4)
47
62
 
48
63
  PLATFORMS
49
- x86_64-darwin-21
64
+ ruby
65
+ x86_64-darwin-24
50
66
 
51
67
  DEPENDENCIES
68
+ csv
52
69
  inheritance-helper
53
70
  rake
54
71
  rspec
55
72
  rubocop
56
73
  simplecov
74
+ simplecov-cobertura
57
75
 
58
76
  BUNDLED WITH
59
- 2.3.3
77
+ 2.6.2