philiprehberger-csv_kit 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/README.md +35 -7
- data/lib/philiprehberger/csv_kit/processor.rb +25 -0
- data/lib/philiprehberger/csv_kit/version.rb +1 -1
- data/lib/philiprehberger/csv_kit.rb +74 -44
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 85d915adb35a580821d1055d966fe76b802598d6422571ad1e727694dd604a4f
|
|
4
|
+
data.tar.gz: 66cc520a86bae535668b77aa56ae7d154ddc1f5df22cca7b4578d4b12a7958e8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0b719464ccea551cb56975fe78985ffa2f49480f2439e9a528f091afa53b5b24dd55219dbab3e65a8f30ec1f326959749ad4162e52d36471b1ee3bfd1272afcd
|
|
7
|
+
data.tar.gz: 7f98968c2c5063109053f2a4a8f4508f6544e5238f783440969f237f593b66f4ac2a2ace4e6b495862621944fe891c44107a247366d3c1cedd59be6208a88df9
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.9.0] - 2026-04-19
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `Processor#default(key, value)` — fill nil or empty cells at `key` with a default value during transform; chains naturally with `type` coercion
|
|
14
|
+
|
|
15
|
+
## [0.8.0] - 2026-04-17
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
- `CsvKit.to_csv(rows, headers:, dialect:)` — serialize an array of hashes to a CSV string; inverse of `to_hashes`
|
|
19
|
+
- `to_hashes`, `pluck`, `headers`, `count`, `each_hash`, `find`, and `filter` now accept an IO object in addition to a file path
|
|
20
|
+
|
|
10
21
|
## [0.7.0] - 2026-04-16
|
|
11
22
|
|
|
12
23
|
### Added
|
|
@@ -101,7 +112,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
101
112
|
- Type coercion and row validation
|
|
102
113
|
- Quick load and filtering convenience methods
|
|
103
114
|
|
|
104
|
-
[Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.
|
|
115
|
+
[Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.9.0...HEAD
|
|
116
|
+
[0.9.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.8.0...v0.9.0
|
|
117
|
+
[0.8.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.7.0...v0.8.0
|
|
105
118
|
[0.7.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.6.0...v0.7.0
|
|
106
119
|
[0.6.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.5.0...v0.6.0
|
|
107
120
|
[0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
|
data/README.md
CHANGED
|
@@ -106,6 +106,17 @@ rows = Philiprehberger::CsvKit.process("data.csv") do |p|
|
|
|
106
106
|
end
|
|
107
107
|
```
|
|
108
108
|
|
|
109
|
+
### Default Values for Missing Cells
|
|
110
|
+
|
|
111
|
+
Fill nil or empty-string cells with a default value before any `type` coercion runs:
|
|
112
|
+
|
|
113
|
+
```ruby
|
|
114
|
+
Philiprehberger::CsvKit.process("users.csv") do |p|
|
|
115
|
+
p.default(:country, "US")
|
|
116
|
+
p.type(:age, :integer)
|
|
117
|
+
end
|
|
118
|
+
```
|
|
119
|
+
|
|
109
120
|
### Date/Time Type Coercions
|
|
110
121
|
|
|
111
122
|
```ruby
|
|
@@ -124,6 +135,21 @@ rows = Philiprehberger::CsvKit.process("data.csv", dialect: { delimiter: ";", qu
|
|
|
124
135
|
end
|
|
125
136
|
```
|
|
126
137
|
|
|
138
|
+
### Write CSV String
|
|
139
|
+
|
|
140
|
+
Inverse of `to_hashes`. Serialize an array of hashes to a CSV string. Headers default to the keys of the first row:
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
csv = Philiprehberger::CsvKit.to_csv([
|
|
144
|
+
{ name: "Alice", age: 30 },
|
|
145
|
+
{ name: "Bob", age: 25 }
|
|
146
|
+
])
|
|
147
|
+
# => "name,age\nAlice,30\nBob,25\n"
|
|
148
|
+
|
|
149
|
+
# Control column order / subset with explicit headers
|
|
150
|
+
Philiprehberger::CsvKit.to_csv(rows, headers: [:name])
|
|
151
|
+
```
|
|
152
|
+
|
|
127
153
|
### Writing CSV
|
|
128
154
|
|
|
129
155
|
```ruby
|
|
@@ -183,18 +209,20 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
183
209
|
|
|
184
210
|
| Method / Class | Description |
|
|
185
211
|
|----------------|-------------|
|
|
186
|
-
| `CsvKit.to_hashes(
|
|
212
|
+
| `CsvKit.to_hashes(path_or_io, dialect:)` | Load CSV into array of symbolized hashes |
|
|
213
|
+
| `CsvKit.to_csv(rows, headers:, dialect:)` | Serialize an array of hashes to a CSV string |
|
|
187
214
|
| `CsvKit.sample(path_or_io, n, dialect:)` | Return n randomly sampled rows using reservoir sampling (Algorithm R) |
|
|
188
|
-
| `CsvKit.pluck(
|
|
189
|
-
| `CsvKit.filter(
|
|
190
|
-
| `CsvKit.find(
|
|
191
|
-
| `CsvKit.headers(
|
|
192
|
-
| `CsvKit.count(
|
|
193
|
-
| `CsvKit.each_hash(
|
|
215
|
+
| `CsvKit.pluck(path_or_io, *keys, dialect:)` | Extract specific columns |
|
|
216
|
+
| `CsvKit.filter(path_or_io, dialect:, &block)` | Filter rows, return CSV string |
|
|
217
|
+
| `CsvKit.find(path_or_io, dialect:, &block)` | Return the first row matching the predicate, or nil |
|
|
218
|
+
| `CsvKit.headers(path_or_io, dialect:)` | Return header row as array of symbols |
|
|
219
|
+
| `CsvKit.count(path_or_io, dialect:)` | Count data rows without loading into memory |
|
|
220
|
+
| `CsvKit.each_hash(path_or_io, dialect:, &block)` | Stream rows as symbolized hashes; returns Enumerator if no block |
|
|
194
221
|
| `CsvKit.process(path_or_io, dialect:, &block)` | Streaming DSL with transforms and validations |
|
|
195
222
|
| `Processor#headers(*names)` | Override header names |
|
|
196
223
|
| `Processor#transform(key, &block)` | Register column transform |
|
|
197
224
|
| `Processor#type(key, type, **opts)` | Register built-in type coercion (:integer, :float, :string, :date, :datetime) |
|
|
225
|
+
| `Processor#default(key, value)` | Fill nil or empty cells at `key` with `value` (runs before `type` coercion) |
|
|
198
226
|
| `Processor#validate(key, &block)` | Register column validation (skip invalid) |
|
|
199
227
|
| `Processor#skip(n)` | Skip the first N data rows |
|
|
200
228
|
| `Processor#limit(n)` | Stop after processing N rows |
|
|
@@ -31,6 +31,7 @@ module Philiprehberger
|
|
|
31
31
|
@path_or_io = path_or_io
|
|
32
32
|
@dialect = dialect ? Dialect.new(dialect) : nil
|
|
33
33
|
@transforms = {}
|
|
34
|
+
@defaults = {}
|
|
34
35
|
@validations = {}
|
|
35
36
|
@reject_block = nil
|
|
36
37
|
@each_block = nil
|
|
@@ -63,6 +64,22 @@ module Philiprehberger
|
|
|
63
64
|
@transforms[key] = ->(v) { coercion.call(v, opts) }
|
|
64
65
|
end
|
|
65
66
|
|
|
67
|
+
# Register a default value for a column.
|
|
68
|
+
#
|
|
69
|
+
# Cells where the value is `nil` or an empty string are replaced with
|
|
70
|
+
# the provided default during transform. Defaults run BEFORE `type`
|
|
71
|
+
# coercions and `transform` blocks, so callers can default a missing
|
|
72
|
+
# cell to a string and then coerce it (e.g. default to "0" then cast
|
|
73
|
+
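# to :integer). Defaults are applied only after the row has passed its validation and rejection checks, so those checks see the raw (possibly empty) cell.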
|
|
74
|
+
#
|
|
75
|
+
# @param key [Symbol] column name
|
|
76
|
+
# @param value [Object] value to use when the cell is nil or empty
|
|
77
|
+
# @return [self]
|
|
78
|
+
def default(key, value)
|
|
79
|
+
@defaults[key] = value
|
|
80
|
+
self
|
|
81
|
+
end
|
|
82
|
+
|
|
66
83
|
# Register a validation for a specific column.
|
|
67
84
|
def validate(key, &block)
|
|
68
85
|
@validations[key] = block
|
|
@@ -122,6 +139,7 @@ module Philiprehberger
|
|
|
122
139
|
return unless valid?(row)
|
|
123
140
|
return if rejected?(row)
|
|
124
141
|
|
|
142
|
+
apply_defaults!(row)
|
|
125
143
|
apply_transforms!(row)
|
|
126
144
|
apply_renames!(row)
|
|
127
145
|
@each_block&.call(row)
|
|
@@ -165,6 +183,13 @@ module Philiprehberger
|
|
|
165
183
|
@reject_block&.call(row) || false
|
|
166
184
|
end
|
|
167
185
|
|
|
186
|
+
def apply_defaults!(row)
|
|
187
|
+
@defaults.each do |key, value|
|
|
188
|
+
current = row[key]
|
|
189
|
+
row[key] = value if current.nil? || current.to_s.empty?
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
168
193
|
def apply_transforms!(row)
|
|
169
194
|
@transforms.each { |key, blk| row[key] = blk.call(row[key]) }
|
|
170
195
|
end
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require 'csv'
|
|
4
4
|
require 'date'
|
|
5
5
|
require 'time'
|
|
6
|
+
require 'stringio'
|
|
6
7
|
require_relative 'csv_kit/version'
|
|
7
8
|
require_relative 'csv_kit/dialect'
|
|
8
9
|
require_relative 'csv_kit/detector'
|
|
@@ -30,69 +31,85 @@ module Philiprehberger
|
|
|
30
31
|
|
|
31
32
|
# Load an entire CSV into an array of symbolized hashes.
|
|
32
33
|
#
|
|
33
|
-
# @param
|
|
34
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
34
35
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
35
36
|
# @return [Array<Hash{Symbol => String}>]
|
|
36
|
-
def self.to_hashes(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
37
|
+
def self.to_hashes(path_or_io, dialect: nil)
|
|
38
|
+
rows = []
|
|
39
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
40
|
+
rows << row.to_h.transform_keys(&:to_sym)
|
|
41
|
+
end
|
|
42
|
+
rows
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Serialize an array of hashes to a CSV string.
|
|
46
|
+
#
|
|
47
|
+
# If headers is omitted, the keys of the first hash are used. Empty input
|
|
48
|
+
# returns an empty string when headers is also omitted. Dialect options are passed through to the writer.
|
|
49
|
+
#
|
|
50
|
+
# @param rows [Array<Hash>] data rows
|
|
51
|
+
# @param headers [Array<Symbol, String>, nil] explicit column order (optional)
|
|
52
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
53
|
+
# @return [String] CSV string with header row
|
|
54
|
+
def self.to_csv(rows, headers: nil, dialect: nil)
|
|
55
|
+
return '' if rows.empty? && headers.nil?
|
|
56
|
+
|
|
57
|
+
resolved_headers = (headers || rows.first.keys).map(&:to_sym)
|
|
58
|
+
io = StringIO.new
|
|
59
|
+
Writer.stream(io, headers: resolved_headers, dialect: dialect) do |w|
|
|
60
|
+
rows.each { |row| w << (row.is_a?(Hash) ? row.transform_keys(&:to_sym) : row) }
|
|
41
61
|
end
|
|
62
|
+
io.string
|
|
42
63
|
end
|
|
43
64
|
|
|
44
65
|
# Extract specific columns from a CSV.
|
|
45
66
|
#
|
|
46
|
-
# @param
|
|
67
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
47
68
|
# @param keys [Array<Symbol>] column names to extract
|
|
48
69
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
49
70
|
# @return [Array<Hash{Symbol => String}>]
|
|
50
|
-
def self.pluck(
|
|
51
|
-
to_hashes(
|
|
71
|
+
def self.pluck(path_or_io, *keys, dialect: nil)
|
|
72
|
+
to_hashes(path_or_io, dialect: dialect).map { |h| h.slice(*keys) }
|
|
52
73
|
end
|
|
53
74
|
|
|
54
75
|
# Return the header row as an array of symbols.
|
|
55
76
|
#
|
|
56
|
-
# @param
|
|
77
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
57
78
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
58
79
|
# @return [Array<Symbol>]
|
|
59
|
-
def self.headers(
|
|
80
|
+
def self.headers(path_or_io, dialect: nil)
|
|
60
81
|
csv_opts = {}
|
|
61
82
|
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
62
|
-
|
|
83
|
+
row = nil
|
|
84
|
+
with_csv(path_or_io, csv_opts) do |csv|
|
|
63
85
|
row = csv.shift
|
|
64
|
-
return [] unless row
|
|
65
|
-
|
|
66
|
-
row.map(&:to_sym)
|
|
67
86
|
end
|
|
87
|
+
return [] unless row
|
|
88
|
+
|
|
89
|
+
row.map(&:to_sym)
|
|
68
90
|
end
|
|
69
91
|
|
|
70
92
|
# Count data rows without loading them all into memory.
|
|
71
93
|
#
|
|
72
|
-
# @param
|
|
94
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
73
95
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
74
96
|
# @return [Integer]
|
|
75
|
-
def self.count(
|
|
76
|
-
csv_opts = { headers: true }
|
|
77
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
97
|
+
def self.count(path_or_io, dialect: nil)
|
|
78
98
|
n = 0
|
|
79
|
-
|
|
99
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) { |_| n += 1 }
|
|
80
100
|
n
|
|
81
101
|
end
|
|
82
102
|
|
|
83
103
|
# Stream rows one at a time as symbolized hashes with constant memory.
|
|
84
104
|
# Returns an Enumerator if no block is given.
|
|
85
105
|
#
|
|
86
|
-
# @param
|
|
106
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
87
107
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
88
108
|
# @yield [Hash{Symbol => String}] each row
|
|
89
109
|
# @return [Enumerator, nil]
|
|
90
|
-
def self.each_hash(
|
|
91
|
-
csv_opts = { headers: true }
|
|
92
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
93
|
-
|
|
110
|
+
def self.each_hash(path_or_io, dialect: nil, &block)
|
|
94
111
|
enum = Enumerator.new do |yielder|
|
|
95
|
-
|
|
112
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
96
113
|
yielder.yield(row.to_h.transform_keys(&:to_sym))
|
|
97
114
|
end
|
|
98
115
|
end
|
|
@@ -109,13 +126,10 @@ module Philiprehberger
|
|
|
109
126
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
110
127
|
# @return [Array<Hash{Symbol => String}>]
|
|
111
128
|
def self.sample(path_or_io, n, dialect: nil)
|
|
112
|
-
csv_opts = { headers: true }
|
|
113
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
114
|
-
|
|
115
129
|
reservoir = []
|
|
116
130
|
index = 0
|
|
117
131
|
|
|
118
|
-
|
|
132
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
119
133
|
hash = row.to_h.transform_keys(&:to_sym)
|
|
120
134
|
if index < n
|
|
121
135
|
reservoir << hash
|
|
@@ -126,25 +140,17 @@ module Philiprehberger
|
|
|
126
140
|
index += 1
|
|
127
141
|
end
|
|
128
142
|
|
|
129
|
-
if path_or_io.is_a?(String)
|
|
130
|
-
CSV.foreach(path_or_io, **csv_opts, &iterate)
|
|
131
|
-
else
|
|
132
|
-
CSV.new(path_or_io, **csv_opts).each(&iterate)
|
|
133
|
-
end
|
|
134
|
-
|
|
135
143
|
reservoir
|
|
136
144
|
end
|
|
137
145
|
|
|
138
146
|
# Find the first row matching a predicate, streaming (stops as soon as a match is found).
|
|
139
147
|
#
|
|
140
|
-
# @param
|
|
148
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
141
149
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
142
150
|
# @yield [Hash{Symbol => String}] each row as a symbolized hash
|
|
143
151
|
# @return [Hash{Symbol => String}, nil] the first matching row or nil
|
|
144
|
-
def self.find(
|
|
145
|
-
|
|
146
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
147
|
-
CSV.foreach(path, **csv_opts) do |row|
|
|
152
|
+
def self.find(path_or_io, dialect: nil, &block)
|
|
153
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
148
154
|
hash = row.to_h.transform_keys(&:to_sym)
|
|
149
155
|
return hash if block.call(hash)
|
|
150
156
|
end
|
|
@@ -153,12 +159,12 @@ module Philiprehberger
|
|
|
153
159
|
|
|
154
160
|
# Filter rows and return matching rows as a CSV string.
|
|
155
161
|
#
|
|
156
|
-
# @param
|
|
162
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
157
163
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
158
164
|
# @yield [Hash{Symbol => String}] each row as a symbolized hash
|
|
159
165
|
# @return [String] CSV string with headers
|
|
160
|
-
def self.filter(
|
|
161
|
-
rows = to_hashes(
|
|
166
|
+
def self.filter(path_or_io, dialect: nil, &)
|
|
167
|
+
rows = to_hashes(path_or_io, dialect: dialect).select(&)
|
|
162
168
|
return '' if rows.empty?
|
|
163
169
|
|
|
164
170
|
headers = rows.first.keys
|
|
@@ -167,5 +173,29 @@ module Philiprehberger
|
|
|
167
173
|
rows.each { |row| csv << headers.map { |k| row[k] } }
|
|
168
174
|
end
|
|
169
175
|
end
|
|
176
|
+
|
|
177
|
+
# @api private
|
|
178
|
+
# Iterate CSV rows from either a file path or an IO object.
|
|
179
|
+
def self.foreach_row(path_or_io, headers: false, dialect: nil, &block)
|
|
180
|
+
csv_opts = headers ? { headers: true } : {}
|
|
181
|
+
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
182
|
+
if path_or_io.is_a?(String)
|
|
183
|
+
CSV.foreach(path_or_io, **csv_opts, &block)
|
|
184
|
+
else
|
|
185
|
+
CSV.new(path_or_io, **csv_opts).each(&block)
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# @api private
|
|
190
|
+
# Open a CSV reader over either a file path or an IO object.
|
|
191
|
+
def self.with_csv(path_or_io, csv_opts, &block)
|
|
192
|
+
if path_or_io.is_a?(String)
|
|
193
|
+
CSV.open(path_or_io, **csv_opts, &block)
|
|
194
|
+
else
|
|
195
|
+
block.call(CSV.new(path_or_io, **csv_opts))
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
private_class_method :foreach_row, :with_csv
|
|
170
200
|
end
|
|
171
201
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-csv_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
14
|
plucking, streaming each_hash iteration, filtering, writing, error recovery, and
|