philiprehberger-csv_kit 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +33 -7
- data/lib/philiprehberger/csv_kit/version.rb +1 -1
- data/lib/philiprehberger/csv_kit.rb +99 -34
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c48c099081a08bc9be83c75fb8879f9cb876ce2cc6e8978a44b8c18d3cd43776
|
|
4
|
+
data.tar.gz: fe72a612c3a6e5b0166c5eb24694f5593cbb0baca3eb541081dc84414939019b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bfff72e557a3deef83f104118ec4dddf00e5855b2d29ea1ac09a023f767a3b557f7415a9b8fd953ddca68a5cc08155b4b37b14a1800b3f2672db059b1a8a02da
|
|
7
|
+
data.tar.gz: a2bd950d995457677e70c2398efb4decb742597bc032e5f0f0454ac5a9025d4a41c2d05f4bcf18988f506c46ffdd0b20b354b7620748587c3de7e32da75387d7
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.8.0] - 2026-04-17
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `CsvKit.to_csv(rows, headers:, dialect:)` — serialize an array of hashes to a CSV string; inverse of `to_hashes`
|
|
14
|
+
- `to_hashes`, `pluck`, `headers`, `count`, `each_hash`, `find`, and `filter` now accept an IO object in addition to a file path
|
|
15
|
+
|
|
16
|
+
## [0.7.0] - 2026-04-16
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
- `CsvKit.sample(path, n, dialect:)` — return n randomly sampled rows as symbolized hashes using reservoir sampling (Algorithm R); O(n) memory regardless of file size; returns all rows if file has fewer than n rows
|
|
20
|
+
|
|
10
21
|
## [0.6.0] - 2026-04-15
|
|
11
22
|
|
|
12
23
|
### Added
|
|
@@ -96,6 +107,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
96
107
|
- Type coercion and row validation
|
|
97
108
|
- Quick load and filtering convenience methods
|
|
98
109
|
|
|
110
|
+
[Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.8.0...HEAD
|
|
111
|
+
[0.8.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.7.0...v0.8.0
|
|
112
|
+
[0.7.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.6.0...v0.7.0
|
|
113
|
+
[0.6.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.5.0...v0.6.0
|
|
99
114
|
[0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
|
|
100
115
|
[0.4.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.4.0
|
|
101
116
|
[0.3.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.1
|
data/README.md
CHANGED
|
@@ -69,6 +69,15 @@ adults = Philiprehberger::CsvKit.each_hash("data.csv")
|
|
|
69
69
|
.first(10)
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
+
### Reservoir Sampling
|
|
73
|
+
|
|
74
|
+
Return n randomly sampled rows with O(n) memory using Knuth's Algorithm R. If the file has fewer than n rows, all rows are returned:
|
|
75
|
+
|
|
76
|
+
```ruby
|
|
77
|
+
rows = Philiprehberger::CsvKit.sample("large.csv", 100)
|
|
78
|
+
# => [{name: "Alice", age: "30"}, ...]
|
|
79
|
+
```
|
|
80
|
+
|
|
72
81
|
### Find First Match
|
|
73
82
|
|
|
74
83
|
Return the first row that matches a predicate, streaming and stopping on the first hit:
|
|
@@ -115,6 +124,21 @@ rows = Philiprehberger::CsvKit.process("data.csv", dialect: { delimiter: ";", qu
|
|
|
115
124
|
end
|
|
116
125
|
```
|
|
117
126
|
|
|
127
|
+
### Write CSV String
|
|
128
|
+
|
|
129
|
+
Inverse of `to_hashes`. Serialize an array of hashes to a CSV string. Headers default to the keys of the first row:
|
|
130
|
+
|
|
131
|
+
```ruby
|
|
132
|
+
csv = Philiprehberger::CsvKit.to_csv([
|
|
133
|
+
{ name: "Alice", age: 30 },
|
|
134
|
+
{ name: "Bob", age: 25 }
|
|
135
|
+
])
|
|
136
|
+
# => "name,age\nAlice,30\nBob,25\n"
|
|
137
|
+
|
|
138
|
+
# Control column order / subset with explicit headers
|
|
139
|
+
Philiprehberger::CsvKit.to_csv(rows, headers: [:name])
|
|
140
|
+
```
|
|
141
|
+
|
|
118
142
|
### Writing CSV
|
|
119
143
|
|
|
120
144
|
```ruby
|
|
@@ -174,13 +198,15 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
174
198
|
|
|
175
199
|
| Method / Class | Description |
|
|
176
200
|
|----------------|-------------|
|
|
177
|
-
| `CsvKit.to_hashes(
|
|
178
|
-
| `CsvKit.
|
|
179
|
-
| `CsvKit.
|
|
180
|
-
| `CsvKit.
|
|
181
|
-
| `CsvKit.
|
|
182
|
-
| `CsvKit.
|
|
183
|
-
| `CsvKit.
|
|
201
|
+
| `CsvKit.to_hashes(path_or_io, dialect:)` | Load CSV into array of symbolized hashes |
|
|
202
|
+
| `CsvKit.to_csv(rows, headers:, dialect:)` | Serialize an array of hashes to a CSV string |
|
|
203
|
+
| `CsvKit.sample(path_or_io, n, dialect:)` | Return n randomly sampled rows using reservoir sampling (Algorithm R) |
|
|
204
|
+
| `CsvKit.pluck(path_or_io, *keys, dialect:)` | Extract specific columns |
|
|
205
|
+
| `CsvKit.filter(path_or_io, dialect:, &block)` | Filter rows, return CSV string |
|
|
206
|
+
| `CsvKit.find(path_or_io, dialect:, &block)` | Return the first row matching the predicate, or nil |
|
|
207
|
+
| `CsvKit.headers(path_or_io, dialect:)` | Return header row as array of symbols |
|
|
208
|
+
| `CsvKit.count(path_or_io, dialect:)` | Count data rows without loading into memory |
|
|
209
|
+
| `CsvKit.each_hash(path_or_io, dialect:, &block)` | Stream rows as symbolized hashes; returns Enumerator if no block |
|
|
184
210
|
| `CsvKit.process(path_or_io, dialect:, &block)` | Streaming DSL with transforms and validations |
|
|
185
211
|
| `Processor#headers(*names)` | Override header names |
|
|
186
212
|
| `Processor#transform(key, &block)` | Register column transform |
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require 'csv'
|
|
4
4
|
require 'date'
|
|
5
5
|
require 'time'
|
|
6
|
+
require 'stringio'
|
|
6
7
|
require_relative 'csv_kit/version'
|
|
7
8
|
require_relative 'csv_kit/dialect'
|
|
8
9
|
require_relative 'csv_kit/detector'
|
|
@@ -30,69 +31,85 @@ module Philiprehberger
|
|
|
30
31
|
|
|
31
32
|
# Load an entire CSV into an array of symbolized hashes.
|
|
32
33
|
#
|
|
33
|
-
# @param
|
|
34
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
34
35
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
35
36
|
# @return [Array<Hash{Symbol => String}>]
|
|
36
|
-
def self.to_hashes(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
37
|
+
def self.to_hashes(path_or_io, dialect: nil)
|
|
38
|
+
rows = []
|
|
39
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
40
|
+
rows << row.to_h.transform_keys(&:to_sym)
|
|
41
|
+
end
|
|
42
|
+
rows
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Serialize an array of hashes to a CSV string.
|
|
46
|
+
#
|
|
47
|
+
# If headers is omitted, the keys of the first hash are used. Empty input
|
|
48
|
+
# returns an empty string when no headers are given. Dialect options are passed through to the writer.
|
|
49
|
+
#
|
|
50
|
+
# @param rows [Array<Hash>] data rows
|
|
51
|
+
# @param headers [Array<Symbol, String>, nil] explicit column order (optional)
|
|
52
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
53
|
+
# @return [String] CSV string with header row ('' when rows is empty and no headers are given)
|
|
54
|
+
def self.to_csv(rows, headers: nil, dialect: nil)
|
|
55
|
+
return '' if rows.empty? && headers.nil?
|
|
56
|
+
|
|
57
|
+
resolved_headers = (headers || rows.first.keys).map(&:to_sym)
|
|
58
|
+
io = StringIO.new
|
|
59
|
+
Writer.stream(io, headers: resolved_headers, dialect: dialect) do |w|
|
|
60
|
+
rows.each { |row| w << (row.is_a?(Hash) ? row.transform_keys(&:to_sym) : row) }
|
|
41
61
|
end
|
|
62
|
+
io.string
|
|
42
63
|
end
|
|
43
64
|
|
|
44
65
|
# Extract specific columns from a CSV.
|
|
45
66
|
#
|
|
46
|
-
# @param
|
|
67
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
47
68
|
# @param keys [Array<Symbol>] column names to extract
|
|
48
69
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
49
70
|
# @return [Array<Hash{Symbol => String}>]
|
|
50
|
-
def self.pluck(
|
|
51
|
-
to_hashes(
|
|
71
|
+
def self.pluck(path_or_io, *keys, dialect: nil)
|
|
72
|
+
to_hashes(path_or_io, dialect: dialect).map { |h| h.slice(*keys) }
|
|
52
73
|
end
|
|
53
74
|
|
|
54
75
|
# Return the header row as an array of symbols.
|
|
55
76
|
#
|
|
56
|
-
# @param
|
|
77
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
57
78
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
58
79
|
# @return [Array<Symbol>]
|
|
59
|
-
def self.headers(
|
|
80
|
+
def self.headers(path_or_io, dialect: nil)
|
|
60
81
|
csv_opts = {}
|
|
61
82
|
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
62
|
-
|
|
83
|
+
row = nil
|
|
84
|
+
with_csv(path_or_io, csv_opts) do |csv|
|
|
63
85
|
row = csv.shift
|
|
64
|
-
return [] unless row
|
|
65
|
-
|
|
66
|
-
row.map(&:to_sym)
|
|
67
86
|
end
|
|
87
|
+
return [] unless row
|
|
88
|
+
|
|
89
|
+
row.map(&:to_sym)
|
|
68
90
|
end
|
|
69
91
|
|
|
70
92
|
# Count data rows without loading them all into memory.
|
|
71
93
|
#
|
|
72
|
-
# @param
|
|
94
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
73
95
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
74
96
|
# @return [Integer]
|
|
75
|
-
def self.count(
|
|
76
|
-
csv_opts = { headers: true }
|
|
77
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
97
|
+
def self.count(path_or_io, dialect: nil)
|
|
78
98
|
n = 0
|
|
79
|
-
|
|
99
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) { |_| n += 1 }
|
|
80
100
|
n
|
|
81
101
|
end
|
|
82
102
|
|
|
83
103
|
# Stream rows one at a time as symbolized hashes with constant memory.
|
|
84
104
|
# Returns an Enumerator if no block is given.
|
|
85
105
|
#
|
|
86
|
-
# @param
|
|
106
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
87
107
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
88
108
|
# @yield [Hash{Symbol => String}] each row
|
|
89
109
|
# @return [Enumerator, nil]
|
|
90
|
-
def self.each_hash(
|
|
91
|
-
csv_opts = { headers: true }
|
|
92
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
93
|
-
|
|
110
|
+
def self.each_hash(path_or_io, dialect: nil, &block)
|
|
94
111
|
enum = Enumerator.new do |yielder|
|
|
95
|
-
|
|
112
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
96
113
|
yielder.yield(row.to_h.transform_keys(&:to_sym))
|
|
97
114
|
end
|
|
98
115
|
end
|
|
@@ -100,16 +117,40 @@ module Philiprehberger
|
|
|
100
117
|
block ? enum.each(&block) : enum
|
|
101
118
|
end
|
|
102
119
|
|
|
120
|
+
# Return n randomly sampled rows using reservoir sampling (Algorithm R).
|
|
121
|
+
# Memory usage is O(n) regardless of file size.
|
|
122
|
+
# If the file has fewer than n rows, all rows are returned.
|
|
123
|
+
#
|
|
124
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
125
|
+
# @param n [Integer] number of rows to sample
|
|
126
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
127
|
+
# @return [Array<Hash{Symbol => String}>]
|
|
128
|
+
def self.sample(path_or_io, n, dialect: nil)
|
|
129
|
+
reservoir = []
|
|
130
|
+
index = 0
|
|
131
|
+
|
|
132
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
133
|
+
hash = row.to_h.transform_keys(&:to_sym)
|
|
134
|
+
if index < n
|
|
135
|
+
reservoir << hash
|
|
136
|
+
else
|
|
137
|
+
j = rand(index + 1)
|
|
138
|
+
reservoir[j] = hash if j < n
|
|
139
|
+
end
|
|
140
|
+
index += 1
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
reservoir
|
|
144
|
+
end
|
|
145
|
+
|
|
103
146
|
# Find the first row matching a predicate, streaming (stops as soon as a match is found).
|
|
104
147
|
#
|
|
105
|
-
# @param
|
|
148
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
106
149
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
107
150
|
# @yield [Hash{Symbol => String}] each row as a symbolized hash
|
|
108
151
|
# @return [Hash{Symbol => String}, nil] the first matching row or nil
|
|
109
|
-
def self.find(
|
|
110
|
-
|
|
111
|
-
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
112
|
-
CSV.foreach(path, **csv_opts) do |row|
|
|
152
|
+
def self.find(path_or_io, dialect: nil, &block)
|
|
153
|
+
foreach_row(path_or_io, headers: true, dialect: dialect) do |row|
|
|
113
154
|
hash = row.to_h.transform_keys(&:to_sym)
|
|
114
155
|
return hash if block.call(hash)
|
|
115
156
|
end
|
|
@@ -118,12 +159,12 @@ module Philiprehberger
|
|
|
118
159
|
|
|
119
160
|
# Filter rows and return matching rows as a CSV string.
|
|
120
161
|
#
|
|
121
|
-
# @param
|
|
162
|
+
# @param path_or_io [String, IO] file path or IO object
|
|
122
163
|
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
123
164
|
# @yield [Hash{Symbol => String}] each row as a symbolized hash
|
|
124
165
|
# @return [String] CSV string with headers
|
|
125
|
-
def self.filter(
|
|
126
|
-
rows = to_hashes(
|
|
166
|
+
def self.filter(path_or_io, dialect: nil, &)
|
|
167
|
+
rows = to_hashes(path_or_io, dialect: dialect).select(&)
|
|
127
168
|
return '' if rows.empty?
|
|
128
169
|
|
|
129
170
|
headers = rows.first.keys
|
|
@@ -132,5 +173,29 @@ module Philiprehberger
|
|
|
132
173
|
rows.each { |row| csv << headers.map { |k| row[k] } }
|
|
133
174
|
end
|
|
134
175
|
end
|
|
176
|
+
|
|
177
|
+
# @api private
|
|
178
|
+
# Iterate CSV rows from either a file path or an IO object.
|
|
179
|
+
def self.foreach_row(path_or_io, headers: false, dialect: nil, &block)
|
|
180
|
+
csv_opts = headers ? { headers: true } : {}
|
|
181
|
+
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
182
|
+
if path_or_io.is_a?(String)
|
|
183
|
+
CSV.foreach(path_or_io, **csv_opts, &block)
|
|
184
|
+
else
|
|
185
|
+
CSV.new(path_or_io, **csv_opts).each(&block)
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# @api private
|
|
190
|
+
# Open a CSV reader over either a file path or an IO object.
|
|
191
|
+
def self.with_csv(path_or_io, csv_opts, &block)
|
|
192
|
+
if path_or_io.is_a?(String)
|
|
193
|
+
CSV.open(path_or_io, **csv_opts, &block)
|
|
194
|
+
else
|
|
195
|
+
block.call(CSV.new(path_or_io, **csv_opts))
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
private_class_method :foreach_row, :with_csv
|
|
135
200
|
end
|
|
136
201
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-csv_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.8.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
14
|
plucking, streaming each_hash iteration, filtering, writing, error recovery, and
|