philiprehberger-csv_kit 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +20 -0
- data/lib/philiprehberger/csv_kit/version.rb +1 -1
- data/lib/philiprehberger/csv_kit.rb +51 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ea3cafa68ee9e49b8c1b305af9e7f796a9c417676a6018ee416dbb978e91eb38
|
|
4
|
+
data.tar.gz: e34151bbdf97d2e78fc348620fd958fa01a3e6d446744593d7a8474b82885de6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a98d2e7baa28c03c04c44322482cc5a9ec4bef939692f5b3812679efe1d4ab1edc90748d65003ea4ddb287a73d27cb6ba22fd4e254ccf734d9c61cd3fbb28cab
|
|
7
|
+
data.tar.gz: 04a064667d1cbbde06ab473cad6dd18e4edc9558fa5f6efcff5ea2a0505a3c55846f016ebac118735c3b6c4b035bf48edf4c3cf03750c2ba855bf4ec4fc76f4e
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.7.0] - 2026-04-16
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `CsvKit.sample(path_or_io, n, dialect:)` — return n randomly sampled rows as symbolized hashes using reservoir sampling (Algorithm R); O(n) memory regardless of file size; returns all rows if the input has fewer than n rows
|
|
14
|
+
|
|
15
|
+
## [0.6.0] - 2026-04-15
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
- `CsvKit.find(path, dialect:, &block)` — return the first row matching a predicate (or `nil` if no row matches), stopping as soon as a match is found
|
|
19
|
+
|
|
10
20
|
## [0.5.0] - 2026-04-09
|
|
11
21
|
|
|
12
22
|
### Added
|
|
@@ -91,6 +101,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
91
101
|
- Type coercion and row validation
|
|
92
102
|
- Quick load and filtering convenience methods
|
|
93
103
|
|
|
104
|
+
[Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.7.0...HEAD
|
|
105
|
+
[0.7.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.6.0...v0.7.0
|
|
106
|
+
[0.6.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.5.0...v0.6.0
|
|
94
107
|
[0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
|
|
95
108
|
[0.4.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.4.0
|
|
96
109
|
[0.3.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.1
|
data/README.md
CHANGED
|
@@ -69,6 +69,24 @@ adults = Philiprehberger::CsvKit.each_hash("data.csv")
|
|
|
69
69
|
.first(10)
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
+
### Reservoir Sampling
|
|
73
|
+
|
|
74
|
+
Return n randomly sampled rows from a file path or an IO object with O(n) memory using Knuth's Algorithm R. If the input has fewer than n rows, all rows are returned:
|
|
75
|
+
|
|
76
|
+
```ruby
|
|
77
|
+
rows = Philiprehberger::CsvKit.sample("large.csv", 100)
|
|
78
|
+
# => [{name: "Alice", age: "30"}, ...]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Find First Match
|
|
82
|
+
|
|
83
|
+
Return the first row that matches a predicate, streaming and stopping on the first hit:
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
user = Philiprehberger::CsvKit.find("users.csv") { |row| row[:email] == "a@b.com" }
|
|
87
|
+
# => {email: "a@b.com", name: "Alice"} or nil
|
|
88
|
+
```
|
|
89
|
+
|
|
72
90
|
### Filter Rows
|
|
73
91
|
|
|
74
92
|
```ruby
|
|
@@ -166,8 +184,10 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
166
184
|
| Method / Class | Description |
|
|
167
185
|
|----------------|-------------|
|
|
168
186
|
| `CsvKit.to_hashes(path, dialect:)` | Load CSV into array of symbolized hashes |
|
|
187
|
+
| `CsvKit.sample(path_or_io, n, dialect:)` | Return n randomly sampled rows using reservoir sampling (Algorithm R) |
|
|
169
188
|
| `CsvKit.pluck(path, *keys, dialect:)` | Extract specific columns |
|
|
170
189
|
| `CsvKit.filter(path, dialect:, &block)` | Filter rows, return CSV string |
|
|
190
|
+
| `CsvKit.find(path, dialect:, &block)` | Return the first row matching the predicate, or nil |
|
|
171
191
|
| `CsvKit.headers(path, dialect:)` | Return header row as array of symbols |
|
|
172
192
|
| `CsvKit.count(path, dialect:)` | Count data rows without loading into memory |
|
|
173
193
|
| `CsvKit.each_hash(path, dialect:, &block)` | Stream rows as symbolized hashes; returns Enumerator if no block |
|
data/lib/philiprehberger/csv_kit.rb
CHANGED
|
@@ -100,6 +100,57 @@ module Philiprehberger
|
|
|
100
100
|
block ? enum.each(&block) : enum
|
|
101
101
|
end
|
|
102
102
|
|
|
103
|
+
# Return n randomly sampled rows using reservoir sampling (Algorithm R).
#
# The input is read in a single pass and the reservoir never holds more than
# n rows, so memory usage is O(n) regardless of file size.
# If the input has fewer than n rows, all rows are returned in file order.
# Once replacement starts, rows sit in whichever reservoir slot they were
# assigned, so the result is not guaranteed to be in file order.
#
# @param path_or_io [String, IO] file path (when a String) or IO object
# @param n [Integer] number of rows to sample; zero or a negative value
#   yields an empty array (the input is still read to the end)
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
# @param random [#rand] random number generator used to pick replacement
#   slots; pass a seeded `Random.new(seed)` for reproducible samples
# @return [Array<Hash{Symbol => String}>]
def self.sample(path_or_io, n, dialect: nil, random: Random)
  csv_opts = { headers: true }
  csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect

  reservoir = []
  seen = 0

  keep = lambda do |row|
    hash = row.to_h.transform_keys(&:to_sym)
    if seen < n
      # Fill phase: the first n rows are always kept.
      reservoir << hash
    else
      # Replacement phase: row number `seen` (0-based) takes a slot with
      # probability n / (seen + 1), which keeps every row equally likely.
      slot = random.rand(seen + 1)
      reservoir[slot] = hash if slot < n
    end
    seen += 1
  end

  if path_or_io.is_a?(String)
    CSV.foreach(path_or_io, **csv_opts, &keep)
  else
    CSV.new(path_or_io, **csv_opts).each(&keep)
  end

  reservoir
end
|
|
137
|
+
|
|
138
|
+
# Find the first row matching a predicate, streaming the file and stopping
# as soon as a match is found.
#
# @param path [String] file path
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
# @yield [Hash{Symbol => String}] each row as a symbolized hash
# @yieldreturn [Boolean] truthy when the yielded row is the one wanted
# @return [Hash{Symbol => String}, nil] the first matching row or nil
# @raise [ArgumentError] if no block is given
def self.find(path, dialect: nil, &block)
  # Fail fast with a clear message; without this guard a missing block only
  # surfaces as a NoMethodError on nil once the first row has been read.
  raise ArgumentError, "no block given" unless block

  csv_opts = { headers: true }
  csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect

  CSV.foreach(path, **csv_opts) do |row|
    hash = row.to_h.transform_keys(&:to_sym)
    # Returning from inside the block ends the iteration immediately, so
    # rows after the first match are never yielded.
    return hash if block.call(hash)
  end

  nil
end
|
|
153
|
+
|
|
103
154
|
# Filter rows and return matching rows as a CSV string.
|
|
104
155
|
#
|
|
105
156
|
# @param path [String] file path
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-csv_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
14
|
plucking, streaming each_hash iteration, filtering, writing, error recovery, and
|