philiprehberger-csv_kit 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +10 -0
- data/lib/philiprehberger/csv_kit/version.rb +1 -1
- data/lib/philiprehberger/csv_kit.rb +35 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ea3cafa68ee9e49b8c1b305af9e7f796a9c417676a6018ee416dbb978e91eb38
|
|
4
|
+
data.tar.gz: e34151bbdf97d2e78fc348620fd958fa01a3e6d446744593d7a8474b82885de6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a98d2e7baa28c03c04c44322482cc5a9ec4bef939692f5b3812679efe1d4ab1edc90748d65003ea4ddb287a73d27cb6ba22fd4e254ccf734d9c61cd3fbb28cab
|
|
7
|
+
data.tar.gz: 04a064667d1cbbde06ab473cad6dd18e4edc9558fa5f6efcff5ea2a0505a3c55846f016ebac118735c3b6c4b035bf48edf4c3cf03750c2ba855bf4ec4fc76f4e
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.7.0] - 2026-04-16
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `CsvKit.sample(path_or_io, n, dialect:)` — return n randomly sampled rows as symbolized hashes using reservoir sampling (Algorithm R); O(n) memory regardless of file size; returns all rows if the file has fewer than n rows
|
|
14
|
+
|
|
10
15
|
## [0.6.0] - 2026-04-15
|
|
11
16
|
|
|
12
17
|
### Added
|
|
@@ -96,6 +101,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
96
101
|
- Type coercion and row validation
|
|
97
102
|
- Quick load and filtering convenience methods
|
|
98
103
|
|
|
104
|
+
[Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.7.0...HEAD
|
|
105
|
+
[0.7.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.6.0...v0.7.0
|
|
106
|
+
[0.6.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.5.0...v0.6.0
|
|
99
107
|
[0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
|
|
100
108
|
[0.4.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.4.0
|
|
101
109
|
[0.3.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.1
|
data/README.md
CHANGED
|
@@ -69,6 +69,15 @@ adults = Philiprehberger::CsvKit.each_hash("data.csv")
|
|
|
69
69
|
.first(10)
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
+
### Reservoir Sampling
|
|
73
|
+
|
|
74
|
+
Return n randomly sampled rows with O(n) memory using Knuth's Algorithm R. If the file has fewer than n rows, all rows are returned:
|
|
75
|
+
|
|
76
|
+
```ruby
|
|
77
|
+
rows = Philiprehberger::CsvKit.sample("large.csv", 100)
|
|
78
|
+
# => [{name: "Alice", age: "30"}, ...]
|
|
79
|
+
```
|
|
80
|
+
|
|
72
81
|
### Find First Match
|
|
73
82
|
|
|
74
83
|
Return the first row that matches a predicate, streaming and stopping on the first hit:
|
|
@@ -175,6 +184,7 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
175
184
|
| Method / Class | Description |
|
|
176
185
|
|----------------|-------------|
|
|
177
186
|
| `CsvKit.to_hashes(path, dialect:)` | Load CSV into array of symbolized hashes |
|
|
187
|
+
| `CsvKit.sample(path_or_io, n, dialect:)` | Return n randomly sampled rows using reservoir sampling (Algorithm R) |
|
|
178
188
|
| `CsvKit.pluck(path, *keys, dialect:)` | Extract specific columns |
|
|
179
189
|
| `CsvKit.filter(path, dialect:, &block)` | Filter rows, return CSV string |
|
|
180
190
|
| `CsvKit.find(path, dialect:, &block)` | Return the first row matching the predicate, or nil |
|
|
@@ -100,6 +100,41 @@ module Philiprehberger
|
|
|
100
100
|
block ? enum.each(&block) : enum
|
|
101
101
|
end
|
|
102
102
|
|
|
103
|
+
# Return n randomly sampled rows using reservoir sampling (Algorithm R).
#
# The input is read row by row and the reservoir never holds more than n rows,
# so only the kept rows are retained while the rest of the input streams past.
# If the input has fewer than n rows, all rows are returned in file order.
# A non-positive n yields an empty array (the input is still read).
#
# @param path_or_io [String, IO] file path or IO object
# @param n [Integer] number of rows to sample
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
# @param random [#rand] generator used to pick replacement slots; must respond to
#   rand(Integer). Defaults to Random (the shared default generator); pass a seeded
#   Random instance to get a reproducible sample.
# @return [Array<Hash{Symbol => String}>]
def self.sample(path_or_io, n, dialect: nil, random: Random)
  csv_opts = { headers: true }
  csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect

  reservoir = []
  seen = 0

  visit = lambda do |row|
    # The first n rows fill the reservoir in order. After that, the row numbered
    # `seen` (0-based) draws a slot in 0..seen and is kept only if the slot falls
    # inside the reservoir.
    slot = seen < n ? seen : random.rand(seen + 1)
    # Build the symbolized hash only for rows that are actually stored, so rows
    # that are discarded cost no extra allocations.
    reservoir[slot] = row.to_h.transform_keys(&:to_sym) if slot < n
    seen += 1
  end

  if path_or_io.is_a?(String)
    CSV.foreach(path_or_io, **csv_opts, &visit)
  else
    CSV.new(path_or_io, **csv_opts).each(&visit)
  end

  reservoir
end
|
|
137
|
+
|
|
103
138
|
# Find the first row matching a predicate, streaming (stops as soon as a match is found).
|
|
104
139
|
#
|
|
105
140
|
# @param path [String] file path
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-csv_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.0
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
14
|
plucking, streaming each_hash iteration, filtering, writing, error recovery, and
|