philiprehberger-csv_kit 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 63dcb3883c3732b41c45224ad90e4ae26ea4af2efced584db70089e0b5802be9
4
- data.tar.gz: 672c6414f9620772b8cbb664b6cfdbc7a37f76b38d2456cdf72e83d947bca659
3
+ metadata.gz: ea3cafa68ee9e49b8c1b305af9e7f796a9c417676a6018ee416dbb978e91eb38
4
+ data.tar.gz: e34151bbdf97d2e78fc348620fd958fa01a3e6d446744593d7a8474b82885de6
5
5
  SHA512:
6
- metadata.gz: ca664b0389948c7a12793a238f3b94189ef8bc7bbed4aeecdbb431cc986dfad0386f1e621922ba655bdfe85605ba9f2abcdd11550758604d5dad052e6b42d26e
7
- data.tar.gz: 973ccc2da16d11249c0dc4c1f1826a42815503b6800bfa9def603f8884dc041e23c37c81ddd9f1fcd719df58438c445a815add96e4de214513f4b6e01f0679c3
6
+ metadata.gz: a98d2e7baa28c03c04c44322482cc5a9ec4bef939692f5b3812679efe1d4ab1edc90748d65003ea4ddb287a73d27cb6ba22fd4e254ccf734d9c61cd3fbb28cab
7
+ data.tar.gz: 04a064667d1cbbde06ab473cad6dd18e4edc9558fa5f6efcff5ea2a0505a3c55846f016ebac118735c3b6c4b035bf48edf4c3cf03750c2ba855bf4ec4fc76f4e
data/CHANGELOG.md CHANGED
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.7.0] - 2026-04-16
11
+
12
+ ### Added
13
+ - `CsvKit.sample(path_or_io, n, dialect:)` — return n randomly sampled rows as symbolized hashes using reservoir sampling (Algorithm R); memory is O(n) in the sample size regardless of file size; returns all rows if the input has fewer than n rows
14
+
15
+ ## [0.6.0] - 2026-04-15
16
+
17
+ ### Added
18
+ - `CsvKit.find(path, dialect:, &block)` — return the first row matching a predicate (or `nil` if none matches), stopping as soon as a match is found
19
+
10
20
  ## [0.5.0] - 2026-04-09
11
21
 
12
22
  ### Added
@@ -91,6 +101,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
91
101
  - Type coercion and row validation
92
102
  - Quick load and filtering convenience methods
93
103
 
104
+ [Unreleased]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.7.0...HEAD
105
+ [0.7.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.6.0...v0.7.0
106
+ [0.6.0]: https://github.com/philiprehberger/rb-csv-kit/compare/v0.5.0...v0.6.0
94
107
  [0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
95
108
  [0.4.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.4.0
96
109
  [0.3.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.1
data/README.md CHANGED
@@ -69,6 +69,24 @@ adults = Philiprehberger::CsvKit.each_hash("data.csv")
69
69
  .first(10)
70
70
  ```
71
71
 
72
+ ### Reservoir Sampling
73
+
74
+ Return n randomly sampled rows from a file path or IO object using Knuth's Algorithm R. Memory use is O(n) in the sample size, regardless of how large the file is. If the input has fewer than n rows, all rows are returned:
75
+
76
+ ```ruby
77
+ rows = Philiprehberger::CsvKit.sample("large.csv", 100)
78
+ # => [{name: "Alice", age: "30"}, ...]
79
+ ```
80
+
81
+ ### Find First Match
82
+
83
+ Return the first row that matches a predicate, streaming and stopping on the first hit:
84
+
85
+ ```ruby
86
+ user = Philiprehberger::CsvKit.find("users.csv") { |row| row[:email] == "a@b.com" }
87
+ # => {email: "a@b.com", name: "Alice"} or nil
88
+ ```
89
+
72
90
  ### Filter Rows
73
91
 
74
92
  ```ruby
@@ -166,8 +184,10 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
166
184
  | Method / Class | Description |
167
185
  |----------------|-------------|
168
186
  | `CsvKit.to_hashes(path, dialect:)` | Load CSV into array of symbolized hashes |
187
+ | `CsvKit.sample(path_or_io, n, dialect:)` | Return n randomly sampled rows using reservoir sampling (Algorithm R) |
169
188
  | `CsvKit.pluck(path, *keys, dialect:)` | Extract specific columns |
170
189
  | `CsvKit.filter(path, dialect:, &block)` | Filter rows, return CSV string |
190
+ | `CsvKit.find(path, dialect:, &block)` | Return the first row matching the predicate, or nil |
171
191
  | `CsvKit.headers(path, dialect:)` | Return header row as array of symbols |
172
192
  | `CsvKit.count(path, dialect:)` | Count data rows without loading into memory |
173
193
  | `CsvKit.each_hash(path, dialect:, &block)` | Stream rows as symbolized hashes; returns Enumerator if no block |
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Philiprehberger
4
4
  module CsvKit
5
- VERSION = '0.5.0'
5
+ VERSION = '0.7.0'
6
6
  end
7
7
  end
@@ -100,6 +100,57 @@ module Philiprehberger
100
100
  block ? enum.each(&block) : enum
101
101
  end
102
102
 
103
+ # Return up to n randomly sampled rows using reservoir sampling (Algorithm R).
104
+ # Only the reservoir (at most n row hashes) is retained while rows are iterated, so memory is O(n) regardless of file size.
105
+ # If the input has fewer than n rows, all rows are returned in input order; n <= 0 yields an empty array.
106
+ #
107
+ # @param path_or_io [String, IO] a String is opened as a file path via CSV.foreach; anything else is handed to CSV.new as an IO
108
+ # @param n [Integer] maximum number of rows to sample
109
+ # @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options, merged into the CSV options when given
110
+ # @return [Array<Hash{Symbol => String}>] sampled rows keyed by symbolized header names
111
+ def self.sample(path_or_io, n, dialect: nil)
112
+ csv_opts = { headers: true }
113
+ csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
114
+
115
+ reservoir = []
116
+ index = 0 # number of rows processed so far
117
+
118
+ iterate = lambda do |row|
119
+ hash = row.to_h.transform_keys(&:to_sym)
120
+ if index < n # reservoir not yet full: keep every row
121
+ reservoir << hash
122
+ else
123
+ j = rand(index + 1) # random integer in 0..index from Kernel#rand (global RNG)
124
+ reservoir[j] = hash if j < n # row replaces a slot with probability n / (index + 1)
125
+ end
126
+ index += 1
127
+ end
128
+
129
+ if path_or_io.is_a?(String) # Strings are treated as file paths, not as CSV content
130
+ CSV.foreach(path_or_io, **csv_opts, &iterate)
131
+ else
132
+ CSV.new(path_or_io, **csv_opts).each(&iterate)
133
+ end
134
+
135
+ reservoir
136
+ end
137
+
138
+ # Find the first row for which the block returns a truthy value; iteration stops as soon as a match is found.
139
+ #
140
+ # @param path [String] file path, passed to CSV.foreach
141
+ # @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options, merged into the CSV options when given
142
+ # @yield [Hash{Symbol => String}] each row as a symbolized hash; the block is called for every row read, so it must be given
143
+ # @return [Hash{Symbol => String}, nil] the first matching row, or nil when no row matches
144
+ def self.find(path, dialect: nil, &block)
145
+ csv_opts = { headers: true }
146
+ csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
147
+ CSV.foreach(path, **csv_opts) do |row|
148
+ hash = row.to_h.transform_keys(&:to_sym)
149
+ return hash if block.call(hash) # returning from inside the block ends the iteration early
150
+ end
151
+ nil # no row satisfied the predicate
152
+ end
153
+
103
154
  # Filter rows and return matching rows as a CSV string.
104
155
  #
105
156
  # @param path [String] file path
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: philiprehberger-csv_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip Rehberger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-10 00:00:00.000000000 Z
11
+ date: 2026-04-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Streaming CSV processor with row-by-row transforms, validations, column
14
14
  plucking, streaming each_hash iteration, filtering, writing, error recovery, and