philiprehberger-csv_kit 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +48 -0
- data/lib/philiprehberger/csv_kit/processor.rb +25 -0
- data/lib/philiprehberger/csv_kit/row.rb +39 -0
- data/lib/philiprehberger/csv_kit/version.rb +1 -1
- data/lib/philiprehberger/csv_kit.rb +49 -0
- metadata +6 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 63dcb3883c3732b41c45224ad90e4ae26ea4af2efced584db70089e0b5802be9
|
|
4
|
+
data.tar.gz: 672c6414f9620772b8cbb664b6cfdbc7a37f76b38d2456cdf72e83d947bca659
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ca664b0389948c7a12793a238f3b94189ef8bc7bbed4aeecdbb431cc986dfad0386f1e621922ba655bdfe85605ba9f2abcdd11550758604d5dad052e6b42d26e
|
|
7
|
+
data.tar.gz: 973ccc2da16d11249c0dc4c1f1826a42815503b6800bfa9def603f8884dc041e23c37c81ddd9f1fcd719df58438c445a815add96e4de214513f4b6e01f0679c3
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.5.0] - 2026-04-09
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `CsvKit.each_hash(path, dialect:)` for streaming row-by-row iteration with constant memory; returns Enumerator if no block given
|
|
14
|
+
- `Row` now includes `Enumerable` with `keys`, `values`, `size`, `each`, and `merge` methods
|
|
15
|
+
|
|
16
|
+
## [0.4.0] - 2026-04-09
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
- `CsvKit.headers(path, dialect:)` to inspect header row without loading data
|
|
20
|
+
- `CsvKit.count(path, dialect:)` to count data rows without loading into memory
|
|
21
|
+
- `Processor#skip(n)` to skip the first N data rows
|
|
22
|
+
- `Processor#limit(n)` to stop after processing N rows
|
|
23
|
+
|
|
10
24
|
## [0.3.1] - 2026-03-31
|
|
11
25
|
|
|
12
26
|
### Changed
|
|
@@ -76,3 +90,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
76
90
|
- Auto-detect delimiter
|
|
77
91
|
- Type coercion and row validation
|
|
78
92
|
- Quick load and filtering convenience methods
|
|
93
|
+
|
|
94
|
+
[0.5.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.5.0
|
|
95
|
+
[0.4.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.4.0
|
|
96
|
+
[0.3.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.1
|
|
97
|
+
[0.3.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.3.0
|
|
98
|
+
[0.2.6]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.6
|
|
99
|
+
[0.2.5]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.5
|
|
100
|
+
[0.2.4]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.4
|
|
101
|
+
[0.2.3]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.3
|
|
102
|
+
[0.2.2]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.2
|
|
103
|
+
[0.2.1]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.1
|
|
104
|
+
[0.2.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.2.0
|
|
105
|
+
[0.1.2]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.1.2
|
|
106
|
+
[0.1.0]: https://github.com/philiprehberger/rb-csv-kit/releases/tag/v0.1.0
|
data/README.md
CHANGED
|
@@ -40,6 +40,35 @@ names = Philiprehberger::CsvKit.pluck("data.csv", :name, :city)
|
|
|
40
40
|
# => [{name: "Alice", city: "Berlin"}, ...]
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
+
### Inspect Headers
|
|
44
|
+
|
|
45
|
+
```ruby
|
|
46
|
+
Philiprehberger::CsvKit.headers("data.csv")
|
|
47
|
+
# => [:name, :age, :city]
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Count Rows
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
Philiprehberger::CsvKit.count("data.csv")
|
|
54
|
+
# => 1000
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Streaming Row-by-Row
|
|
58
|
+
|
|
59
|
+
Iterate rows with constant memory. Returns an `Enumerator` if no block is given:
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
Philiprehberger::CsvKit.each_hash("large.csv") do |row|
|
|
63
|
+
puts row[:name]
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Or compose with Enumerator methods:
|
|
67
|
+
adults = Philiprehberger::CsvKit.each_hash("data.csv")
|
|
68
|
+
.select { |r| r[:age].to_i >= 18 }
|
|
69
|
+
.first(10)
|
|
70
|
+
```
|
|
71
|
+
|
|
43
72
|
### Filter Rows
|
|
44
73
|
|
|
45
74
|
```ruby
|
|
@@ -108,6 +137,15 @@ rows = Philiprehberger::CsvKit.process("data.csv") do |p|
|
|
|
108
137
|
end
|
|
109
138
|
```
|
|
110
139
|
|
|
140
|
+
### Skip and Limit
|
|
141
|
+
|
|
142
|
+
```ruby
|
|
143
|
+
rows = Philiprehberger::CsvKit.process("data.csv") do |p|
|
|
144
|
+
p.skip(10) # skip first 10 rows
|
|
145
|
+
p.limit(50) # stop after 50 rows
|
|
146
|
+
end
|
|
147
|
+
```
|
|
148
|
+
|
|
111
149
|
### Column Aliasing
|
|
112
150
|
|
|
113
151
|
```ruby
|
|
@@ -130,11 +168,16 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
130
168
|
| `CsvKit.to_hashes(path, dialect:)` | Load CSV into array of symbolized hashes |
|
|
131
169
|
| `CsvKit.pluck(path, *keys, dialect:)` | Extract specific columns |
|
|
132
170
|
| `CsvKit.filter(path, dialect:, &block)` | Filter rows, return CSV string |
|
|
171
|
+
| `CsvKit.headers(path, dialect:)` | Return header row as array of symbols |
|
|
172
|
+
| `CsvKit.count(path, dialect:)` | Count data rows without loading into memory |
|
|
173
|
+
| `CsvKit.each_hash(path, dialect:, &block)` | Stream rows as symbolized hashes; returns Enumerator if no block |
|
|
133
174
|
| `CsvKit.process(path_or_io, dialect:, &block)` | Streaming DSL with transforms and validations |
|
|
134
175
|
| `Processor#headers(*names)` | Override header names |
|
|
135
176
|
| `Processor#transform(key, &block)` | Register column transform |
|
|
136
177
|
| `Processor#type(key, type, **opts)` | Register built-in type coercion (:integer, :float, :string, :date, :datetime) |
|
|
137
178
|
| `Processor#validate(key, &block)` | Register column validation (skip invalid) |
|
|
179
|
+
| `Processor#skip(n)` | Skip the first N data rows |
|
|
180
|
+
| `Processor#limit(n)` | Stop after processing N rows |
|
|
138
181
|
| `Processor#reject(&block)` | Reject rows matching predicate |
|
|
139
182
|
| `Processor#each(&block)` | Callback for each processed row |
|
|
140
183
|
| `Processor#on_error(&block)` | Per-row error handler (return `:skip` or `:abort`) |
|
|
@@ -149,6 +192,11 @@ delimiter = Philiprehberger::CsvKit::Detector.detect("data.tsv")
|
|
|
149
192
|
| `Dialect.new(name_or_hash)` | Create a dialect from preset or custom hash |
|
|
150
193
|
| `Detector.detect(path_or_io)` | Auto-detect CSV delimiter |
|
|
151
194
|
| `Row#[](key)` | Access value by symbol key |
|
|
195
|
+
| `Row#keys` | Column names as array of symbols |
|
|
196
|
+
| `Row#values` | Column values as array |
|
|
197
|
+
| `Row#size` | Number of columns |
|
|
198
|
+
| `Row#each { \|k, v\| }` | Iterate key-value pairs (Enumerable) |
|
|
199
|
+
| `Row#merge(other)` | Return new Row with merged data |
|
|
152
200
|
| `Row#to_h` | Convert row to plain hash |
|
|
153
201
|
|
|
154
202
|
## Development
|
|
data/lib/philiprehberger/csv_kit/processor.rb
CHANGED
|
@@ -35,6 +35,8 @@ module Philiprehberger
|
|
|
35
35
|
@reject_block = nil
|
|
36
36
|
@each_block = nil
|
|
37
37
|
@header_names = nil
|
|
38
|
+
@skip_count = nil
|
|
39
|
+
@limit_count = nil
|
|
38
40
|
init_error_handler
|
|
39
41
|
init_callbacks
|
|
40
42
|
end
|
|
@@ -66,6 +68,22 @@ module Philiprehberger
|
|
|
66
68
|
@validations[key] = block
|
|
67
69
|
end
|
|
68
70
|
|
|
71
|
+
# Skip the first N data rows during processing.
|
|
72
|
+
#
|
|
73
|
+
# @param n [Integer] number of rows to skip
|
|
74
|
+
# @return [void]
|
|
75
|
+
def skip(n)
|
|
76
|
+
@skip_count = n
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Stop after processing N rows.
|
|
80
|
+
#
|
|
81
|
+
# @param n [Integer] maximum rows to collect
|
|
82
|
+
# @return [void]
|
|
83
|
+
def limit(n)
|
|
84
|
+
@limit_count = n
|
|
85
|
+
end
|
|
86
|
+
|
|
69
87
|
# Register a reject predicate.
|
|
70
88
|
def reject(&block)
|
|
71
89
|
@reject_block = block
|
|
@@ -87,7 +105,14 @@ module Philiprehberger
|
|
|
87
105
|
private
|
|
88
106
|
|
|
89
107
|
def process_rows(csv)
|
|
108
|
+
skipped = 0
|
|
90
109
|
csv.each_with_object([]) do |csv_row, results|
|
|
110
|
+
if @skip_count && skipped < @skip_count
|
|
111
|
+
skipped += 1
|
|
112
|
+
next
|
|
113
|
+
end
|
|
114
|
+
break results if @limit_count && results.length >= @limit_count
|
|
115
|
+
|
|
91
116
|
process_single_row(csv_row, results)
|
|
92
117
|
end
|
|
93
118
|
end
|
|
data/lib/philiprehberger/csv_kit/row.rb
CHANGED
|
@@ -4,11 +4,50 @@ module Philiprehberger
|
|
|
4
4
|
module CsvKit
|
|
5
5
|
# Wraps a CSV row as a hash with symbolized keys.
|
|
6
6
|
class Row
|
|
7
|
+
include Enumerable
|
|
8
|
+
|
|
7
9
|
# @param data [Hash{Symbol => String}]
|
|
8
10
|
def initialize(data)
|
|
9
11
|
@data = data
|
|
10
12
|
end
|
|
11
13
|
|
|
14
|
+
# Iterate over key-value pairs.
|
|
15
|
+
#
|
|
16
|
+
# @yield [Symbol, Object] key and value
|
|
17
|
+
def each(&)
|
|
18
|
+
@data.each(&)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# Return column names.
|
|
22
|
+
#
|
|
23
|
+
# @return [Array<Symbol>]
|
|
24
|
+
def keys
|
|
25
|
+
@data.keys
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Return column values.
|
|
29
|
+
#
|
|
30
|
+
# @return [Array<Object>]
|
|
31
|
+
def values
|
|
32
|
+
@data.values
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Return the number of columns.
|
|
36
|
+
#
|
|
37
|
+
# @return [Integer]
|
|
38
|
+
def size
|
|
39
|
+
@data.size
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Merge another hash or Row into this row, returning a new Row.
|
|
43
|
+
#
|
|
44
|
+
# @param other [Hash, Row] data to merge
|
|
45
|
+
# @return [Row]
|
|
46
|
+
def merge(other)
|
|
47
|
+
other_data = other.is_a?(Row) ? other.to_h : other
|
|
48
|
+
Row.new(@data.merge(other_data))
|
|
49
|
+
end
|
|
50
|
+
|
|
12
51
|
# Access a value by symbolized key.
|
|
13
52
|
#
|
|
14
53
|
# @param key [Symbol] column name
|
|
data/lib/philiprehberger/csv_kit.rb
CHANGED
|
@@ -51,6 +51,55 @@ module Philiprehberger
|
|
|
51
51
|
to_hashes(path, dialect: dialect).map { |h| h.slice(*keys) }
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
+
# Return the header row as an array of symbols.
|
|
55
|
+
#
|
|
56
|
+
# @param path [String] file path
|
|
57
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
58
|
+
# @return [Array<Symbol>]
|
|
59
|
+
def self.headers(path, dialect: nil)
|
|
60
|
+
csv_opts = {}
|
|
61
|
+
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
62
|
+
CSV.open(path, **csv_opts) do |csv|
|
|
63
|
+
row = csv.shift
|
|
64
|
+
return [] unless row
|
|
65
|
+
|
|
66
|
+
row.map(&:to_sym)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Count data rows without loading them all into memory.
|
|
71
|
+
#
|
|
72
|
+
# @param path [String] file path
|
|
73
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
74
|
+
# @return [Integer]
|
|
75
|
+
def self.count(path, dialect: nil)
|
|
76
|
+
csv_opts = { headers: true }
|
|
77
|
+
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
78
|
+
n = 0
|
|
79
|
+
CSV.foreach(path, **csv_opts) { |_| n += 1 }
|
|
80
|
+
n
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Stream rows one at a time as symbolized hashes with constant memory.
|
|
84
|
+
# Returns an Enumerator if no block is given.
|
|
85
|
+
#
|
|
86
|
+
# @param path [String] file path
|
|
87
|
+
# @param dialect [Symbol, Hash, nil] CSV dialect preset or custom options
|
|
88
|
+
# @yield [Hash{Symbol => String}] each row
|
|
89
|
+
# @return [Enumerator, nil]
|
|
90
|
+
def self.each_hash(path, dialect: nil, &block)
|
|
91
|
+
csv_opts = { headers: true }
|
|
92
|
+
csv_opts = Dialect.new(dialect).merge_into(csv_opts) if dialect
|
|
93
|
+
|
|
94
|
+
enum = Enumerator.new do |yielder|
|
|
95
|
+
CSV.foreach(path, **csv_opts) do |row|
|
|
96
|
+
yielder.yield(row.to_h.transform_keys(&:to_sym))
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
block ? enum.each(&block) : enum
|
|
101
|
+
end
|
|
102
|
+
|
|
54
103
|
# Filter rows and return matching rows as a CSV string.
|
|
55
104
|
#
|
|
56
105
|
# @param path [String] file path
|
metadata
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: philiprehberger-csv_kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.1
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Philip Rehberger
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
|
-
plucking, filtering, writing, error recovery, and automatic delimiter detection.
|
|
14
|
+
plucking, streaming each_hash iteration, filtering, writing, error recovery, and
|
|
15
|
+
automatic delimiter detection.
|
|
15
16
|
email:
|
|
16
17
|
- me@philiprehberger.com
|
|
17
18
|
executables: []
|
|
@@ -30,11 +31,11 @@ files:
|
|
|
30
31
|
- lib/philiprehberger/csv_kit/row.rb
|
|
31
32
|
- lib/philiprehberger/csv_kit/version.rb
|
|
32
33
|
- lib/philiprehberger/csv_kit/writer.rb
|
|
33
|
-
homepage: https://
|
|
34
|
+
homepage: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-csv_kit
|
|
34
35
|
licenses:
|
|
35
36
|
- MIT
|
|
36
37
|
metadata:
|
|
37
|
-
homepage_uri: https://
|
|
38
|
+
homepage_uri: https://philiprehberger.com/open-source-packages/ruby/philiprehberger-csv_kit
|
|
38
39
|
source_code_uri: https://github.com/philiprehberger/rb-csv-kit
|
|
39
40
|
changelog_uri: https://github.com/philiprehberger/rb-csv-kit/blob/main/CHANGELOG.md
|
|
40
41
|
bug_tracker_uri: https://github.com/philiprehberger/rb-csv-kit/issues
|