philiprehberger-csv_kit 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +17 -0
- data/LICENSE +21 -0
- data/README.md +105 -0
- data/lib/philiprehberger/csv_kit/detector.rb +43 -0
- data/lib/philiprehberger/csv_kit/processor.rb +107 -0
- data/lib/philiprehberger/csv_kit/row.rb +36 -0
- data/lib/philiprehberger/csv_kit/version.rb +7 -0
- data/lib/philiprehberger/csv_kit.rb +59 -0
- metadata +56 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: a3c2bc60fa0f4ed93e30547903b2ea5dac2d8fdf2f7fd8eae6519839f5d581f1
|
|
4
|
+
data.tar.gz: 105ed419b40775e09c72c00669e6c59d876684e8462566ce5f82d66cf0957792
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 89f1b6c68c8b2ef72bc2e1d7be187605fc3680682fe2e0003819f4c2b85787e581ea9c8a25bd1ed4eb7ef42422e304bd0982b9e1f1d3c1a5c4fa0519e08de6e7
|
|
7
|
+
data.tar.gz: f187ca48f5a86a3bf530dcb715e2da6474c645836a5c9a2b9f872a674a30864f150bcb38bdb98fcd0883babd2d67d25e1e31b74ef0358f85efdf714434bdcc5a
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this gem will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-03-15
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- Initial release
|
|
14
|
+
- Streaming CSV processing with constant memory
|
|
15
|
+
- Auto-detect delimiter
|
|
16
|
+
- Type coercion and row validation
|
|
17
|
+
- Quick load and filtering convenience methods
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 philiprehberger
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# philiprehberger-csv_kit
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/philiprehberger/rb-csv-kit/actions/workflows/ci.yml)
|
|
4
|
+
[Gem Version](https://rubygems.org/gems/philiprehberger-csv_kit)
|
|
5
|
+
|
|
6
|
+
Streaming CSV processor with type coercion and validation.
|
|
7
|
+
|
|
8
|
+
## Requirements
|
|
9
|
+
|
|
10
|
+
- Ruby >= 3.1
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
Add to your Gemfile:
|
|
15
|
+
|
|
16
|
+
```ruby
|
|
17
|
+
gem 'philiprehberger-csv_kit'
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Then run:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
bundle install
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Or install directly:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
gem install philiprehberger-csv_kit
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
require 'philiprehberger/csv_kit'
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Quick Load
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
rows = Philiprehberger::CsvKit.to_hashes('data.csv')
|
|
42
|
+
# => [{name: "Alice", age: "30"}, ...]
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Pluck Columns
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
names = Philiprehberger::CsvKit.pluck('data.csv', :name, :city)
|
|
49
|
+
# => [{name: "Alice", city: "Berlin"}, ...]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Filter Rows
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
csv_string = Philiprehberger::CsvKit.filter('data.csv') do |row|
|
|
56
|
+
row[:age].to_i >= 30
|
|
57
|
+
end
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Streaming Processor
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
rows = Philiprehberger::CsvKit.process('data.csv') do |p|
|
|
64
|
+
p.transform(:age) { |v| v.to_i }
|
|
65
|
+
p.validate(:age) { |v| v.to_i.positive? }
|
|
66
|
+
p.reject { |row| row[:city] == 'Unknown' }
|
|
67
|
+
p.each { |row| puts row[:name] }
|
|
68
|
+
end
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Delimiter Detection
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
delimiter = Philiprehberger::CsvKit::Detector.detect('data.tsv')
|
|
75
|
+
# => "\t"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## API
|
|
79
|
+
|
|
80
|
+
| Method / Class | Description |
|
|
81
|
+
|----------------|-------------|
|
|
82
|
+
| `CsvKit.to_hashes(path)` | Load CSV into array of symbolized hashes |
|
|
83
|
+
| `CsvKit.pluck(path, *keys)` | Extract specific columns |
|
|
84
|
+
| `CsvKit.filter(path, &block)` | Filter rows, return CSV string |
|
|
85
|
+
| `CsvKit.process(path_or_io, &block)` | Streaming DSL with transforms and validations |
|
|
86
|
+
| `Processor#headers(*names)` | Override header names |
|
|
87
|
+
| `Processor#transform(key, &block)` | Register column transform |
|
|
88
|
+
| `Processor#validate(key, &block)` | Register column validation (skip invalid) |
|
|
89
|
+
| `Processor#reject(&block)` | Reject rows matching predicate |
|
|
90
|
+
| `Processor#each(&block)` | Callback for each processed row |
|
|
91
|
+
| `Detector.detect(path_or_io)` | Auto-detect CSV delimiter |
|
|
92
|
+
| `Row#[](key)` | Access value by symbol key |
|
|
93
|
+
| `Row#to_h` | Convert row to plain hash |
|
|
94
|
+
|
|
95
|
+
## Development
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
bundle install
|
|
99
|
+
bundle exec rspec # Run tests
|
|
100
|
+
bundle exec rubocop # Check code style
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Philiprehberger
|
|
4
|
+
module CsvKit
|
|
5
|
+
# Detects the most likely delimiter for a CSV file by sampling its first lines.
#
# Every candidate in DELIMITERS is scored by how consistently it occurs across
# the sampled lines (population variance of the per-line occurrence counts).
# Candidates that never occur rank last. When several candidates are equally
# consistent, the one that occurs most often wins; remaining ties fall back to
# the order of DELIMITERS.
class Detector
  DELIMITERS = [',', "\t", ';', '|'].freeze
  SAMPLE_LINES = 5

  # Detect the delimiter used in a file or IO.
  #
  # @param path_or_io [String, IO] file path or IO object
  # @return [String] the detected delimiter; ',' when the sample is empty or
  #   when none of the candidates occurs in it
  def self.detect(path_or_io)
    lines = read_sample(path_or_io)
    return ',' if lines.empty?

    # min_by keeps the first minimum, so full ties resolve in DELIMITERS order.
    DELIMITERS.min_by { |delimiter| score(lines, delimiter) }
  end

  # Read up to SAMPLE_LINES lines from a path or an IO-like object.
  #
  # @api private
  def self.read_sample(path_or_io)
    return File.foreach(path_or_io).first(SAMPLE_LINES) if path_or_io.is_a?(String)

    rewindable = path_or_io.respond_to?(:rewind)
    path_or_io.rewind if rewindable
    sample = path_or_io.each_line.first(SAMPLE_LINES)
    # Sampling advances the stream; rewind so the caller can still parse the
    # same IO from the first line after detection.
    path_or_io.rewind if rewindable
    sample
  end

  # Sort key for a candidate: lower variance first, then more occurrences.
  #
  # Variance alone cannot separate two candidates that both occur a constant
  # number of times per line (both score 0.0), e.g. a tab-separated file whose
  # rows each contain exactly one comma.
  #
  # @api private
  def self.score(lines, delimiter)
    occurrences = lines.sum { |line| line.count(delimiter) }
    [variance(lines, delimiter), -occurrences]
  end

  # Population variance of the per-line occurrence count of +delimiter+.
  # Returns Float::INFINITY when the delimiter occurs in none of the lines.
  #
  # @api private
  def self.variance(lines, delimiter)
    counts = lines.map { |line| line.count(delimiter) }
    return Float::INFINITY if counts.all?(&:zero?)

    mean = counts.sum.to_f / counts.size
    counts.sum { |count| (count - mean)**2 } / counts.size
  end

  private_class_method :read_sample, :score, :variance
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Philiprehberger
|
|
4
|
+
module CsvKit
|
|
5
|
+
# Streaming CSV processor with a DSL for transforms, validations, and filtering.
#
# Rows are read one at a time with headers enabled. For each row the
# validations run first, then the reject predicate, and only surviving rows
# are transformed, passed to the +each+ callback and collected. Validations
# and the reject predicate therefore see the untransformed cell values.
class Processor
  # @param path_or_io [String, IO] file path or IO object to read CSV from
  def initialize(path_or_io)
    @path_or_io = path_or_io
    @transforms = {}
    @validations = {}
    @reject_block = nil
    @each_block = nil
    @header_names = nil
  end

  # Override header names used for symbolized keys.
  #
  # The first line of the input is still consumed as the header row; the given
  # names are assigned to the columns by position.
  #
  # @param names [Array<Symbol, String>] header names
  def headers(*names)
    @header_names = names.map(&:to_sym)
  end

  # Register a transform for a specific column.
  #
  # @param key [Symbol, String] column name
  # @yield [String] raw cell value
  def transform(key, &block)
    # Rows are keyed by symbols, so normalise the key the same way #headers does.
    @transforms[key.to_sym] = block
  end

  # Register a validation for a specific column. Rows failing validation are skipped.
  #
  # @param key [Symbol, String] column name
  # @yield [String] cell value
  def validate(key, &block)
    @validations[key.to_sym] = block
  end

  # Register a reject predicate. Rows matching are excluded.
  #
  # @yield [Row] the row
  def reject(&block)
    @reject_block = block
  end

  # Register a callback for each processed row.
  #
  # @yield [Row] the row
  def each(&block)
    @each_block = block
  end

  # Execute the processor, streaming row by row.
  #
  # @return [Array<Row>] collected rows
  def run
    open_csv { |csv| process_rows(csv) }
  end

  private

  def process_rows(csv)
    csv.each_with_object([]) do |csv_row, results|
      row = build_row(csv_row)
      next unless valid?(row)
      next if rejected?(row)

      apply_transforms!(row)
      @each_block&.call(row)
      results << row
    end
  end

  # Paths are opened (and closed) by CSV.open; IO objects are wrapped and left
  # open for the caller.
  def open_csv(&block)
    if @path_or_io.is_a?(String)
      CSV.open(@path_or_io, headers: true, &block)
    else
      block.call(CSV.new(@path_or_io, headers: true))
    end
  end

  def build_row(csv_row)
    if @header_names
      # Use the positional fields rather than to_h.values: to_h keeps only one
      # entry per header, so duplicated file headers would shift later values
      # under the wrong override name.
      return Row.new(@header_names.zip(csv_row.fields).to_h)
    end

    Row.new(symbolize(csv_row.to_h))
  end

  # Symbolize header keys, dropping fields that have no header. CSV assigns a
  # nil header to fields beyond the header row's length, and nil cannot be
  # converted with to_sym.
  def symbolize(data)
    data.each_with_object({}) do |(header, value), symbolized|
      symbolized[header.to_sym] = value unless header.nil?
    end
  end

  def valid?(row)
    @validations.all? { |key, blk| blk.call(row[key]) }
  end

  def rejected?(row)
    @reject_block&.call(row) || false
  end

  def apply_transforms!(row)
    @transforms.each { |key, blk| row[key] = blk.call(row[key]) }
  end
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Philiprehberger
|
|
4
|
+
module CsvKit
|
|
5
|
+
# Wraps a CSV row as a hash with symbolized keys.
class Row
  # @param data [Hash{Symbol => Object}] initial column values. The hash is
  #   copied so that later writes through #[]= never modify the caller's hash
  #   (and so a frozen hash can be used as the source).
  def initialize(data)
    @data = data.dup
  end

  # Access a value by symbolized key.
  #
  # @param key [Symbol] column name
  # @return [Object] the stored value, or nil when the column is absent
  def [](key)
    @data[key]
  end

  # Set a value by symbolized key.
  #
  # @param key [Symbol] column name
  # @param value [Object] new value
  def []=(key, value)
    @data[key] = value
  end

  # Return the row as a plain hash.
  #
  # @return [Hash{Symbol => Object}] a shallow copy; changing it does not
  #   affect the row
  def to_h
    @data.dup
  end
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'csv'
|
|
4
|
+
require_relative 'csv_kit/version'
|
|
5
|
+
require_relative 'csv_kit/detector'
|
|
6
|
+
require_relative 'csv_kit/row'
|
|
7
|
+
require_relative 'csv_kit/processor'
|
|
8
|
+
|
|
9
|
+
module Philiprehberger
|
|
10
|
+
module CsvKit
  # Base error class for this gem.
  class Error < StandardError; end

  # Streaming DSL — yields a Processor for configuration, then executes.
  #
  # @param path_or_io [String, IO] file path or IO object
  # @yield [Processor] processor to configure transforms and validations;
  #   optional — without a block the rows are processed unconfigured
  # @return [Array<Row>] collected rows
  def self.process(path_or_io, &block)
    processor = Processor.new(path_or_io)
    # Safe navigation: calling without a block must not fail on nil.
    block&.call(processor)
    processor.run
  end

  # Load an entire CSV into an array of symbolized hashes.
  #
  # Fields without a header (cells beyond the header row's length, which CSV
  # reports under a nil header) are dropped, because nil cannot be symbolized.
  #
  # @param path [String] file path
  # @return [Array<Hash{Symbol => String}>]
  def self.to_hashes(path)
    CSV.foreach(path, headers: true).map do |row|
      row.to_h.each_with_object({}) do |(header, value), symbolized|
        symbolized[header.to_sym] = value unless header.nil?
      end
    end
  end

  # Extract specific columns from a CSV.
  #
  # @param path [String] file path
  # @param keys [Array<Symbol, String>] column names to extract; strings and
  #   nested arrays are accepted and normalised to symbols
  # @return [Array<Hash{Symbol => String}>]
  def self.pluck(path, *keys)
    wanted = keys.flatten.map(&:to_sym)
    to_hashes(path).map { |hash| hash.slice(*wanted) }
  end

  # Filter rows and return matching rows as a CSV string.
  #
  # @param path [String] file path
  # @yield [Hash{Symbol => String}] each row as a symbolized hash
  # @return [String] CSV string with headers, or '' when no row matches
  # @raise [ArgumentError] when called without a block
  def self.filter(path, &)
    raise ArgumentError, 'filter requires a block' unless block_given?

    rows = to_hashes(path).select(&)
    return '' if rows.empty?

    # The header line is taken from the first matching row's keys.
    headers = rows.first.keys
    CSV.generate do |csv|
      csv << headers
      rows.each { |row| csv << row.values_at(*headers) }
    end
  end
end
|
|
59
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: philiprehberger-csv_kit
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Philip Rehberger
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-03-15 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Streaming CSV processor with row-by-row transforms, validations, column
|
|
14
|
+
plucking, filtering, and automatic delimiter detection.
|
|
15
|
+
email:
|
|
16
|
+
- me@philiprehberger.com
|
|
17
|
+
executables: []
|
|
18
|
+
extensions: []
|
|
19
|
+
extra_rdoc_files: []
|
|
20
|
+
files:
|
|
21
|
+
- CHANGELOG.md
|
|
22
|
+
- LICENSE
|
|
23
|
+
- README.md
|
|
24
|
+
- lib/philiprehberger/csv_kit.rb
|
|
25
|
+
- lib/philiprehberger/csv_kit/detector.rb
|
|
26
|
+
- lib/philiprehberger/csv_kit/processor.rb
|
|
27
|
+
- lib/philiprehberger/csv_kit/row.rb
|
|
28
|
+
- lib/philiprehberger/csv_kit/version.rb
|
|
29
|
+
homepage: https://github.com/philiprehberger/rb-csv-kit
|
|
30
|
+
licenses:
|
|
31
|
+
- MIT
|
|
32
|
+
metadata:
|
|
33
|
+
homepage_uri: https://github.com/philiprehberger/rb-csv-kit
|
|
34
|
+
source_code_uri: https://github.com/philiprehberger/rb-csv-kit
|
|
35
|
+
changelog_uri: https://github.com/philiprehberger/rb-csv-kit/blob/main/CHANGELOG.md
|
|
36
|
+
rubygems_mfa_required: 'true'
|
|
37
|
+
post_install_message:
|
|
38
|
+
rdoc_options: []
|
|
39
|
+
require_paths:
|
|
40
|
+
- lib
|
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
42
|
+
requirements:
|
|
43
|
+
- - ">="
|
|
44
|
+
- !ruby/object:Gem::Version
|
|
45
|
+
version: 3.1.0
|
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
47
|
+
requirements:
|
|
48
|
+
- - ">="
|
|
49
|
+
- !ruby/object:Gem::Version
|
|
50
|
+
version: '0'
|
|
51
|
+
requirements: []
|
|
52
|
+
rubygems_version: 3.5.22
|
|
53
|
+
signing_key:
|
|
54
|
+
specification_version: 4
|
|
55
|
+
summary: Streaming CSV processor with type coercion and validation
|
|
56
|
+
test_files: []
|