osv 0.3.11 → 0.3.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/osv/version.rb +1 -1
- metadata +219 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3158fed2ced03b0fd6bdc792198529e675a10656ec32d5d375a0fee88481d5e4
|
4
|
+
data.tar.gz: 0e2203377d114a6ee12cc7aa4abdb80baeea5adba3bd253825a6fa975cdd51aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 82ae4170b55db12c767ccff522f0415e6ff586455a9c10e09d603323a8d177e0d532fae45d20e5eef57de59a6e75e0e711930b2a5c881af8e8192f9c804fedc1
|
7
|
+
data.tar.gz: 3e00897d9e6ae42756e22e55a0d3e2450b9375a63af71618f9d866bf256783b1315f18d27632a7f94f0d4ef7cefd20210efef85f3a5bddc47b216f77e51c69f3
|
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.11
|
4
|
+
version: 0.3.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
@@ -38,7 +38,224 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.2.0
|
41
|
-
description:
|
41
|
+
description: |
|
42
|
+
# OSV
|
43
|
+
|
44
|
+
[![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv)
|
45
|
+
|
46
|
+
OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate.
|
47
|
+
|
48
|
+
It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
|
49
|
+
|
50
|
+
The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
|
51
|
+
|
52
|
+
## Installation
|
53
|
+
|
54
|
+
Add this line to your application's Gemfile:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
gem 'osv'
|
58
|
+
```
|
59
|
+
|
60
|
+
And then execute:
|
61
|
+
|
62
|
+
```bash
|
63
|
+
bundle install
|
64
|
+
```
|
65
|
+
|
66
|
+
Or install it directly:
|
67
|
+
|
68
|
+
```bash
|
69
|
+
gem install osv
|
70
|
+
```
|
71
|
+
|
72
|
+
## Usage
|
73
|
+
|
74
|
+
### Reading CSV Files
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
require 'osv'
|
78
|
+
|
79
|
+
# Basic usage - each row as a hash
|
80
|
+
OSV.for_each("data.csv") do |row|
|
81
|
+
puts row["name"] # => "John"
|
82
|
+
puts row["age"] # => "25"
|
83
|
+
end
|
84
|
+
|
85
|
+
# Return an enumerator instead of using a block
|
86
|
+
rows = OSV.for_each("data.csv")
|
87
|
+
rows.each { |row| puts row["name"] }
|
88
|
+
|
89
|
+
# High-performance array mode
|
90
|
+
OSV.for_each("data.csv", result_type: :array) do |row|
|
91
|
+
puts row[0] # First column
|
92
|
+
puts row[1] # Second column
|
93
|
+
end
|
94
|
+
```
|
95
|
+
|
96
|
+
### Input Sources
|
97
|
+
|
98
|
+
```ruby
|
99
|
+
# From a file path
|
100
|
+
OSV.for_each("data.csv") { |row| puts row["name"] }
|
101
|
+
|
102
|
+
# From a gzipped file path
|
103
|
+
OSV.for_each("data.csv.gz") { |row| puts row["name"] }
|
104
|
+
|
105
|
+
# From an IO object
|
106
|
+
File.open("data.csv") { |file| OSV.for_each(file) { |row| puts row["name"] } }
|
107
|
+
|
108
|
+
# From a string
|
109
|
+
data = StringIO.new("name,age\nJohn,25")
|
110
|
+
OSV.for_each(data) { |row| puts row["name"] }
|
111
|
+
```
|
112
|
+
|
113
|
+
### Configuration Options
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
OSV.for_each("data.csv",
|
117
|
+
# Input formatting
|
118
|
+
has_headers: true, # First row contains headers (default: true)
|
119
|
+
col_sep: ",", # Column separator (default: ",")
|
120
|
+
quote_char: '"', # Quote character (default: '"')
|
121
|
+
|
122
|
+
# Output formatting
|
123
|
+
result_type: :hash, # :hash or :array (hash is default)
|
124
|
+
nil_string: nil, # String to interpret as nil when parsing (default: nil)
|
125
|
+
|
126
|
+
# Parsing behavior
|
127
|
+
flexible: false, # Allow varying number of fields (default: false)
|
128
|
+
flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
|
129
|
+
# Implicitly enables flexible mode if set.
|
130
|
+
trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
|
131
|
+
buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
|
132
|
+
)
|
133
|
+
```
|
134
|
+
|
135
|
+
#### Available Options
|
136
|
+
|
137
|
+
- `has_headers`: Boolean indicating if the first row contains headers (default: true)
|
138
|
+
- `col_sep`: String specifying the field separator (default: ",")
|
139
|
+
- `quote_char`: String specifying the quote character (default: "\"")
|
140
|
+
- `nil_string`: String that should be interpreted as nil
|
141
|
+
- by default, empty strings are interpreted as empty strings
|
142
|
+
- if you want to interpret empty strings as nil, set this to an empty string
|
143
|
+
- `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
|
144
|
+
- `result_type`: String or Symbol specifying the output format ("hash" or "array" or :hash or :array)
|
145
|
+
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
|
146
|
+
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
|
147
|
+
- `trim`: String or Symbol specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
|
148
|
+
|
149
|
+
When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
|
150
|
+
|
151
|
+
## Requirements
|
152
|
+
|
153
|
+
- Ruby >= 3.1.0
|
154
|
+
- Rust toolchain (for installation from source)
|
155
|
+
|
156
|
+
## Performance
|
157
|
+
|
158
|
+
This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
|
159
|
+
|
160
|
+
Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
|
161
|
+
|
162
|
+
### 100,000 lines
|
163
|
+
|
164
|
+
```
|
165
|
+
Benchmarking with 100001 lines of data
|
166
|
+
|
167
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
168
|
+
Warming up --------------------------------------
|
169
|
+
OSV - Hash output 1.000 i/100ms
|
170
|
+
CSV - Hash output 1.000 i/100ms
|
171
|
+
OSV - Array output 1.000 i/100ms
|
172
|
+
OSV - Direct Open Array output
|
173
|
+
12.719M i/100ms
|
174
|
+
CSV - Array output 1.000 i/100ms
|
175
|
+
FastCSV - Array output
|
176
|
+
1.000 i/100ms
|
177
|
+
OSV - StringIO 1.000 i/100ms
|
178
|
+
CSV - StringIO 1.000 i/100ms
|
179
|
+
FastCSV - StringIO 1.000 i/100ms
|
180
|
+
OSV - Gzipped 1.000 i/100ms
|
181
|
+
CSV - Gzipped 1.000 i/100ms
|
182
|
+
Calculating -------------------------------------
|
183
|
+
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
184
|
+
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
185
|
+
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
186
|
+
OSV - Direct Open Array output
|
187
|
+
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
188
|
+
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
189
|
+
FastCSV - Array output
|
190
|
+
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
191
|
+
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
192
|
+
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
193
|
+
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
194
|
+
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
195
|
+
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
196
|
+
|
197
|
+
Comparison:
|
198
|
+
OSV - Direct Open Array output: 213629268.6 i/s
|
199
|
+
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
200
|
+
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
201
|
+
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
202
|
+
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
203
|
+
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
204
|
+
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
205
|
+
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
206
|
+
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
207
|
+
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
208
|
+
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
209
|
+
```
|
210
|
+
|
211
|
+
### 1,000,000 lines
|
212
|
+
|
213
|
+
```
|
214
|
+
Benchmarking with 1000001 lines of data
|
215
|
+
|
216
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
217
|
+
Warming up --------------------------------------
|
218
|
+
OSV - Hash output 1.000 i/100ms
|
219
|
+
CSV - Hash output 1.000 i/100ms
|
220
|
+
OSV - Array output 1.000 i/100ms
|
221
|
+
OSV - Direct Open Array output
|
222
|
+
1.000 i/100ms
|
223
|
+
CSV - Array output 1.000 i/100ms
|
224
|
+
FastCSV - Array output
|
225
|
+
1.000 i/100ms
|
226
|
+
OSV - StringIO 1.000 i/100ms
|
227
|
+
CSV - StringIO 1.000 i/100ms
|
228
|
+
FastCSV - StringIO 1.000 i/100ms
|
229
|
+
OSV - Gzipped 1.000 i/100ms
|
230
|
+
CSV - Gzipped 1.000 i/100ms
|
231
|
+
Calculating -------------------------------------
|
232
|
+
OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
|
233
|
+
CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
|
234
|
+
OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
|
235
|
+
OSV - Direct Open Array output
|
236
|
+
1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
|
237
|
+
CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
|
238
|
+
FastCSV - Array output
|
239
|
+
0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
|
240
|
+
OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
|
241
|
+
CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
|
242
|
+
FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
|
243
|
+
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
|
244
|
+
CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
|
245
|
+
|
246
|
+
Comparison:
|
247
|
+
OSV - Direct Open Array output: 1.6 i/s
|
248
|
+
OSV - Array output: 1.5 i/s - 1.08x slower
|
249
|
+
FastCSV - StringIO: 0.9 i/s - 1.76x slower
|
250
|
+
OSV - StringIO: 0.6 i/s - 2.87x slower
|
251
|
+
OSV - Hash output: 0.5 i/s - 3.30x slower
|
252
|
+
OSV - Gzipped: 0.4 i/s - 3.72x slower
|
253
|
+
FastCSV - Array output: 0.3 i/s - 4.99x slower
|
254
|
+
CSV - Array output: 0.2 i/s - 8.88x slower
|
255
|
+
CSV - StringIO: 0.1 i/s - 11.55x slower
|
256
|
+
CSV - Hash output: 0.1 i/s - 14.24x slower
|
257
|
+
CSV - Gzipped: 0.1 i/s - 15.68x slower
|
258
|
+
```
|
42
259
|
email:
|
43
260
|
- nathan@jaremko.ca
|
44
261
|
executables: []
|