osv 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +39 -81
- data/Rakefile +6 -8
- data/ext/osv/src/csv/builder.rs +59 -175
- data/ext/osv/src/csv/mod.rs +4 -3
- data/ext/osv/src/csv/parser.rs +90 -14
- data/ext/osv/src/csv/record.rs +19 -6
- data/ext/osv/src/csv/record_reader.rs +175 -0
- data/ext/osv/src/csv/ruby_reader.rs +181 -0
- data/ext/osv/src/reader.rs +24 -19
- data/lib/osv/version.rb +1 -1
- metadata +14 -222
- data/ext/osv/src/csv/read_impl.rs +0 -75
- data/ext/osv/src/csv/reader.rs +0 -57
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.12
|
4
|
+
version: 0.3.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-01-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -38,224 +38,12 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.2.0
|
41
|
-
description: |
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
|
49
|
-
|
50
|
-
The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
|
51
|
-
|
52
|
-
## Installation
|
53
|
-
|
54
|
-
Add this line to your application's Gemfile:
|
55
|
-
|
56
|
-
```ruby
|
57
|
-
gem 'osv'
|
58
|
-
```
|
59
|
-
|
60
|
-
And then execute:
|
61
|
-
|
62
|
-
```bash
|
63
|
-
bundle install
|
64
|
-
```
|
65
|
-
|
66
|
-
Or install it directly:
|
67
|
-
|
68
|
-
```bash
|
69
|
-
gem install osv
|
70
|
-
```
|
71
|
-
|
72
|
-
## Usage
|
73
|
-
|
74
|
-
### Reading CSV Files
|
75
|
-
|
76
|
-
```ruby
|
77
|
-
require 'osv'
|
78
|
-
|
79
|
-
# Basic usage - each row as a hash
|
80
|
-
OSV.for_each("data.csv") do |row|
|
81
|
-
puts row["name"] # => "John"
|
82
|
-
puts row["age"] # => "25"
|
83
|
-
end
|
84
|
-
|
85
|
-
# Return an enumerator instead of using a block
|
86
|
-
rows = OSV.for_each("data.csv")
|
87
|
-
rows.each { |row| puts row["name"] }
|
88
|
-
|
89
|
-
# High-performance array mode
|
90
|
-
OSV.for_each("data.csv", result_type: :array) do |row|
|
91
|
-
puts row[0] # First column
|
92
|
-
puts row[1] # Second column
|
93
|
-
end
|
94
|
-
```
|
95
|
-
|
96
|
-
### Input Sources
|
97
|
-
|
98
|
-
```ruby
|
99
|
-
# From a file path
|
100
|
-
OSV.for_each("data.csv") { |row| puts row["name"] }
|
101
|
-
|
102
|
-
# From a file path
|
103
|
-
OSV.for_each("data.csv.gz") { |row| puts row["name"] }
|
104
|
-
|
105
|
-
# From an IO object
|
106
|
-
File.open("data.csv") { |file| OSV.for_each(file) { |row| puts row["name"] } }
|
107
|
-
|
108
|
-
# From a string
|
109
|
-
data = StringIO.new("name,age\nJohn,25")
|
110
|
-
OSV.for_each(data) { |row| puts row["name"] }
|
111
|
-
```
|
112
|
-
|
113
|
-
### Configuration Options
|
114
|
-
|
115
|
-
```ruby
|
116
|
-
OSV.for_each("data.csv",
|
117
|
-
# Input formatting
|
118
|
-
has_headers: true, # First row contains headers (default: true)
|
119
|
-
col_sep: ",", # Column separator (default: ",")
|
120
|
-
quote_char: '"', # Quote character (default: '"')
|
121
|
-
|
122
|
-
# Output formatting
|
123
|
-
result_type: :hash, # :hash or :array (hash is default)
|
124
|
-
nil_string: nil, # String to interpret as nil when parsing (default: nil)
|
125
|
-
|
126
|
-
# Parsing behavior
|
127
|
-
flexible: false, # Allow varying number of fields (default: false)
|
128
|
-
flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
|
129
|
-
# Implicitly enables flexible mode if set.
|
130
|
-
trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
|
131
|
-
buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
|
132
|
-
)
|
133
|
-
```
|
134
|
-
|
135
|
-
#### Available Options
|
136
|
-
|
137
|
-
- `has_headers`: Boolean indicating if the first row contains headers (default: true)
|
138
|
-
- `col_sep`: String specifying the field separator (default: ",")
|
139
|
-
- `quote_char`: String specifying the quote character (default: "\"")
|
140
|
-
- `nil_string`: String that should be interpreted as nil
|
141
|
-
- by default, empty strings are interpreted as empty strings
|
142
|
-
- if you want to interpret empty strings as nil, set this to an empty string
|
143
|
-
- `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
|
144
|
-
- `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
|
145
|
-
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
|
146
|
-
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
|
147
|
-
- `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
|
148
|
-
|
149
|
-
When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
|
150
|
-
|
151
|
-
## Requirements
|
152
|
-
|
153
|
-
- Ruby >= 3.1.0
|
154
|
-
- Rust toolchain (for installation from source)
|
155
|
-
|
156
|
-
## Performance
|
157
|
-
|
158
|
-
This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
|
159
|
-
|
160
|
-
Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
|
161
|
-
|
162
|
-
### 10,000 lines
|
163
|
-
|
164
|
-
```
|
165
|
-
Benchmarking with 100001 lines of data
|
166
|
-
|
167
|
-
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
168
|
-
Warming up --------------------------------------
|
169
|
-
OSV - Hash output 1.000 i/100ms
|
170
|
-
CSV - Hash output 1.000 i/100ms
|
171
|
-
OSV - Array output 1.000 i/100ms
|
172
|
-
OSV - Direct Open Array output
|
173
|
-
12.719M i/100ms
|
174
|
-
CSV - Array output 1.000 i/100ms
|
175
|
-
FastCSV - Array output
|
176
|
-
1.000 i/100ms
|
177
|
-
OSV - StringIO 1.000 i/100ms
|
178
|
-
CSV - StringIO 1.000 i/100ms
|
179
|
-
FastCSV - StringIO 1.000 i/100ms
|
180
|
-
OSV - Gzipped 1.000 i/100ms
|
181
|
-
CSV - Gzipped 1.000 i/100ms
|
182
|
-
Calculating -------------------------------------
|
183
|
-
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
184
|
-
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
185
|
-
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
186
|
-
OSV - Direct Open Array output
|
187
|
-
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
188
|
-
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
189
|
-
FastCSV - Array output
|
190
|
-
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
191
|
-
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
192
|
-
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
193
|
-
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
194
|
-
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
195
|
-
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
196
|
-
|
197
|
-
Comparison:
|
198
|
-
OSV - Direct Open Array output: 213629268.6 i/s
|
199
|
-
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
200
|
-
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
201
|
-
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
202
|
-
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
203
|
-
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
204
|
-
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
205
|
-
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
206
|
-
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
207
|
-
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
208
|
-
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
209
|
-
```
|
210
|
-
|
211
|
-
### 1,000,000 lines
|
212
|
-
|
213
|
-
```
|
214
|
-
Benchmarking with 1000001 lines of data
|
215
|
-
|
216
|
-
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
217
|
-
Warming up --------------------------------------
|
218
|
-
OSV - Hash output 1.000 i/100ms
|
219
|
-
CSV - Hash output 1.000 i/100ms
|
220
|
-
OSV - Array output 1.000 i/100ms
|
221
|
-
OSV - Direct Open Array output
|
222
|
-
1.000 i/100ms
|
223
|
-
CSV - Array output 1.000 i/100ms
|
224
|
-
FastCSV - Array output
|
225
|
-
1.000 i/100ms
|
226
|
-
OSV - StringIO 1.000 i/100ms
|
227
|
-
CSV - StringIO 1.000 i/100ms
|
228
|
-
FastCSV - StringIO 1.000 i/100ms
|
229
|
-
OSV - Gzipped 1.000 i/100ms
|
230
|
-
CSV - Gzipped 1.000 i/100ms
|
231
|
-
Calculating -------------------------------------
|
232
|
-
OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
|
233
|
-
CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
|
234
|
-
OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
|
235
|
-
OSV - Direct Open Array output
|
236
|
-
1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
|
237
|
-
CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
|
238
|
-
FastCSV - Array output
|
239
|
-
0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
|
240
|
-
OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
|
241
|
-
CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
|
242
|
-
FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
|
243
|
-
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
|
244
|
-
CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
|
245
|
-
|
246
|
-
Comparison:
|
247
|
-
OSV - Direct Open Array output: 1.6 i/s
|
248
|
-
OSV - Array output: 1.5 i/s - 1.08x slower
|
249
|
-
FastCSV - StringIO: 0.9 i/s - 1.76x slower
|
250
|
-
OSV - StringIO: 0.6 i/s - 2.87x slower
|
251
|
-
OSV - Hash output: 0.5 i/s - 3.30x slower
|
252
|
-
OSV - Gzipped: 0.4 i/s - 3.72x slower
|
253
|
-
FastCSV - Array output: 0.3 i/s - 4.99x slower
|
254
|
-
CSV - Array output: 0.2 i/s - 8.88x slower
|
255
|
-
CSV - StringIO: 0.1 i/s - 11.55x slower
|
256
|
-
CSV - Hash output: 0.1 i/s - 14.24x slower
|
257
|
-
CSV - Gzipped: 0.1 i/s - 15.68x slower
|
258
|
-
```
|
41
|
+
description: |2
|
42
|
+
OSV is a high-performance CSV parser for Ruby, implemented in Rust.
|
43
|
+
It wraps BurntSushi's csv-rs crate to provide fast CSV parsing with support for both hash-based and array-based row formats.
|
44
|
+
Features include: Flexible input sources (file paths, gzipped files, IO objects, strings),
|
45
|
+
configurable parsing options (headers, separators, quote chars), support for both hash and array output formats,
|
46
|
+
whitespace trimming options, strict or flexible parsing modes, and is significantly faster than Ruby's standard CSV library.
|
259
47
|
email:
|
260
48
|
- nathan@jaremko.ca
|
261
49
|
executables: []
|
@@ -275,9 +63,9 @@ files:
|
|
275
63
|
- ext/osv/src/csv/header_cache.rs
|
276
64
|
- ext/osv/src/csv/mod.rs
|
277
65
|
- ext/osv/src/csv/parser.rs
|
278
|
-
- ext/osv/src/csv/read_impl.rs
|
279
|
-
- ext/osv/src/csv/reader.rs
|
280
66
|
- ext/osv/src/csv/record.rs
|
67
|
+
- ext/osv/src/csv/record_reader.rs
|
68
|
+
- ext/osv/src/csv/ruby_reader.rs
|
281
69
|
- ext/osv/src/lib.rs
|
282
70
|
- ext/osv/src/reader.rs
|
283
71
|
- ext/osv/src/utils.rs
|
@@ -290,6 +78,10 @@ licenses:
|
|
290
78
|
metadata:
|
291
79
|
homepage_uri: https://github.com/njaremko/osv
|
292
80
|
source_code_uri: https://github.com/njaremko/osv
|
81
|
+
readme_uri: https://github.com/njaremko/osv/blob/main/README.md
|
82
|
+
changelog_uri: https://github.com/njaremko/osv/blob/main/CHANGELOG.md
|
83
|
+
documentation_uri: https://www.rubydoc.info/gems/osv
|
84
|
+
funding_uri: https://github.com/sponsors/njaremko
|
293
85
|
post_install_message:
|
294
86
|
rdoc_options: []
|
295
87
|
require_paths:
|
data/ext/osv/src/csv/read_impl.rs
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
use super::{header_cache::StringCache, parser::RecordParser};
|
2
|
-
use std::{io::Read, thread};
|
3
|
-
|
4
|
-
pub(crate) const READ_BUFFER_SIZE: usize = 8192;
|
5
|
-
|
6
|
-
pub enum ReadImpl<T: RecordParser> {
|
7
|
-
SingleThreaded {
|
8
|
-
reader: csv::Reader<Box<dyn Read>>,
|
9
|
-
headers: Vec<&'static str>,
|
10
|
-
null_string: Option<String>,
|
11
|
-
flexible_default: Option<String>,
|
12
|
-
},
|
13
|
-
MultiThreaded {
|
14
|
-
headers: Vec<&'static str>,
|
15
|
-
receiver: kanal::Receiver<T::Output>,
|
16
|
-
handle: Option<thread::JoinHandle<()>>,
|
17
|
-
},
|
18
|
-
}
|
19
|
-
|
20
|
-
impl<T: RecordParser> ReadImpl<T> {
|
21
|
-
#[inline]
|
22
|
-
pub fn next(&mut self) -> Option<T::Output> {
|
23
|
-
match self {
|
24
|
-
Self::MultiThreaded {
|
25
|
-
receiver, handle, ..
|
26
|
-
} => match receiver.recv() {
|
27
|
-
Ok(record) => Some(record),
|
28
|
-
Err(_) => {
|
29
|
-
if let Some(handle) = handle.take() {
|
30
|
-
let _ = handle.join();
|
31
|
-
}
|
32
|
-
None
|
33
|
-
}
|
34
|
-
},
|
35
|
-
Self::SingleThreaded {
|
36
|
-
reader,
|
37
|
-
headers,
|
38
|
-
null_string,
|
39
|
-
flexible_default,
|
40
|
-
} => {
|
41
|
-
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
42
|
-
match reader.read_record(&mut record) {
|
43
|
-
Ok(true) => Some(T::parse(
|
44
|
-
headers,
|
45
|
-
&record,
|
46
|
-
null_string.as_deref(),
|
47
|
-
flexible_default.as_deref(),
|
48
|
-
)),
|
49
|
-
_ => None,
|
50
|
-
}
|
51
|
-
}
|
52
|
-
}
|
53
|
-
}
|
54
|
-
|
55
|
-
#[inline]
|
56
|
-
pub fn cleanup(&mut self) {
|
57
|
-
match self {
|
58
|
-
Self::MultiThreaded {
|
59
|
-
receiver,
|
60
|
-
handle,
|
61
|
-
headers,
|
62
|
-
..
|
63
|
-
} => {
|
64
|
-
receiver.close();
|
65
|
-
if let Some(handle) = handle.take() {
|
66
|
-
let _ = handle.join();
|
67
|
-
}
|
68
|
-
let _ = StringCache::clear(headers);
|
69
|
-
}
|
70
|
-
Self::SingleThreaded { headers, .. } => {
|
71
|
-
let _ = StringCache::clear(headers);
|
72
|
-
}
|
73
|
-
}
|
74
|
-
}
|
75
|
-
}
|
data/ext/osv/src/csv/reader.rs
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
use super::{parser::RecordParser, read_impl::ReadImpl};
|
2
|
-
use magnus::{Error, Ruby};
|
3
|
-
use std::{borrow::Cow, io::Read};
|
4
|
-
|
5
|
-
pub struct RecordReader<T: RecordParser> {
|
6
|
-
pub(crate) reader: ReadImpl<T>,
|
7
|
-
}
|
8
|
-
|
9
|
-
impl<T: RecordParser> RecordReader<T> {
|
10
|
-
#[inline]
|
11
|
-
pub(crate) fn get_headers(
|
12
|
-
ruby: &Ruby,
|
13
|
-
reader: &mut csv::Reader<impl Read>,
|
14
|
-
has_headers: bool,
|
15
|
-
) -> Result<Vec<String>, Error> {
|
16
|
-
let first_row = reader.headers().map_err(|e| {
|
17
|
-
Error::new(
|
18
|
-
ruby.exception_runtime_error(),
|
19
|
-
Cow::Owned(format!("Failed to read headers: {e}")),
|
20
|
-
)
|
21
|
-
})?;
|
22
|
-
|
23
|
-
Ok(if has_headers {
|
24
|
-
// Pre-allocate the vector with exact capacity
|
25
|
-
let mut headers = Vec::with_capacity(first_row.len());
|
26
|
-
headers.extend(first_row.iter().map(String::from));
|
27
|
-
headers
|
28
|
-
} else {
|
29
|
-
// Pre-allocate the vector with exact capacity
|
30
|
-
let mut headers = Vec::with_capacity(first_row.len());
|
31
|
-
headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
|
32
|
-
headers
|
33
|
-
})
|
34
|
-
}
|
35
|
-
}
|
36
|
-
|
37
|
-
impl<T: RecordParser> Iterator for RecordReader<T> {
|
38
|
-
type Item = T::Output;
|
39
|
-
|
40
|
-
#[inline]
|
41
|
-
fn next(&mut self) -> Option<Self::Item> {
|
42
|
-
self.reader.next()
|
43
|
-
}
|
44
|
-
|
45
|
-
#[inline]
|
46
|
-
fn size_hint(&self) -> (usize, Option<usize>) {
|
47
|
-
// We can't know the exact size without reading the whole file
|
48
|
-
(0, None)
|
49
|
-
}
|
50
|
-
}
|
51
|
-
|
52
|
-
impl<T: RecordParser> Drop for RecordReader<T> {
|
53
|
-
#[inline]
|
54
|
-
fn drop(&mut self) {
|
55
|
-
self.reader.cleanup();
|
56
|
-
}
|
57
|
-
}
|