osv 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.12
4
+ version: 0.3.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-12-24 00:00:00.000000000 Z
11
+ date: 2025-01-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -38,224 +38,12 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.2.0
41
- description: |
42
- # OSV
43
-
44
- [![Gem Version](https://badge.fury.io/rb/osv.svg)](https://badge.fury.io/rb/osv)
45
-
46
- OSV is a high-performance CSV parser for Ruby, implemented in Rust. It wraps BurntSushi's excellent [csv-rs](https://github.com/BurntSushi/rust-csv) crate.
47
-
48
- It provides a simple interface for reading CSV files with support for both hash-based and array-based row formats.
49
-
50
- The array-based mode is faster than the hash-based mode, so if you don't need the hash keys, use the array-based mode.
51
-
52
- ## Installation
53
-
54
- Add this line to your application's Gemfile:
55
-
56
- ```ruby
57
- gem 'osv'
58
- ```
59
-
60
- And then execute:
61
-
62
- ```bash
63
- bundle install
64
- ```
65
-
66
- Or install it directly:
67
-
68
- ```bash
69
- gem install osv
70
- ```
71
-
72
- ## Usage
73
-
74
- ### Reading CSV Files
75
-
76
- ```ruby
77
- require 'osv'
78
-
79
- # Basic usage - each row as a hash
80
- OSV.for_each("data.csv") do |row|
81
- puts row["name"] # => "John"
82
- puts row["age"] # => "25"
83
- end
84
-
85
- # Return an enumerator instead of using a block
86
- rows = OSV.for_each("data.csv")
87
- rows.each { |row| puts row["name"] }
88
-
89
- # High-performance array mode
90
- OSV.for_each("data.csv", result_type: :array) do |row|
91
- puts row[0] # First column
92
- puts row[1] # Second column
93
- end
94
- ```
95
-
96
- ### Input Sources
97
-
98
- ```ruby
99
- # From a file path
100
- OSV.for_each("data.csv") { |row| puts row["name"] }
101
-
102
- # From a gzipped file path
103
- OSV.for_each("data.csv.gz") { |row| puts row["name"] }
104
-
105
- # From an IO object
106
- File.open("data.csv") { |file| OSV.for_each(file) { |row| puts row["name"] } }
107
-
108
- # From a string
109
- data = StringIO.new("name,age\nJohn,25")
110
- OSV.for_each(data) { |row| puts row["name"] }
111
- ```
112
-
113
- ### Configuration Options
114
-
115
- ```ruby
116
- OSV.for_each("data.csv",
117
- # Input formatting
118
- has_headers: true, # First row contains headers (default: true)
119
- col_sep: ",", # Column separator (default: ",")
120
- quote_char: '"', # Quote character (default: '"')
121
-
122
- # Output formatting
123
- result_type: :hash, # :hash or :array (hash is default)
124
- nil_string: nil, # String to interpret as nil when parsing (default: nil)
125
-
126
- # Parsing behavior
127
- flexible: false, # Allow varying number of fields (default: false)
128
- flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
129
- # Implicitly enables flexible mode if set.
130
- trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
131
- buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
132
- )
133
- ```
134
-
135
- #### Available Options
136
-
137
- - `has_headers`: Boolean indicating if the first row contains headers (default: true)
138
- - `col_sep`: String specifying the field separator (default: ",")
139
- - `quote_char`: String specifying the quote character (default: "\"")
140
- - `nil_string`: String that should be interpreted as nil
141
- - by default, empty strings are interpreted as empty strings
142
- - if you want to interpret empty strings as nil, set this to an empty string
143
- - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
144
- - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
145
- - `flexible`: Boolean specifying if the parser should be flexible (default: false)
146
- - `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
147
- - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
148
-
149
- When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
150
-
151
- ## Requirements
152
-
153
- - Ruby >= 3.1.0
154
- - Rust toolchain (for installation from source)
155
-
156
- ## Performance
157
-
158
- This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
159
-
160
- Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
161
-
162
- ### 100,000 lines
163
-
164
- ```
165
- Benchmarking with 100001 lines of data
166
-
167
- ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
168
- Warming up --------------------------------------
169
- OSV - Hash output 1.000 i/100ms
170
- CSV - Hash output 1.000 i/100ms
171
- OSV - Array output 1.000 i/100ms
172
- OSV - Direct Open Array output
173
- 12.719M i/100ms
174
- CSV - Array output 1.000 i/100ms
175
- FastCSV - Array output
176
- 1.000 i/100ms
177
- OSV - StringIO 1.000 i/100ms
178
- CSV - StringIO 1.000 i/100ms
179
- FastCSV - StringIO 1.000 i/100ms
180
- OSV - Gzipped 1.000 i/100ms
181
- CSV - Gzipped 1.000 i/100ms
182
- Calculating -------------------------------------
183
- OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
184
- CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
185
- OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
186
- OSV - Direct Open Array output
187
- 213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
188
- CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
189
- FastCSV - Array output
190
- 7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
191
- OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
192
- CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
193
- FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
194
- OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
195
- CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
196
-
197
- Comparison:
198
- OSV - Direct Open Array output: 213629268.6 i/s
199
- OSV - Array output: 17.3 i/s - 12360250.79x slower
200
- FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
201
- FastCSV - Array output: 8.0 i/s - 26727225.72x slower
202
- OSV - Hash output: 6.7 i/s - 31780615.83x slower
203
- OSV - StringIO: 6.6 i/s - 32239620.60x slower
204
- OSV - Gzipped: 5.6 i/s - 37881517.48x slower
205
- CSV - Array output: 2.2 i/s - 97400427.87x slower
206
- CSV - StringIO: 1.5 i/s - 144580048.04x slower
207
- CSV - Hash output: 1.2 i/s - 174666591.31x slower
208
- CSV - Gzipped: 1.2 i/s - 181626018.23x slower
209
- ```
210
-
211
- ### 1,000,000 lines
212
-
213
- ```
214
- Benchmarking with 1000001 lines of data
215
-
216
- ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
217
- Warming up --------------------------------------
218
- OSV - Hash output 1.000 i/100ms
219
- CSV - Hash output 1.000 i/100ms
220
- OSV - Array output 1.000 i/100ms
221
- OSV - Direct Open Array output
222
- 1.000 i/100ms
223
- CSV - Array output 1.000 i/100ms
224
- FastCSV - Array output
225
- 1.000 i/100ms
226
- OSV - StringIO 1.000 i/100ms
227
- CSV - StringIO 1.000 i/100ms
228
- FastCSV - StringIO 1.000 i/100ms
229
- OSV - Gzipped 1.000 i/100ms
230
- CSV - Gzipped 1.000 i/100ms
231
- Calculating -------------------------------------
232
- OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
233
- CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
234
- OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
235
- OSV - Direct Open Array output
236
- 1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
237
- CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
238
- FastCSV - Array output
239
- 0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
240
- OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
241
- CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
242
- FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
243
- OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
244
- CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
245
-
246
- Comparison:
247
- OSV - Direct Open Array output: 1.6 i/s
248
- OSV - Array output: 1.5 i/s - 1.08x slower
249
- FastCSV - StringIO: 0.9 i/s - 1.76x slower
250
- OSV - StringIO: 0.6 i/s - 2.87x slower
251
- OSV - Hash output: 0.5 i/s - 3.30x slower
252
- OSV - Gzipped: 0.4 i/s - 3.72x slower
253
- FastCSV - Array output: 0.3 i/s - 4.99x slower
254
- CSV - Array output: 0.2 i/s - 8.88x slower
255
- CSV - StringIO: 0.1 i/s - 11.55x slower
256
- CSV - Hash output: 0.1 i/s - 14.24x slower
257
- CSV - Gzipped: 0.1 i/s - 15.68x slower
258
- ```
41
+ description: |2
42
+ OSV is a high-performance CSV parser for Ruby, implemented in Rust.
43
+ It wraps BurntSushi's csv-rs crate to provide fast CSV parsing with support for both hash-based and array-based row formats.
44
+ Features include: Flexible input sources (file paths, gzipped files, IO objects, strings),
45
+ configurable parsing options (headers, separators, quote chars), support for both hash and array output formats,
46
+ whitespace trimming options, and strict or flexible parsing modes. It is significantly faster than Ruby's standard CSV library.
259
47
  email:
260
48
  - nathan@jaremko.ca
261
49
  executables: []
@@ -275,9 +63,9 @@ files:
275
63
  - ext/osv/src/csv/header_cache.rs
276
64
  - ext/osv/src/csv/mod.rs
277
65
  - ext/osv/src/csv/parser.rs
278
- - ext/osv/src/csv/read_impl.rs
279
- - ext/osv/src/csv/reader.rs
280
66
  - ext/osv/src/csv/record.rs
67
+ - ext/osv/src/csv/record_reader.rs
68
+ - ext/osv/src/csv/ruby_reader.rs
281
69
  - ext/osv/src/lib.rs
282
70
  - ext/osv/src/reader.rs
283
71
  - ext/osv/src/utils.rs
@@ -290,6 +78,10 @@ licenses:
290
78
  metadata:
291
79
  homepage_uri: https://github.com/njaremko/osv
292
80
  source_code_uri: https://github.com/njaremko/osv
81
+ readme_uri: https://github.com/njaremko/osv/blob/main/README.md
82
+ changelog_uri: https://github.com/njaremko/osv/blob/main/CHANGELOG.md
83
+ documentation_uri: https://www.rubydoc.info/gems/osv
84
+ funding_uri: https://github.com/sponsors/njaremko
293
85
  post_install_message:
294
86
  rdoc_options: []
295
87
  require_paths:
@@ -1,75 +0,0 @@
1
- use super::{header_cache::StringCache, parser::RecordParser};
2
- use std::{io::Read, thread};
3
-
4
- pub(crate) const READ_BUFFER_SIZE: usize = 8192;
5
-
6
- pub enum ReadImpl<T: RecordParser> {
7
- SingleThreaded {
8
- reader: csv::Reader<Box<dyn Read>>,
9
- headers: Vec<&'static str>,
10
- null_string: Option<String>,
11
- flexible_default: Option<String>,
12
- },
13
- MultiThreaded {
14
- headers: Vec<&'static str>,
15
- receiver: kanal::Receiver<T::Output>,
16
- handle: Option<thread::JoinHandle<()>>,
17
- },
18
- }
19
-
20
- impl<T: RecordParser> ReadImpl<T> {
21
- #[inline]
22
- pub fn next(&mut self) -> Option<T::Output> {
23
- match self {
24
- Self::MultiThreaded {
25
- receiver, handle, ..
26
- } => match receiver.recv() {
27
- Ok(record) => Some(record),
28
- Err(_) => {
29
- if let Some(handle) = handle.take() {
30
- let _ = handle.join();
31
- }
32
- None
33
- }
34
- },
35
- Self::SingleThreaded {
36
- reader,
37
- headers,
38
- null_string,
39
- flexible_default,
40
- } => {
41
- let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
42
- match reader.read_record(&mut record) {
43
- Ok(true) => Some(T::parse(
44
- headers,
45
- &record,
46
- null_string.as_deref(),
47
- flexible_default.as_deref(),
48
- )),
49
- _ => None,
50
- }
51
- }
52
- }
53
- }
54
-
55
- #[inline]
56
- pub fn cleanup(&mut self) {
57
- match self {
58
- Self::MultiThreaded {
59
- receiver,
60
- handle,
61
- headers,
62
- ..
63
- } => {
64
- receiver.close();
65
- if let Some(handle) = handle.take() {
66
- let _ = handle.join();
67
- }
68
- let _ = StringCache::clear(headers);
69
- }
70
- Self::SingleThreaded { headers, .. } => {
71
- let _ = StringCache::clear(headers);
72
- }
73
- }
74
- }
75
- }
@@ -1,57 +0,0 @@
1
- use super::{parser::RecordParser, read_impl::ReadImpl};
2
- use magnus::{Error, Ruby};
3
- use std::{borrow::Cow, io::Read};
4
-
5
- pub struct RecordReader<T: RecordParser> {
6
- pub(crate) reader: ReadImpl<T>,
7
- }
8
-
9
- impl<T: RecordParser> RecordReader<T> {
10
- #[inline]
11
- pub(crate) fn get_headers(
12
- ruby: &Ruby,
13
- reader: &mut csv::Reader<impl Read>,
14
- has_headers: bool,
15
- ) -> Result<Vec<String>, Error> {
16
- let first_row = reader.headers().map_err(|e| {
17
- Error::new(
18
- ruby.exception_runtime_error(),
19
- Cow::Owned(format!("Failed to read headers: {e}")),
20
- )
21
- })?;
22
-
23
- Ok(if has_headers {
24
- // Pre-allocate the vector with exact capacity
25
- let mut headers = Vec::with_capacity(first_row.len());
26
- headers.extend(first_row.iter().map(String::from));
27
- headers
28
- } else {
29
- // Pre-allocate the vector with exact capacity
30
- let mut headers = Vec::with_capacity(first_row.len());
31
- headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
32
- headers
33
- })
34
- }
35
- }
36
-
37
- impl<T: RecordParser> Iterator for RecordReader<T> {
38
- type Item = T::Output;
39
-
40
- #[inline]
41
- fn next(&mut self) -> Option<Self::Item> {
42
- self.reader.next()
43
- }
44
-
45
- #[inline]
46
- fn size_hint(&self) -> (usize, Option<usize>) {
47
- // We can't know the exact size without reading the whole file
48
- (0, None)
49
- }
50
- }
51
-
52
- impl<T: RecordParser> Drop for RecordReader<T> {
53
- #[inline]
54
- fn drop(&mut self) {
55
- self.reader.cleanup();
56
- }
57
- }