osv 0.3.8 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/README.md +60 -50
- data/Rakefile +1 -1
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +53 -19
- data/ext/osv/src/csv/mod.rs +2 -0
- data/ext/osv/src/csv/parser.rs +16 -15
- data/ext/osv/src/csv/read_impl.rs +3 -1
- data/ext/osv/src/csv/record.rs +4 -4
- data/ext/osv/src/reader.rs +16 -13
- data/ext/osv/src/utils.rs +3 -1
- data/lib/osv/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 935cf4c277ef52eb1b1c4a4c27d2fe54461489b38b4203a1e574ac1a9e3298df
|
4
|
+
data.tar.gz: aa13944483b0f9fa8963a830d5f2a2932110e86bb32c722565c2b4d5f9dbef3c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ea38b823a0423ef8883c04f6b677b16ba5d55f9a23faf434fc758788d238972b17b6e2435f0ec92661a32ee31fafb39170c81933d67a201af8cd528678e6261a
|
7
|
+
data.tar.gz: 77a62c7d62f36bd4143166d65c64eecccece56f1ebb969295cf7af6ff8639af9df71b3923d3ba13590824db68604e90cdc61e22490e81a3d0e24bf1af5f0be47
|
data/Cargo.lock
CHANGED
@@ -274,6 +274,7 @@ dependencies = [
|
|
274
274
|
"serde",
|
275
275
|
"serde_magnus",
|
276
276
|
"thiserror",
|
277
|
+
"xxhash-rust",
|
277
278
|
]
|
278
279
|
|
279
280
|
[[package]]
|
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
|
|
526
527
|
version = "0.52.6"
|
527
528
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
528
529
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
530
|
+
|
531
|
+
[[package]]
|
532
|
+
name = "xxhash-rust"
|
533
|
+
version = "0.8.14"
|
534
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
535
|
+
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
data/README.md
CHANGED
@@ -112,45 +112,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
|
|
112
112
|
### 10,000 lines
|
113
113
|
|
114
114
|
```
|
115
|
-
Benchmarking with
|
115
|
+
Benchmarking with 100001 lines of data
|
116
116
|
|
117
|
-
ruby 3.3.
|
117
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
118
118
|
Warming up --------------------------------------
|
119
|
-
OSV - Hash output
|
119
|
+
OSV - Hash output 1.000 i/100ms
|
120
120
|
CSV - Hash output 1.000 i/100ms
|
121
|
-
OSV - Array output
|
122
|
-
|
121
|
+
OSV - Array output 1.000 i/100ms
|
122
|
+
OSV - Direct Open Array output
|
123
|
+
12.719M i/100ms
|
124
|
+
CSV - Array output 1.000 i/100ms
|
123
125
|
FastCSV - Array output
|
124
|
-
|
125
|
-
OSV - StringIO
|
126
|
+
1.000 i/100ms
|
127
|
+
OSV - StringIO 1.000 i/100ms
|
126
128
|
CSV - StringIO 1.000 i/100ms
|
127
|
-
FastCSV - StringIO
|
128
|
-
OSV - Gzipped
|
129
|
+
FastCSV - StringIO 1.000 i/100ms
|
130
|
+
OSV - Gzipped 1.000 i/100ms
|
129
131
|
CSV - Gzipped 1.000 i/100ms
|
130
132
|
Calculating -------------------------------------
|
131
|
-
OSV - Hash output
|
132
|
-
CSV - Hash output
|
133
|
-
OSV - Array output
|
134
|
-
|
133
|
+
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
134
|
+
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
135
|
+
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
136
|
+
OSV - Direct Open Array output
|
137
|
+
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
138
|
+
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
135
139
|
FastCSV - Array output
|
136
|
-
|
137
|
-
OSV - StringIO
|
138
|
-
CSV - StringIO
|
139
|
-
FastCSV - StringIO
|
140
|
-
OSV - Gzipped
|
141
|
-
CSV - Gzipped
|
140
|
+
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
141
|
+
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
142
|
+
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
143
|
+
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
144
|
+
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
145
|
+
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
142
146
|
|
143
147
|
Comparison:
|
144
|
-
|
145
|
-
OSV - Array output:
|
146
|
-
FastCSV -
|
147
|
-
|
148
|
-
OSV - Hash output:
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
CSV - Hash output:
|
148
|
+
OSV - Direct Open Array output: 213629268.6 i/s
|
149
|
+
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
150
|
+
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
151
|
+
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
152
|
+
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
153
|
+
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
154
|
+
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
155
|
+
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
156
|
+
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
157
|
+
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
158
|
+
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
154
159
|
```
|
155
160
|
|
156
161
|
### 1,000,000 lines
|
@@ -158,11 +163,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
|
|
158
163
|
```
|
159
164
|
Benchmarking with 1000001 lines of data
|
160
165
|
|
161
|
-
ruby 3.3.
|
166
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
162
167
|
Warming up --------------------------------------
|
163
168
|
OSV - Hash output 1.000 i/100ms
|
164
169
|
CSV - Hash output 1.000 i/100ms
|
165
170
|
OSV - Array output 1.000 i/100ms
|
171
|
+
OSV - Direct Open Array output
|
172
|
+
1.000 i/100ms
|
166
173
|
CSV - Array output 1.000 i/100ms
|
167
174
|
FastCSV - Array output
|
168
175
|
1.000 i/100ms
|
@@ -172,27 +179,30 @@ FastCSV - Array output
|
|
172
179
|
OSV - Gzipped 1.000 i/100ms
|
173
180
|
CSV - Gzipped 1.000 i/100ms
|
174
181
|
Calculating -------------------------------------
|
175
|
-
OSV - Hash output 0.
|
176
|
-
CSV - Hash output 0.
|
177
|
-
OSV - Array output 1.
|
178
|
-
|
182
|
+
OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
|
183
|
+
CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
|
184
|
+
OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
|
185
|
+
OSV - Direct Open Array output
|
186
|
+
1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
|
187
|
+
CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
|
179
188
|
FastCSV - Array output
|
180
|
-
0.
|
181
|
-
OSV - StringIO 0.
|
182
|
-
CSV - StringIO 0.
|
183
|
-
FastCSV - StringIO
|
184
|
-
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) -
|
185
|
-
CSV - Gzipped 0.
|
189
|
+
0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
|
190
|
+
OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
|
191
|
+
CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
|
192
|
+
FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
|
193
|
+
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
|
194
|
+
CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
|
186
195
|
|
187
196
|
Comparison:
|
188
|
-
|
189
|
-
|
190
|
-
FastCSV -
|
191
|
-
|
192
|
-
|
193
|
-
OSV - Gzipped: 0.4 i/s -
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
197
|
+
OSV - Direct Open Array output: 1.6 i/s
|
198
|
+
OSV - Array output: 1.5 i/s - 1.08x slower
|
199
|
+
FastCSV - StringIO: 0.9 i/s - 1.76x slower
|
200
|
+
OSV - StringIO: 0.6 i/s - 2.87x slower
|
201
|
+
OSV - Hash output: 0.5 i/s - 3.30x slower
|
202
|
+
OSV - Gzipped: 0.4 i/s - 3.72x slower
|
203
|
+
FastCSV - Array output: 0.3 i/s - 4.99x slower
|
204
|
+
CSV - Array output: 0.2 i/s - 8.88x slower
|
205
|
+
CSV - StringIO: 0.1 i/s - 11.55x slower
|
206
|
+
CSV - Hash output: 0.1 i/s - 14.24x slower
|
207
|
+
CSV - Gzipped: 0.1 i/s - 15.68x slower
|
198
208
|
```
|
data/Rakefile
CHANGED
data/ext/osv/Cargo.toml
CHANGED
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -3,18 +3,21 @@ use super::{
|
|
3
3
|
parser::RecordParser,
|
4
4
|
read_impl::ReadImpl,
|
5
5
|
reader::RecordReader,
|
6
|
+
READ_BUFFER_SIZE,
|
6
7
|
};
|
7
8
|
use flate2::read::GzDecoder;
|
8
9
|
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
|
9
10
|
use std::{
|
10
11
|
fs::File,
|
11
|
-
io::{self, Read},
|
12
|
+
io::{self, BufReader, Read},
|
12
13
|
marker::PhantomData,
|
13
14
|
os::fd::FromRawFd,
|
14
15
|
thread,
|
15
16
|
};
|
16
17
|
use thiserror::Error;
|
17
18
|
|
19
|
+
pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
|
20
|
+
|
18
21
|
#[derive(Error, Debug)]
|
19
22
|
pub enum ReaderError {
|
20
23
|
#[error("Failed to get file descriptor: {0}")]
|
@@ -68,7 +71,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
68
71
|
delimiter: b',',
|
69
72
|
quote_char: b'"',
|
70
73
|
null_string: None,
|
71
|
-
buffer:
|
74
|
+
buffer: BUFFER_CHANNEL_SIZE,
|
72
75
|
flexible: false,
|
73
76
|
flexible_default: None,
|
74
77
|
_phantom: PhantomData,
|
@@ -128,7 +131,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
128
131
|
}
|
129
132
|
|
130
133
|
let file = unsafe { File::from_raw_fd(fd) };
|
131
|
-
Ok(Box::new(file))
|
134
|
+
Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
|
132
135
|
}
|
133
136
|
|
134
137
|
fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
@@ -136,24 +139,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
136
139
|
let file = File::open(&path)?;
|
137
140
|
|
138
141
|
Ok(if path.ends_with(".gz") {
|
139
|
-
Box::new(GzDecoder::new(
|
142
|
+
Box::new(GzDecoder::new(BufReader::with_capacity(
|
143
|
+
READ_BUFFER_SIZE,
|
144
|
+
file,
|
145
|
+
)))
|
140
146
|
} else {
|
141
|
-
Box::new(file)
|
147
|
+
Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
|
142
148
|
})
|
143
149
|
}
|
144
150
|
|
145
|
-
fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
151
|
+
fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
|
146
152
|
let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
|
147
153
|
let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
|
148
154
|
|
149
155
|
if self.to_read.is_kind_of(string_io) {
|
150
|
-
self.handle_string_io()
|
156
|
+
self.handle_string_io().map(|r| (r, false))
|
151
157
|
} else if self.to_read.is_kind_of(gzip_reader_class) {
|
152
158
|
Err(ReaderError::UnsupportedGzipReader)
|
153
159
|
} else if self.to_read.is_kind_of(self.ruby.class_io()) {
|
154
|
-
self.handle_file_descriptor()
|
160
|
+
self.handle_file_descriptor().map(|r| (r, true))
|
155
161
|
} else {
|
156
|
-
self.handle_file_path()
|
162
|
+
self.handle_file_path().map(|r| (r, false))
|
157
163
|
}
|
158
164
|
}
|
159
165
|
|
@@ -175,7 +181,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
175
181
|
|
176
182
|
pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
|
177
183
|
match self.get_reader() {
|
178
|
-
Ok(readable) => self.build_multi_threaded(readable),
|
184
|
+
Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
|
179
185
|
Err(_) => {
|
180
186
|
let readable = self.get_single_threaded_reader()?;
|
181
187
|
self.build_single_threaded(readable)
|
@@ -186,6 +192,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
186
192
|
fn build_multi_threaded(
|
187
193
|
self,
|
188
194
|
readable: Box<dyn Read + Send + 'static>,
|
195
|
+
should_forget: bool,
|
189
196
|
) -> Result<RecordReader<T>, ReaderError> {
|
190
197
|
let flexible = self.flexible || self.flexible_default.is_some();
|
191
198
|
let mut reader = csv::ReaderBuilder::new()
|
@@ -204,7 +211,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
204
211
|
|
205
212
|
let flexible_default = self.flexible_default.clone();
|
206
213
|
let handle = thread::spawn(move || {
|
207
|
-
let mut record = csv::StringRecord::
|
214
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
208
215
|
while let Ok(true) = reader.read_record(&mut record) {
|
209
216
|
let row = T::parse(
|
210
217
|
&static_headers,
|
@@ -216,8 +223,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
216
223
|
break;
|
217
224
|
}
|
218
225
|
}
|
219
|
-
|
220
|
-
|
226
|
+
if should_forget {
|
227
|
+
let file_to_forget = reader.into_inner();
|
228
|
+
std::mem::forget(file_to_forget);
|
229
|
+
}
|
221
230
|
});
|
222
231
|
|
223
232
|
Ok(RecordReader {
|
@@ -257,30 +266,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
257
266
|
|
258
267
|
struct RubyReader {
|
259
268
|
inner: Value,
|
269
|
+
buffer: Option<Vec<u8>>,
|
270
|
+
offset: usize,
|
260
271
|
}
|
261
272
|
|
262
273
|
impl RubyReader {
|
263
274
|
fn new(inner: Value) -> Self {
|
264
|
-
Self {
|
275
|
+
Self {
|
276
|
+
inner,
|
277
|
+
buffer: None,
|
278
|
+
offset: 0,
|
279
|
+
}
|
265
280
|
}
|
266
281
|
}
|
267
282
|
|
283
|
+
// Read the entire inner into a vector and then read future reads from that vector with offset
|
268
284
|
impl Read for RubyReader {
|
269
285
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
270
|
-
|
286
|
+
// If we have an existing buffer, read from it
|
287
|
+
if let Some(buffer) = self.buffer.as_ref() {
|
288
|
+
let remaining = buffer.len() - self.offset;
|
289
|
+
let copy_size = remaining.min(buf.len());
|
290
|
+
buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
|
291
|
+
self.offset += copy_size;
|
292
|
+
return Ok(copy_size);
|
293
|
+
}
|
294
|
+
|
295
|
+
// No buffer yet - read the entire content from Ruby
|
296
|
+
let result = self.inner.funcall::<_, _, Value>("read", ());
|
271
297
|
match result {
|
272
298
|
Ok(data) => {
|
273
299
|
if data.is_nil() {
|
274
|
-
return Ok(0);
|
300
|
+
return Ok(0); // EOF
|
275
301
|
}
|
276
302
|
|
277
303
|
let string = RString::from_value(data).ok_or_else(|| {
|
278
304
|
io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
|
279
305
|
})?;
|
280
306
|
let bytes = unsafe { string.as_slice() };
|
281
|
-
|
282
|
-
|
283
|
-
|
307
|
+
|
308
|
+
// Store the entire content in the buffer
|
309
|
+
self.buffer = Some(bytes.to_vec());
|
310
|
+
self.offset = 0;
|
311
|
+
|
312
|
+
// Read initial chunk
|
313
|
+
let copy_size = bytes.len().min(buf.len());
|
314
|
+
buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
|
315
|
+
self.offset = copy_size;
|
316
|
+
|
317
|
+
Ok(copy_size)
|
284
318
|
}
|
285
319
|
Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
|
286
320
|
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use std::collections::HashMap;
|
2
|
+
use std::hash::BuildHasher;
|
2
3
|
|
3
4
|
pub trait RecordParser {
|
4
5
|
type Output;
|
@@ -11,7 +12,7 @@ pub trait RecordParser {
|
|
11
12
|
) -> Self::Output;
|
12
13
|
}
|
13
14
|
|
14
|
-
impl RecordParser for HashMap<&'static str, Option<String
|
15
|
+
impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
|
15
16
|
type Output = Self;
|
16
17
|
|
17
18
|
#[inline]
|
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
|
|
21
22
|
null_string: Option<&str>,
|
22
23
|
flexible_default: Option<&str>,
|
23
24
|
) -> Self::Output {
|
24
|
-
let mut map = HashMap::
|
25
|
-
headers.iter().enumerate().for_each(|(i, header)| {
|
25
|
+
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
26
|
+
headers.iter().enumerate().for_each(|(i, &header)| {
|
26
27
|
let value = record.get(i).map_or_else(
|
27
|
-
|| flexible_default.map(
|
28
|
+
|| flexible_default.map(ToString::to_string),
|
28
29
|
|field| {
|
29
30
|
if null_string == Some(field) {
|
30
31
|
None
|
31
32
|
} else if field.is_empty() {
|
32
33
|
Some(String::new())
|
33
34
|
} else {
|
34
|
-
Some(field.
|
35
|
+
Some(field.into())
|
35
36
|
}
|
36
37
|
},
|
37
38
|
);
|
38
|
-
map.insert(
|
39
|
+
map.insert(header, value);
|
39
40
|
});
|
40
41
|
map
|
41
42
|
}
|
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
|
|
53
54
|
) -> Self::Output {
|
54
55
|
let target_len = headers.len();
|
55
56
|
let mut vec = Vec::with_capacity(target_len);
|
56
|
-
|
57
|
-
|
57
|
+
for field in record.iter() {
|
58
|
+
let value = if Some(field) == null_string {
|
58
59
|
None
|
59
60
|
} else if field.is_empty() {
|
60
61
|
Some(String::new())
|
61
62
|
} else {
|
62
|
-
Some(field.
|
63
|
-
}
|
64
|
-
|
63
|
+
Some(field.into())
|
64
|
+
};
|
65
|
+
vec.push(value);
|
66
|
+
}
|
65
67
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
vec.push(Some(default.to_string()));
|
68
|
+
if vec.len() < target_len {
|
69
|
+
if let Some(default) = flexible_default {
|
70
|
+
vec.resize_with(target_len, || Some(default.to_string()));
|
70
71
|
}
|
71
72
|
}
|
72
73
|
vec
|
@@ -1,6 +1,8 @@
|
|
1
1
|
use super::{header_cache::StringCache, parser::RecordParser};
|
2
2
|
use std::{io::Read, thread};
|
3
3
|
|
4
|
+
pub(crate) const READ_BUFFER_SIZE: usize = 8192;
|
5
|
+
|
4
6
|
pub enum ReadImpl<T: RecordParser> {
|
5
7
|
SingleThreaded {
|
6
8
|
reader: csv::Reader<Box<dyn Read>>,
|
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
|
|
36
38
|
null_string,
|
37
39
|
flexible_default,
|
38
40
|
} => {
|
39
|
-
let mut record = csv::StringRecord::
|
41
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
40
42
|
match reader.read_record(&mut record) {
|
41
43
|
Ok(true) => Some(T::parse(
|
42
44
|
headers,
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
use magnus::{IntoValue, Ruby, Value};
|
2
|
-
use std::collections::HashMap;
|
2
|
+
use std::{collections::HashMap, hash::BuildHasher};
|
3
3
|
|
4
4
|
#[derive(Debug)]
|
5
|
-
pub enum CsvRecord {
|
5
|
+
pub enum CsvRecord<S: BuildHasher + Default> {
|
6
6
|
Vec(Vec<Option<String>>),
|
7
|
-
Map(HashMap<&'static str, Option<String
|
7
|
+
Map(HashMap<&'static str, Option<String>, S>),
|
8
8
|
}
|
9
9
|
|
10
|
-
impl IntoValue for CsvRecord {
|
10
|
+
impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
|
11
11
|
#[inline]
|
12
12
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
13
13
|
match self {
|
data/ext/osv/src/reader.rs
CHANGED
@@ -3,11 +3,12 @@ use crate::utils::*;
|
|
3
3
|
use magnus::value::ReprValue;
|
4
4
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
5
5
|
use std::collections::HashMap;
|
6
|
+
use xxhash_rust::xxh3::Xxh3Builder;
|
6
7
|
|
7
8
|
pub fn parse_csv(
|
8
9
|
rb_self: Value,
|
9
10
|
args: &[Value],
|
10
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
11
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
11
12
|
let ruby = unsafe { Ruby::get_unchecked() };
|
12
13
|
|
13
14
|
let CsvArgs {
|
@@ -37,18 +38,20 @@ pub fn parse_csv(
|
|
37
38
|
});
|
38
39
|
}
|
39
40
|
|
40
|
-
let iter: Box<dyn Iterator<Item = CsvRecord
|
41
|
+
let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
|
41
42
|
"hash" => Box::new(
|
42
|
-
RecordReaderBuilder::<HashMap<&'static str, Option<String
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
43
|
+
RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
|
44
|
+
&ruby, to_read,
|
45
|
+
)
|
46
|
+
.has_headers(has_headers)
|
47
|
+
.flexible(flexible)
|
48
|
+
.flexible_default(flexible_default)
|
49
|
+
.delimiter(delimiter)
|
50
|
+
.quote_char(quote_char)
|
51
|
+
.null_string(null_string)
|
52
|
+
.buffer(buffer_size)
|
53
|
+
.build()?
|
54
|
+
.map(CsvRecord::Map),
|
52
55
|
),
|
53
56
|
"array" => Box::new(
|
54
57
|
RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
|
@@ -88,7 +91,7 @@ struct EnumeratorArgs {
|
|
88
91
|
|
89
92
|
fn create_enumerator(
|
90
93
|
args: EnumeratorArgs,
|
91
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
94
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
92
95
|
let kwargs = RHash::new();
|
93
96
|
kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
|
94
97
|
kwargs.aset(
|
data/ext/osv/src/utils.rs
CHANGED
@@ -4,6 +4,8 @@ use magnus::{
|
|
4
4
|
Error, RString, Ruby, Symbol, Value,
|
5
5
|
};
|
6
6
|
|
7
|
+
use crate::csv::BUFFER_CHANNEL_SIZE;
|
8
|
+
|
7
9
|
#[derive(Debug)]
|
8
10
|
pub struct CsvArgs {
|
9
11
|
pub to_read: Value,
|
@@ -81,7 +83,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
81
83
|
|
82
84
|
let null_string = kwargs.optional.3.unwrap_or_default();
|
83
85
|
|
84
|
-
let buffer_size = kwargs.optional.4.unwrap_or(
|
86
|
+
let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
|
85
87
|
|
86
88
|
let result_type = match kwargs.optional.5 {
|
87
89
|
Some(value) => {
|
data/lib/osv/version.rb
CHANGED