osv 0.3.8 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/README.md +60 -50
- data/Rakefile +1 -1
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +53 -19
- data/ext/osv/src/csv/mod.rs +2 -0
- data/ext/osv/src/csv/parser.rs +16 -15
- data/ext/osv/src/csv/read_impl.rs +3 -1
- data/ext/osv/src/csv/record.rs +4 -4
- data/ext/osv/src/reader.rs +16 -13
- data/ext/osv/src/utils.rs +3 -1
- data/lib/osv/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 935cf4c277ef52eb1b1c4a4c27d2fe54461489b38b4203a1e574ac1a9e3298df
|
|
4
|
+
data.tar.gz: aa13944483b0f9fa8963a830d5f2a2932110e86bb32c722565c2b4d5f9dbef3c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ea38b823a0423ef8883c04f6b677b16ba5d55f9a23faf434fc758788d238972b17b6e2435f0ec92661a32ee31fafb39170c81933d67a201af8cd528678e6261a
|
|
7
|
+
data.tar.gz: 77a62c7d62f36bd4143166d65c64eecccece56f1ebb969295cf7af6ff8639af9df71b3923d3ba13590824db68604e90cdc61e22490e81a3d0e24bf1af5f0be47
|
data/Cargo.lock
CHANGED
|
@@ -274,6 +274,7 @@ dependencies = [
|
|
|
274
274
|
"serde",
|
|
275
275
|
"serde_magnus",
|
|
276
276
|
"thiserror",
|
|
277
|
+
"xxhash-rust",
|
|
277
278
|
]
|
|
278
279
|
|
|
279
280
|
[[package]]
|
|
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
|
|
|
526
527
|
version = "0.52.6"
|
|
527
528
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
528
529
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|
530
|
+
|
|
531
|
+
[[package]]
|
|
532
|
+
name = "xxhash-rust"
|
|
533
|
+
version = "0.8.14"
|
|
534
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
535
|
+
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
data/README.md
CHANGED
|
@@ -112,45 +112,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
|
|
|
112
112
|
### 10,000 lines
|
|
113
113
|
|
|
114
114
|
```
|
|
115
|
-
Benchmarking with
|
|
115
|
+
Benchmarking with 100001 lines of data
|
|
116
116
|
|
|
117
|
-
ruby 3.3.
|
|
117
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
|
118
118
|
Warming up --------------------------------------
|
|
119
|
-
OSV - Hash output
|
|
119
|
+
OSV - Hash output 1.000 i/100ms
|
|
120
120
|
CSV - Hash output 1.000 i/100ms
|
|
121
|
-
OSV - Array output
|
|
122
|
-
|
|
121
|
+
OSV - Array output 1.000 i/100ms
|
|
122
|
+
OSV - Direct Open Array output
|
|
123
|
+
12.719M i/100ms
|
|
124
|
+
CSV - Array output 1.000 i/100ms
|
|
123
125
|
FastCSV - Array output
|
|
124
|
-
|
|
125
|
-
OSV - StringIO
|
|
126
|
+
1.000 i/100ms
|
|
127
|
+
OSV - StringIO 1.000 i/100ms
|
|
126
128
|
CSV - StringIO 1.000 i/100ms
|
|
127
|
-
FastCSV - StringIO
|
|
128
|
-
OSV - Gzipped
|
|
129
|
+
FastCSV - StringIO 1.000 i/100ms
|
|
130
|
+
OSV - Gzipped 1.000 i/100ms
|
|
129
131
|
CSV - Gzipped 1.000 i/100ms
|
|
130
132
|
Calculating -------------------------------------
|
|
131
|
-
OSV - Hash output
|
|
132
|
-
CSV - Hash output
|
|
133
|
-
OSV - Array output
|
|
134
|
-
|
|
133
|
+
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
|
134
|
+
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
|
135
|
+
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
|
136
|
+
OSV - Direct Open Array output
|
|
137
|
+
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
|
138
|
+
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
|
135
139
|
FastCSV - Array output
|
|
136
|
-
|
|
137
|
-
OSV - StringIO
|
|
138
|
-
CSV - StringIO
|
|
139
|
-
FastCSV - StringIO
|
|
140
|
-
OSV - Gzipped
|
|
141
|
-
CSV - Gzipped
|
|
140
|
+
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
|
141
|
+
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
|
142
|
+
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
|
143
|
+
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
|
144
|
+
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
|
145
|
+
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
|
142
146
|
|
|
143
147
|
Comparison:
|
|
144
|
-
|
|
145
|
-
OSV - Array output:
|
|
146
|
-
FastCSV -
|
|
147
|
-
|
|
148
|
-
OSV - Hash output:
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
CSV - Hash output:
|
|
148
|
+
OSV - Direct Open Array output: 213629268.6 i/s
|
|
149
|
+
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
|
150
|
+
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
|
151
|
+
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
|
152
|
+
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
|
153
|
+
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
|
154
|
+
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
|
155
|
+
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
|
156
|
+
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
|
157
|
+
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
|
158
|
+
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
|
154
159
|
```
|
|
155
160
|
|
|
156
161
|
### 1,000,000 lines
|
|
@@ -158,11 +163,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
|
|
|
158
163
|
```
|
|
159
164
|
Benchmarking with 1000001 lines of data
|
|
160
165
|
|
|
161
|
-
ruby 3.3.
|
|
166
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
|
162
167
|
Warming up --------------------------------------
|
|
163
168
|
OSV - Hash output 1.000 i/100ms
|
|
164
169
|
CSV - Hash output 1.000 i/100ms
|
|
165
170
|
OSV - Array output 1.000 i/100ms
|
|
171
|
+
OSV - Direct Open Array output
|
|
172
|
+
1.000 i/100ms
|
|
166
173
|
CSV - Array output 1.000 i/100ms
|
|
167
174
|
FastCSV - Array output
|
|
168
175
|
1.000 i/100ms
|
|
@@ -172,27 +179,30 @@ FastCSV - Array output
|
|
|
172
179
|
OSV - Gzipped 1.000 i/100ms
|
|
173
180
|
CSV - Gzipped 1.000 i/100ms
|
|
174
181
|
Calculating -------------------------------------
|
|
175
|
-
OSV - Hash output 0.
|
|
176
|
-
CSV - Hash output 0.
|
|
177
|
-
OSV - Array output 1.
|
|
178
|
-
|
|
182
|
+
OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
|
|
183
|
+
CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
|
|
184
|
+
OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
|
|
185
|
+
OSV - Direct Open Array output
|
|
186
|
+
1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
|
|
187
|
+
CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
|
|
179
188
|
FastCSV - Array output
|
|
180
|
-
0.
|
|
181
|
-
OSV - StringIO 0.
|
|
182
|
-
CSV - StringIO 0.
|
|
183
|
-
FastCSV - StringIO
|
|
184
|
-
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) -
|
|
185
|
-
CSV - Gzipped 0.
|
|
189
|
+
0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
|
|
190
|
+
OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
|
|
191
|
+
CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
|
|
192
|
+
FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
|
|
193
|
+
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
|
|
194
|
+
CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
|
|
186
195
|
|
|
187
196
|
Comparison:
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
FastCSV -
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
OSV - Gzipped: 0.4 i/s -
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
197
|
+
OSV - Direct Open Array output: 1.6 i/s
|
|
198
|
+
OSV - Array output: 1.5 i/s - 1.08x slower
|
|
199
|
+
FastCSV - StringIO: 0.9 i/s - 1.76x slower
|
|
200
|
+
OSV - StringIO: 0.6 i/s - 2.87x slower
|
|
201
|
+
OSV - Hash output: 0.5 i/s - 3.30x slower
|
|
202
|
+
OSV - Gzipped: 0.4 i/s - 3.72x slower
|
|
203
|
+
FastCSV - Array output: 0.3 i/s - 4.99x slower
|
|
204
|
+
CSV - Array output: 0.2 i/s - 8.88x slower
|
|
205
|
+
CSV - StringIO: 0.1 i/s - 11.55x slower
|
|
206
|
+
CSV - Hash output: 0.1 i/s - 14.24x slower
|
|
207
|
+
CSV - Gzipped: 0.1 i/s - 15.68x slower
|
|
198
208
|
```
|
data/Rakefile
CHANGED
data/ext/osv/Cargo.toml
CHANGED
data/ext/osv/src/csv/builder.rs
CHANGED
|
@@ -3,18 +3,21 @@ use super::{
|
|
|
3
3
|
parser::RecordParser,
|
|
4
4
|
read_impl::ReadImpl,
|
|
5
5
|
reader::RecordReader,
|
|
6
|
+
READ_BUFFER_SIZE,
|
|
6
7
|
};
|
|
7
8
|
use flate2::read::GzDecoder;
|
|
8
9
|
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
|
|
9
10
|
use std::{
|
|
10
11
|
fs::File,
|
|
11
|
-
io::{self, Read},
|
|
12
|
+
io::{self, BufReader, Read},
|
|
12
13
|
marker::PhantomData,
|
|
13
14
|
os::fd::FromRawFd,
|
|
14
15
|
thread,
|
|
15
16
|
};
|
|
16
17
|
use thiserror::Error;
|
|
17
18
|
|
|
19
|
+
pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
|
|
20
|
+
|
|
18
21
|
#[derive(Error, Debug)]
|
|
19
22
|
pub enum ReaderError {
|
|
20
23
|
#[error("Failed to get file descriptor: {0}")]
|
|
@@ -68,7 +71,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
68
71
|
delimiter: b',',
|
|
69
72
|
quote_char: b'"',
|
|
70
73
|
null_string: None,
|
|
71
|
-
buffer:
|
|
74
|
+
buffer: BUFFER_CHANNEL_SIZE,
|
|
72
75
|
flexible: false,
|
|
73
76
|
flexible_default: None,
|
|
74
77
|
_phantom: PhantomData,
|
|
@@ -128,7 +131,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
128
131
|
}
|
|
129
132
|
|
|
130
133
|
let file = unsafe { File::from_raw_fd(fd) };
|
|
131
|
-
Ok(Box::new(file))
|
|
134
|
+
Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
|
|
132
135
|
}
|
|
133
136
|
|
|
134
137
|
fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
|
@@ -136,24 +139,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
136
139
|
let file = File::open(&path)?;
|
|
137
140
|
|
|
138
141
|
Ok(if path.ends_with(".gz") {
|
|
139
|
-
Box::new(GzDecoder::new(
|
|
142
|
+
Box::new(GzDecoder::new(BufReader::with_capacity(
|
|
143
|
+
READ_BUFFER_SIZE,
|
|
144
|
+
file,
|
|
145
|
+
)))
|
|
140
146
|
} else {
|
|
141
|
-
Box::new(file)
|
|
147
|
+
Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
|
|
142
148
|
})
|
|
143
149
|
}
|
|
144
150
|
|
|
145
|
-
fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
|
151
|
+
fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
|
|
146
152
|
let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
|
|
147
153
|
let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
|
|
148
154
|
|
|
149
155
|
if self.to_read.is_kind_of(string_io) {
|
|
150
|
-
self.handle_string_io()
|
|
156
|
+
self.handle_string_io().map(|r| (r, false))
|
|
151
157
|
} else if self.to_read.is_kind_of(gzip_reader_class) {
|
|
152
158
|
Err(ReaderError::UnsupportedGzipReader)
|
|
153
159
|
} else if self.to_read.is_kind_of(self.ruby.class_io()) {
|
|
154
|
-
self.handle_file_descriptor()
|
|
160
|
+
self.handle_file_descriptor().map(|r| (r, true))
|
|
155
161
|
} else {
|
|
156
|
-
self.handle_file_path()
|
|
162
|
+
self.handle_file_path().map(|r| (r, false))
|
|
157
163
|
}
|
|
158
164
|
}
|
|
159
165
|
|
|
@@ -175,7 +181,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
175
181
|
|
|
176
182
|
pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
|
|
177
183
|
match self.get_reader() {
|
|
178
|
-
Ok(readable) => self.build_multi_threaded(readable),
|
|
184
|
+
Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
|
|
179
185
|
Err(_) => {
|
|
180
186
|
let readable = self.get_single_threaded_reader()?;
|
|
181
187
|
self.build_single_threaded(readable)
|
|
@@ -186,6 +192,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
186
192
|
fn build_multi_threaded(
|
|
187
193
|
self,
|
|
188
194
|
readable: Box<dyn Read + Send + 'static>,
|
|
195
|
+
should_forget: bool,
|
|
189
196
|
) -> Result<RecordReader<T>, ReaderError> {
|
|
190
197
|
let flexible = self.flexible || self.flexible_default.is_some();
|
|
191
198
|
let mut reader = csv::ReaderBuilder::new()
|
|
@@ -204,7 +211,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
204
211
|
|
|
205
212
|
let flexible_default = self.flexible_default.clone();
|
|
206
213
|
let handle = thread::spawn(move || {
|
|
207
|
-
let mut record = csv::StringRecord::
|
|
214
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
|
208
215
|
while let Ok(true) = reader.read_record(&mut record) {
|
|
209
216
|
let row = T::parse(
|
|
210
217
|
&static_headers,
|
|
@@ -216,8 +223,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
216
223
|
break;
|
|
217
224
|
}
|
|
218
225
|
}
|
|
219
|
-
|
|
220
|
-
|
|
226
|
+
if should_forget {
|
|
227
|
+
let file_to_forget = reader.into_inner();
|
|
228
|
+
std::mem::forget(file_to_forget);
|
|
229
|
+
}
|
|
221
230
|
});
|
|
222
231
|
|
|
223
232
|
Ok(RecordReader {
|
|
@@ -257,30 +266,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
|
257
266
|
|
|
258
267
|
struct RubyReader {
|
|
259
268
|
inner: Value,
|
|
269
|
+
buffer: Option<Vec<u8>>,
|
|
270
|
+
offset: usize,
|
|
260
271
|
}
|
|
261
272
|
|
|
262
273
|
impl RubyReader {
|
|
263
274
|
fn new(inner: Value) -> Self {
|
|
264
|
-
Self {
|
|
275
|
+
Self {
|
|
276
|
+
inner,
|
|
277
|
+
buffer: None,
|
|
278
|
+
offset: 0,
|
|
279
|
+
}
|
|
265
280
|
}
|
|
266
281
|
}
|
|
267
282
|
|
|
283
|
+
// Read the entire inner into a vector and then read future reads from that vector with offset
|
|
268
284
|
impl Read for RubyReader {
|
|
269
285
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
270
|
-
|
|
286
|
+
// If we have an existing buffer, read from it
|
|
287
|
+
if let Some(buffer) = self.buffer.as_ref() {
|
|
288
|
+
let remaining = buffer.len() - self.offset;
|
|
289
|
+
let copy_size = remaining.min(buf.len());
|
|
290
|
+
buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
|
|
291
|
+
self.offset += copy_size;
|
|
292
|
+
return Ok(copy_size);
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
// No buffer yet - read the entire content from Ruby
|
|
296
|
+
let result = self.inner.funcall::<_, _, Value>("read", ());
|
|
271
297
|
match result {
|
|
272
298
|
Ok(data) => {
|
|
273
299
|
if data.is_nil() {
|
|
274
|
-
return Ok(0);
|
|
300
|
+
return Ok(0); // EOF
|
|
275
301
|
}
|
|
276
302
|
|
|
277
303
|
let string = RString::from_value(data).ok_or_else(|| {
|
|
278
304
|
io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
|
|
279
305
|
})?;
|
|
280
306
|
let bytes = unsafe { string.as_slice() };
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
307
|
+
|
|
308
|
+
// Store the entire content in the buffer
|
|
309
|
+
self.buffer = Some(bytes.to_vec());
|
|
310
|
+
self.offset = 0;
|
|
311
|
+
|
|
312
|
+
// Read initial chunk
|
|
313
|
+
let copy_size = bytes.len().min(buf.len());
|
|
314
|
+
buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
|
|
315
|
+
self.offset = copy_size;
|
|
316
|
+
|
|
317
|
+
Ok(copy_size)
|
|
284
318
|
}
|
|
285
319
|
Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
|
|
286
320
|
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
use std::collections::HashMap;
|
|
2
|
+
use std::hash::BuildHasher;
|
|
2
3
|
|
|
3
4
|
pub trait RecordParser {
|
|
4
5
|
type Output;
|
|
@@ -11,7 +12,7 @@ pub trait RecordParser {
|
|
|
11
12
|
) -> Self::Output;
|
|
12
13
|
}
|
|
13
14
|
|
|
14
|
-
impl RecordParser for HashMap<&'static str, Option<String
|
|
15
|
+
impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
|
|
15
16
|
type Output = Self;
|
|
16
17
|
|
|
17
18
|
#[inline]
|
|
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
|
|
|
21
22
|
null_string: Option<&str>,
|
|
22
23
|
flexible_default: Option<&str>,
|
|
23
24
|
) -> Self::Output {
|
|
24
|
-
let mut map = HashMap::
|
|
25
|
-
headers.iter().enumerate().for_each(|(i, header)| {
|
|
25
|
+
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
|
26
|
+
headers.iter().enumerate().for_each(|(i, &header)| {
|
|
26
27
|
let value = record.get(i).map_or_else(
|
|
27
|
-
|| flexible_default.map(
|
|
28
|
+
|| flexible_default.map(ToString::to_string),
|
|
28
29
|
|field| {
|
|
29
30
|
if null_string == Some(field) {
|
|
30
31
|
None
|
|
31
32
|
} else if field.is_empty() {
|
|
32
33
|
Some(String::new())
|
|
33
34
|
} else {
|
|
34
|
-
Some(field.
|
|
35
|
+
Some(field.into())
|
|
35
36
|
}
|
|
36
37
|
},
|
|
37
38
|
);
|
|
38
|
-
map.insert(
|
|
39
|
+
map.insert(header, value);
|
|
39
40
|
});
|
|
40
41
|
map
|
|
41
42
|
}
|
|
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
|
|
|
53
54
|
) -> Self::Output {
|
|
54
55
|
let target_len = headers.len();
|
|
55
56
|
let mut vec = Vec::with_capacity(target_len);
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
for field in record.iter() {
|
|
58
|
+
let value = if Some(field) == null_string {
|
|
58
59
|
None
|
|
59
60
|
} else if field.is_empty() {
|
|
60
61
|
Some(String::new())
|
|
61
62
|
} else {
|
|
62
|
-
Some(field.
|
|
63
|
-
}
|
|
64
|
-
|
|
63
|
+
Some(field.into())
|
|
64
|
+
};
|
|
65
|
+
vec.push(value);
|
|
66
|
+
}
|
|
65
67
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
vec.push(Some(default.to_string()));
|
|
68
|
+
if vec.len() < target_len {
|
|
69
|
+
if let Some(default) = flexible_default {
|
|
70
|
+
vec.resize_with(target_len, || Some(default.to_string()));
|
|
70
71
|
}
|
|
71
72
|
}
|
|
72
73
|
vec
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
use super::{header_cache::StringCache, parser::RecordParser};
|
|
2
2
|
use std::{io::Read, thread};
|
|
3
3
|
|
|
4
|
+
pub(crate) const READ_BUFFER_SIZE: usize = 8192;
|
|
5
|
+
|
|
4
6
|
pub enum ReadImpl<T: RecordParser> {
|
|
5
7
|
SingleThreaded {
|
|
6
8
|
reader: csv::Reader<Box<dyn Read>>,
|
|
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
|
|
|
36
38
|
null_string,
|
|
37
39
|
flexible_default,
|
|
38
40
|
} => {
|
|
39
|
-
let mut record = csv::StringRecord::
|
|
41
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
|
40
42
|
match reader.read_record(&mut record) {
|
|
41
43
|
Ok(true) => Some(T::parse(
|
|
42
44
|
headers,
|
data/ext/osv/src/csv/record.rs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
use magnus::{IntoValue, Ruby, Value};
|
|
2
|
-
use std::collections::HashMap;
|
|
2
|
+
use std::{collections::HashMap, hash::BuildHasher};
|
|
3
3
|
|
|
4
4
|
#[derive(Debug)]
|
|
5
|
-
pub enum CsvRecord {
|
|
5
|
+
pub enum CsvRecord<S: BuildHasher + Default> {
|
|
6
6
|
Vec(Vec<Option<String>>),
|
|
7
|
-
Map(HashMap<&'static str, Option<String
|
|
7
|
+
Map(HashMap<&'static str, Option<String>, S>),
|
|
8
8
|
}
|
|
9
9
|
|
|
10
|
-
impl IntoValue for CsvRecord {
|
|
10
|
+
impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
|
|
11
11
|
#[inline]
|
|
12
12
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
|
13
13
|
match self {
|
data/ext/osv/src/reader.rs
CHANGED
|
@@ -3,11 +3,12 @@ use crate::utils::*;
|
|
|
3
3
|
use magnus::value::ReprValue;
|
|
4
4
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
|
5
5
|
use std::collections::HashMap;
|
|
6
|
+
use xxhash_rust::xxh3::Xxh3Builder;
|
|
6
7
|
|
|
7
8
|
pub fn parse_csv(
|
|
8
9
|
rb_self: Value,
|
|
9
10
|
args: &[Value],
|
|
10
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
|
11
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
|
11
12
|
let ruby = unsafe { Ruby::get_unchecked() };
|
|
12
13
|
|
|
13
14
|
let CsvArgs {
|
|
@@ -37,18 +38,20 @@ pub fn parse_csv(
|
|
|
37
38
|
});
|
|
38
39
|
}
|
|
39
40
|
|
|
40
|
-
let iter: Box<dyn Iterator<Item = CsvRecord
|
|
41
|
+
let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
|
|
41
42
|
"hash" => Box::new(
|
|
42
|
-
RecordReaderBuilder::<HashMap<&'static str, Option<String
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
43
|
+
RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
|
|
44
|
+
&ruby, to_read,
|
|
45
|
+
)
|
|
46
|
+
.has_headers(has_headers)
|
|
47
|
+
.flexible(flexible)
|
|
48
|
+
.flexible_default(flexible_default)
|
|
49
|
+
.delimiter(delimiter)
|
|
50
|
+
.quote_char(quote_char)
|
|
51
|
+
.null_string(null_string)
|
|
52
|
+
.buffer(buffer_size)
|
|
53
|
+
.build()?
|
|
54
|
+
.map(CsvRecord::Map),
|
|
52
55
|
),
|
|
53
56
|
"array" => Box::new(
|
|
54
57
|
RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
|
|
@@ -88,7 +91,7 @@ struct EnumeratorArgs {
|
|
|
88
91
|
|
|
89
92
|
fn create_enumerator(
|
|
90
93
|
args: EnumeratorArgs,
|
|
91
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
|
94
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
|
92
95
|
let kwargs = RHash::new();
|
|
93
96
|
kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
|
|
94
97
|
kwargs.aset(
|
data/ext/osv/src/utils.rs
CHANGED
|
@@ -4,6 +4,8 @@ use magnus::{
|
|
|
4
4
|
Error, RString, Ruby, Symbol, Value,
|
|
5
5
|
};
|
|
6
6
|
|
|
7
|
+
use crate::csv::BUFFER_CHANNEL_SIZE;
|
|
8
|
+
|
|
7
9
|
#[derive(Debug)]
|
|
8
10
|
pub struct CsvArgs {
|
|
9
11
|
pub to_read: Value,
|
|
@@ -81,7 +83,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
|
81
83
|
|
|
82
84
|
let null_string = kwargs.optional.3.unwrap_or_default();
|
|
83
85
|
|
|
84
|
-
let buffer_size = kwargs.optional.4.unwrap_or(
|
|
86
|
+
let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
|
|
85
87
|
|
|
86
88
|
let result_type = match kwargs.optional.5 {
|
|
87
89
|
Some(value) => {
|
data/lib/osv/version.rb
CHANGED