osv 0.3.8 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02205de8cef4d5f7633c06720a9e925a2b608116354da4a1678d4746d2197d23
4
- data.tar.gz: 3e1d63323fdaad1b6a60e0a0a63801f98710615d6616c882f0cdce00e36c6e2e
3
+ metadata.gz: 935cf4c277ef52eb1b1c4a4c27d2fe54461489b38b4203a1e574ac1a9e3298df
4
+ data.tar.gz: aa13944483b0f9fa8963a830d5f2a2932110e86bb32c722565c2b4d5f9dbef3c
5
5
  SHA512:
6
- metadata.gz: df6a4a4b86c41010ea671ac0e98c2ee6307e62ceff35dab125868f0ee7edb6d14984348ecd4ac9f913489e5a6be0b364240b461334554385aabe5b3374fe798d
7
- data.tar.gz: d931b888ce9d0ad1cdb1fa3d0be8cd0e526292206742f5adde718f414e9feca97eff3af6d4139d144c18a50e4807650ea9f7582153bcee80cea1e6ed4ce4ef49
6
+ metadata.gz: ea38b823a0423ef8883c04f6b677b16ba5d55f9a23faf434fc758788d238972b17b6e2435f0ec92661a32ee31fafb39170c81933d67a201af8cd528678e6261a
7
+ data.tar.gz: 77a62c7d62f36bd4143166d65c64eecccece56f1ebb969295cf7af6ff8639af9df71b3923d3ba13590824db68604e90cdc61e22490e81a3d0e24bf1af5f0be47
data/Cargo.lock CHANGED
@@ -274,6 +274,7 @@ dependencies = [
274
274
  "serde",
275
275
  "serde_magnus",
276
276
  "thiserror",
277
+ "xxhash-rust",
277
278
  ]
278
279
 
279
280
  [[package]]
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
526
527
  version = "0.52.6"
527
528
  source = "registry+https://github.com/rust-lang/crates.io-index"
528
529
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
+
531
+ [[package]]
532
+ name = "xxhash-rust"
533
+ version = "0.8.14"
534
+ source = "registry+https://github.com/rust-lang/crates.io-index"
535
+ checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
data/README.md CHANGED
@@ -112,45 +112,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
112
112
  ### 10,000 lines
113
113
 
114
114
  ```
115
- Benchmarking with 10001 lines of data
115
+ Benchmarking with 100001 lines of data
116
116
 
117
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
117
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
118
118
  Warming up --------------------------------------
119
- OSV - Hash output 6.000 i/100ms
119
+ OSV - Hash output 1.000 i/100ms
120
120
  CSV - Hash output 1.000 i/100ms
121
- OSV - Array output 18.000 i/100ms
122
- CSV - Array output 2.000 i/100ms
121
+ OSV - Array output 1.000 i/100ms
122
+ OSV - Direct Open Array output
123
+ 12.719M i/100ms
124
+ CSV - Array output 1.000 i/100ms
123
125
  FastCSV - Array output
124
- 9.000 i/100ms
125
- OSV - StringIO 7.000 i/100ms
126
+ 1.000 i/100ms
127
+ OSV - StringIO 1.000 i/100ms
126
128
  CSV - StringIO 1.000 i/100ms
127
- FastCSV - StringIO 20.000 i/100ms
128
- OSV - Gzipped 6.000 i/100ms
129
+ FastCSV - StringIO 1.000 i/100ms
130
+ OSV - Gzipped 1.000 i/100ms
129
131
  CSV - Gzipped 1.000 i/100ms
130
132
  Calculating -------------------------------------
131
- OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
132
- CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
133
- OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
134
- CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
133
+ OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
134
+ CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
135
+ OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
136
+ OSV - Direct Open Array output
137
+ 213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
138
+ CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
135
139
  FastCSV - Array output
136
- 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
137
- OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
138
- CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
139
- FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
140
- OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
141
- CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
140
+ 7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
141
+ OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
142
+ CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
143
+ FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
144
+ OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
145
+ CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
142
146
 
143
147
  Comparison:
144
- FastCSV - StringIO: 200.5 i/s
145
- OSV - Array output: 189.7 i/s - same-ish: difference falls within error
146
- FastCSV - Array output: 97.9 i/s - 2.05x slower
147
- OSV - StringIO: 80.8 i/s - 2.48x slower
148
- OSV - Hash output: 73.4 i/s - 2.73x slower
149
- OSV - Gzipped: 55.2 i/s - 3.63x slower
150
- CSV - Array output: 25.5 i/s - 7.87x slower
151
- CSV - StringIO: 15.9 i/s - 12.63x slower
152
- CSV - Gzipped: 12.6 i/s - 15.92x slower
153
- CSV - Hash output: 11.9 i/s - 16.80x slower
148
+ OSV - Direct Open Array output: 213629268.6 i/s
149
+ OSV - Array output: 17.3 i/s - 12360250.79x slower
150
+ FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
151
+ FastCSV - Array output: 8.0 i/s - 26727225.72x slower
152
+ OSV - Hash output: 6.7 i/s - 31780615.83x slower
153
+ OSV - StringIO: 6.6 i/s - 32239620.60x slower
154
+ OSV - Gzipped: 5.6 i/s - 37881517.48x slower
155
+ CSV - Array output: 2.2 i/s - 97400427.87x slower
156
+ CSV - StringIO: 1.5 i/s - 144580048.04x slower
157
+ CSV - Hash output: 1.2 i/s - 174666591.31x slower
158
+ CSV - Gzipped: 1.2 i/s - 181626018.23x slower
154
159
  ```
155
160
 
156
161
  ### 1,000,000 lines
@@ -158,11 +163,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
158
163
  ```
159
164
  Benchmarking with 1000001 lines of data
160
165
 
161
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
166
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
162
167
  Warming up --------------------------------------
163
168
  OSV - Hash output 1.000 i/100ms
164
169
  CSV - Hash output 1.000 i/100ms
165
170
  OSV - Array output 1.000 i/100ms
171
+ OSV - Direct Open Array output
172
+ 1.000 i/100ms
166
173
  CSV - Array output 1.000 i/100ms
167
174
  FastCSV - Array output
168
175
  1.000 i/100ms
@@ -172,27 +179,30 @@ FastCSV - Array output
172
179
  OSV - Gzipped 1.000 i/100ms
173
180
  CSV - Gzipped 1.000 i/100ms
174
181
  Calculating -------------------------------------
175
- OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
176
- CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
177
- OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
178
- CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
182
+ OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
183
+ CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
184
+ OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
185
+ OSV - Direct Open Array output
186
+ 1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
187
+ CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
179
188
  FastCSV - Array output
180
- 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
181
- OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
182
- CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
183
- FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
184
- OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
185
- CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
189
+ 0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
190
+ OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
191
+ CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
192
+ FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
193
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
194
+ CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
186
195
 
187
196
  Comparison:
188
- OSV - Array output: 1.1 i/s
189
- FastCSV - StringIO: 1.0 i/s - 1.10x slower
190
- FastCSV - Array output: 0.8 i/s - 1.49x slower
191
- OSV - Hash output: 0.6 i/s - 1.98x slower
192
- OSV - StringIO: 0.5 i/s - 2.19x slower
193
- OSV - Gzipped: 0.4 i/s - 2.61x slower
194
- CSV - Array output: 0.2 i/s - 4.86x slower
195
- CSV - StringIO: 0.1 i/s - 8.67x slower
196
- CSV - Hash output: 0.1 i/s - 9.79x slower
197
- CSV - Gzipped: 0.1 i/s - 9.91x slower
197
+ OSV - Direct Open Array output: 1.6 i/s
198
+ OSV - Array output: 1.5 i/s - 1.08x slower
199
+ FastCSV - StringIO: 0.9 i/s - 1.76x slower
200
+ OSV - StringIO: 0.6 i/s - 2.87x slower
201
+ OSV - Hash output: 0.5 i/s - 3.30x slower
202
+ OSV - Gzipped: 0.4 i/s - 3.72x slower
203
+ FastCSV - Array output: 0.3 i/s - 4.99x slower
204
+ CSV - Array output: 0.2 i/s - 8.88x slower
205
+ CSV - StringIO: 0.1 i/s - 11.55x slower
206
+ CSV - Hash output: 0.1 i/s - 14.24x slower
207
+ CSV - Gzipped: 0.1 i/s - 15.68x slower
198
208
  ```
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Rake::ExtensionTask.new("osv") do |c|
11
11
  end
12
12
 
13
13
  task :dev do
14
- ENV["RB_SYS_CARGO_PROFILE"] = "dev"
14
+ ENV["RB_SYS_CARGO_PROFILE"] = "release"
15
15
  end
16
16
 
17
17
  Rake::TestTask.new do |t|
data/ext/osv/Cargo.toml CHANGED
@@ -15,3 +15,4 @@ rb-sys = "^0.9"
15
15
  serde = { version = "1.0", features = ["derive"] }
16
16
  serde_magnus = "0.8.1"
17
17
  thiserror = "2.0"
18
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
@@ -3,18 +3,21 @@ use super::{
3
3
  parser::RecordParser,
4
4
  read_impl::ReadImpl,
5
5
  reader::RecordReader,
6
+ READ_BUFFER_SIZE,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
10
  use std::{
10
11
  fs::File,
11
- io::{self, Read},
12
+ io::{self, BufReader, Read},
12
13
  marker::PhantomData,
13
14
  os::fd::FromRawFd,
14
15
  thread,
15
16
  };
16
17
  use thiserror::Error;
17
18
 
19
+ pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
20
+
18
21
  #[derive(Error, Debug)]
19
22
  pub enum ReaderError {
20
23
  #[error("Failed to get file descriptor: {0}")]
@@ -68,7 +71,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
68
71
  delimiter: b',',
69
72
  quote_char: b'"',
70
73
  null_string: None,
71
- buffer: 1000,
74
+ buffer: BUFFER_CHANNEL_SIZE,
72
75
  flexible: false,
73
76
  flexible_default: None,
74
77
  _phantom: PhantomData,
@@ -128,7 +131,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
128
131
  }
129
132
 
130
133
  let file = unsafe { File::from_raw_fd(fd) };
131
- Ok(Box::new(file))
134
+ Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
132
135
  }
133
136
 
134
137
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -136,24 +139,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
136
139
  let file = File::open(&path)?;
137
140
 
138
141
  Ok(if path.ends_with(".gz") {
139
- Box::new(GzDecoder::new(file))
142
+ Box::new(GzDecoder::new(BufReader::with_capacity(
143
+ READ_BUFFER_SIZE,
144
+ file,
145
+ )))
140
146
  } else {
141
- Box::new(file)
147
+ Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
142
148
  })
143
149
  }
144
150
 
145
- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
151
+ fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
146
152
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
147
153
  let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
148
154
 
149
155
  if self.to_read.is_kind_of(string_io) {
150
- self.handle_string_io()
156
+ self.handle_string_io().map(|r| (r, false))
151
157
  } else if self.to_read.is_kind_of(gzip_reader_class) {
152
158
  Err(ReaderError::UnsupportedGzipReader)
153
159
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
154
- self.handle_file_descriptor()
160
+ self.handle_file_descriptor().map(|r| (r, true))
155
161
  } else {
156
- self.handle_file_path()
162
+ self.handle_file_path().map(|r| (r, false))
157
163
  }
158
164
  }
159
165
 
@@ -175,7 +181,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
175
181
 
176
182
  pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
177
183
  match self.get_reader() {
178
- Ok(readable) => self.build_multi_threaded(readable),
184
+ Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
179
185
  Err(_) => {
180
186
  let readable = self.get_single_threaded_reader()?;
181
187
  self.build_single_threaded(readable)
@@ -186,6 +192,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
186
192
  fn build_multi_threaded(
187
193
  self,
188
194
  readable: Box<dyn Read + Send + 'static>,
195
+ should_forget: bool,
189
196
  ) -> Result<RecordReader<T>, ReaderError> {
190
197
  let flexible = self.flexible || self.flexible_default.is_some();
191
198
  let mut reader = csv::ReaderBuilder::new()
@@ -204,7 +211,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
204
211
 
205
212
  let flexible_default = self.flexible_default.clone();
206
213
  let handle = thread::spawn(move || {
207
- let mut record = csv::StringRecord::new();
214
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
208
215
  while let Ok(true) = reader.read_record(&mut record) {
209
216
  let row = T::parse(
210
217
  &static_headers,
@@ -216,8 +223,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
216
223
  break;
217
224
  }
218
225
  }
219
- let file_to_forget = reader.into_inner();
220
- std::mem::forget(file_to_forget);
226
+ if should_forget {
227
+ let file_to_forget = reader.into_inner();
228
+ std::mem::forget(file_to_forget);
229
+ }
221
230
  });
222
231
 
223
232
  Ok(RecordReader {
@@ -257,30 +266,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
257
266
 
258
267
  struct RubyReader {
259
268
  inner: Value,
269
+ buffer: Option<Vec<u8>>,
270
+ offset: usize,
260
271
  }
261
272
 
262
273
  impl RubyReader {
263
274
  fn new(inner: Value) -> Self {
264
- Self { inner }
275
+ Self {
276
+ inner,
277
+ buffer: None,
278
+ offset: 0,
279
+ }
265
280
  }
266
281
  }
267
282
 
283
+ // Read the entire inner into a vector and then read future reads from that vector with offset
268
284
  impl Read for RubyReader {
269
285
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
270
- let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
286
+ // If we have an existing buffer, read from it
287
+ if let Some(buffer) = self.buffer.as_ref() {
288
+ let remaining = buffer.len() - self.offset;
289
+ let copy_size = remaining.min(buf.len());
290
+ buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
291
+ self.offset += copy_size;
292
+ return Ok(copy_size);
293
+ }
294
+
295
+ // No buffer yet - read the entire content from Ruby
296
+ let result = self.inner.funcall::<_, _, Value>("read", ());
271
297
  match result {
272
298
  Ok(data) => {
273
299
  if data.is_nil() {
274
- return Ok(0);
300
+ return Ok(0); // EOF
275
301
  }
276
302
 
277
303
  let string = RString::from_value(data).ok_or_else(|| {
278
304
  io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
279
305
  })?;
280
306
  let bytes = unsafe { string.as_slice() };
281
- let len = bytes.len().min(buf.len());
282
- buf[..len].copy_from_slice(&bytes[..len]);
283
- Ok(len)
307
+
308
+ // Store the entire content in the buffer
309
+ self.buffer = Some(bytes.to_vec());
310
+ self.offset = 0;
311
+
312
+ // Read initial chunk
313
+ let copy_size = bytes.len().min(buf.len());
314
+ buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
315
+ self.offset = copy_size;
316
+
317
+ Ok(copy_size)
284
318
  }
285
319
  Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
286
320
  }
@@ -6,4 +6,6 @@ mod reader;
6
6
  mod record;
7
7
 
8
8
  pub use builder::RecordReaderBuilder;
9
+ pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
+ pub(crate) use read_impl::READ_BUFFER_SIZE;
9
11
  pub use record::CsvRecord;
@@ -1,4 +1,5 @@
1
1
  use std::collections::HashMap;
2
+ use std::hash::BuildHasher;
2
3
 
3
4
  pub trait RecordParser {
4
5
  type Output;
@@ -11,7 +12,7 @@ pub trait RecordParser {
11
12
  ) -> Self::Output;
12
13
  }
13
14
 
14
- impl RecordParser for HashMap<&'static str, Option<String>> {
15
+ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
15
16
  type Output = Self;
16
17
 
17
18
  #[inline]
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
21
22
  null_string: Option<&str>,
22
23
  flexible_default: Option<&str>,
23
24
  ) -> Self::Output {
24
- let mut map = HashMap::with_capacity(headers.len());
25
- headers.iter().enumerate().for_each(|(i, header)| {
25
+ let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
26
+ headers.iter().enumerate().for_each(|(i, &header)| {
26
27
  let value = record.get(i).map_or_else(
27
- || flexible_default.map(|s| s.to_string()),
28
+ || flexible_default.map(ToString::to_string),
28
29
  |field| {
29
30
  if null_string == Some(field) {
30
31
  None
31
32
  } else if field.is_empty() {
32
33
  Some(String::new())
33
34
  } else {
34
- Some(field.to_string())
35
+ Some(field.into())
35
36
  }
36
37
  },
37
38
  );
38
- map.insert(*header, value);
39
+ map.insert(header, value);
39
40
  });
40
41
  map
41
42
  }
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
53
54
  ) -> Self::Output {
54
55
  let target_len = headers.len();
55
56
  let mut vec = Vec::with_capacity(target_len);
56
- vec.extend(record.iter().map(|field| {
57
- if null_string == Some(field) {
57
+ for field in record.iter() {
58
+ let value = if Some(field) == null_string {
58
59
  None
59
60
  } else if field.is_empty() {
60
61
  Some(String::new())
61
62
  } else {
62
- Some(field.to_string())
63
- }
64
- }));
63
+ Some(field.into())
64
+ };
65
+ vec.push(value);
66
+ }
65
67
 
66
- // Fill remaining slots with flexible_default if needed
67
- if let Some(default) = flexible_default {
68
- while vec.len() < target_len {
69
- vec.push(Some(default.to_string()));
68
+ if vec.len() < target_len {
69
+ if let Some(default) = flexible_default {
70
+ vec.resize_with(target_len, || Some(default.to_string()));
70
71
  }
71
72
  }
72
73
  vec
@@ -1,6 +1,8 @@
1
1
  use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use std::{io::Read, thread};
3
3
 
4
+ pub(crate) const READ_BUFFER_SIZE: usize = 8192;
5
+
4
6
  pub enum ReadImpl<T: RecordParser> {
5
7
  SingleThreaded {
6
8
  reader: csv::Reader<Box<dyn Read>>,
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
36
38
  null_string,
37
39
  flexible_default,
38
40
  } => {
39
- let mut record = csv::StringRecord::new();
41
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
40
42
  match reader.read_record(&mut record) {
41
43
  Ok(true) => Some(T::parse(
42
44
  headers,
@@ -1,13 +1,13 @@
1
1
  use magnus::{IntoValue, Ruby, Value};
2
- use std::collections::HashMap;
2
+ use std::{collections::HashMap, hash::BuildHasher};
3
3
 
4
4
  #[derive(Debug)]
5
- pub enum CsvRecord {
5
+ pub enum CsvRecord<S: BuildHasher + Default> {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<&'static str, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>, S>),
8
8
  }
9
9
 
10
- impl IntoValue for CsvRecord {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
@@ -3,11 +3,12 @@ use crate::utils::*;
3
3
  use magnus::value::ReprValue;
4
4
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
5
  use std::collections::HashMap;
6
+ use xxhash_rust::xxh3::Xxh3Builder;
6
7
 
7
8
  pub fn parse_csv(
8
9
  rb_self: Value,
9
10
  args: &[Value],
10
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
11
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
11
12
  let ruby = unsafe { Ruby::get_unchecked() };
12
13
 
13
14
  let CsvArgs {
@@ -37,18 +38,20 @@ pub fn parse_csv(
37
38
  });
38
39
  }
39
40
 
40
- let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
41
+ let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
41
42
  "hash" => Box::new(
42
- RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
43
- .has_headers(has_headers)
44
- .flexible(flexible)
45
- .flexible_default(flexible_default)
46
- .delimiter(delimiter)
47
- .quote_char(quote_char)
48
- .null_string(null_string)
49
- .buffer(buffer_size)
50
- .build()?
51
- .map(CsvRecord::Map),
43
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
44
+ &ruby, to_read,
45
+ )
46
+ .has_headers(has_headers)
47
+ .flexible(flexible)
48
+ .flexible_default(flexible_default)
49
+ .delimiter(delimiter)
50
+ .quote_char(quote_char)
51
+ .null_string(null_string)
52
+ .buffer(buffer_size)
53
+ .build()?
54
+ .map(CsvRecord::Map),
52
55
  ),
53
56
  "array" => Box::new(
54
57
  RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
@@ -88,7 +91,7 @@ struct EnumeratorArgs {
88
91
 
89
92
  fn create_enumerator(
90
93
  args: EnumeratorArgs,
91
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
94
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
92
95
  let kwargs = RHash::new();
93
96
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
94
97
  kwargs.aset(
data/ext/osv/src/utils.rs CHANGED
@@ -4,6 +4,8 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::csv::BUFFER_CHANNEL_SIZE;
8
+
7
9
  #[derive(Debug)]
8
10
  pub struct CsvArgs {
9
11
  pub to_read: Value,
@@ -81,7 +83,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
81
83
 
82
84
  let null_string = kwargs.optional.3.unwrap_or_default();
83
85
 
84
- let buffer_size = kwargs.optional.4.unwrap_or(1000);
86
+ let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
85
87
 
86
88
  let result_type = match kwargs.optional.5 {
87
89
  Some(value) => {
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.9"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.8
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko