osv 0.3.8 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02205de8cef4d5f7633c06720a9e925a2b608116354da4a1678d4746d2197d23
4
- data.tar.gz: 3e1d63323fdaad1b6a60e0a0a63801f98710615d6616c882f0cdce00e36c6e2e
3
+ metadata.gz: 935cf4c277ef52eb1b1c4a4c27d2fe54461489b38b4203a1e574ac1a9e3298df
4
+ data.tar.gz: aa13944483b0f9fa8963a830d5f2a2932110e86bb32c722565c2b4d5f9dbef3c
5
5
  SHA512:
6
- metadata.gz: df6a4a4b86c41010ea671ac0e98c2ee6307e62ceff35dab125868f0ee7edb6d14984348ecd4ac9f913489e5a6be0b364240b461334554385aabe5b3374fe798d
7
- data.tar.gz: d931b888ce9d0ad1cdb1fa3d0be8cd0e526292206742f5adde718f414e9feca97eff3af6d4139d144c18a50e4807650ea9f7582153bcee80cea1e6ed4ce4ef49
6
+ metadata.gz: ea38b823a0423ef8883c04f6b677b16ba5d55f9a23faf434fc758788d238972b17b6e2435f0ec92661a32ee31fafb39170c81933d67a201af8cd528678e6261a
7
+ data.tar.gz: 77a62c7d62f36bd4143166d65c64eecccece56f1ebb969295cf7af6ff8639af9df71b3923d3ba13590824db68604e90cdc61e22490e81a3d0e24bf1af5f0be47
data/Cargo.lock CHANGED
@@ -274,6 +274,7 @@ dependencies = [
274
274
  "serde",
275
275
  "serde_magnus",
276
276
  "thiserror",
277
+ "xxhash-rust",
277
278
  ]
278
279
 
279
280
  [[package]]
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
526
527
  version = "0.52.6"
527
528
  source = "registry+https://github.com/rust-lang/crates.io-index"
528
529
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
+
531
+ [[package]]
532
+ name = "xxhash-rust"
533
+ version = "0.8.14"
534
+ source = "registry+https://github.com/rust-lang/crates.io-index"
535
+ checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
data/README.md CHANGED
@@ -112,45 +112,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
112
112
  ### 10,000 lines
113
113
 
114
114
  ```
115
- Benchmarking with 10001 lines of data
115
+ Benchmarking with 100001 lines of data
116
116
 
117
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
117
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
118
118
  Warming up --------------------------------------
119
- OSV - Hash output 6.000 i/100ms
119
+ OSV - Hash output 1.000 i/100ms
120
120
  CSV - Hash output 1.000 i/100ms
121
- OSV - Array output 18.000 i/100ms
122
- CSV - Array output 2.000 i/100ms
121
+ OSV - Array output 1.000 i/100ms
122
+ OSV - Direct Open Array output
123
+ 12.719M i/100ms
124
+ CSV - Array output 1.000 i/100ms
123
125
  FastCSV - Array output
124
- 9.000 i/100ms
125
- OSV - StringIO 7.000 i/100ms
126
+ 1.000 i/100ms
127
+ OSV - StringIO 1.000 i/100ms
126
128
  CSV - StringIO 1.000 i/100ms
127
- FastCSV - StringIO 20.000 i/100ms
128
- OSV - Gzipped 6.000 i/100ms
129
+ FastCSV - StringIO 1.000 i/100ms
130
+ OSV - Gzipped 1.000 i/100ms
129
131
  CSV - Gzipped 1.000 i/100ms
130
132
  Calculating -------------------------------------
131
- OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
132
- CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
133
- OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
134
- CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
133
+ OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
134
+ CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
135
+ OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
136
+ OSV - Direct Open Array output
137
+ 213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
138
+ CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
135
139
  FastCSV - Array output
136
- 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
137
- OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
138
- CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
139
- FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
140
- OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
141
- CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
140
+ 7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
141
+ OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
142
+ CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
143
+ FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
144
+ OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
145
+ CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
142
146
 
143
147
  Comparison:
144
- FastCSV - StringIO: 200.5 i/s
145
- OSV - Array output: 189.7 i/s - same-ish: difference falls within error
146
- FastCSV - Array output: 97.9 i/s - 2.05x slower
147
- OSV - StringIO: 80.8 i/s - 2.48x slower
148
- OSV - Hash output: 73.4 i/s - 2.73x slower
149
- OSV - Gzipped: 55.2 i/s - 3.63x slower
150
- CSV - Array output: 25.5 i/s - 7.87x slower
151
- CSV - StringIO: 15.9 i/s - 12.63x slower
152
- CSV - Gzipped: 12.6 i/s - 15.92x slower
153
- CSV - Hash output: 11.9 i/s - 16.80x slower
148
+ OSV - Direct Open Array output: 213629268.6 i/s
149
+ OSV - Array output: 17.3 i/s - 12360250.79x slower
150
+ FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
151
+ FastCSV - Array output: 8.0 i/s - 26727225.72x slower
152
+ OSV - Hash output: 6.7 i/s - 31780615.83x slower
153
+ OSV - StringIO: 6.6 i/s - 32239620.60x slower
154
+ OSV - Gzipped: 5.6 i/s - 37881517.48x slower
155
+ CSV - Array output: 2.2 i/s - 97400427.87x slower
156
+ CSV - StringIO: 1.5 i/s - 144580048.04x slower
157
+ CSV - Hash output: 1.2 i/s - 174666591.31x slower
158
+ CSV - Gzipped: 1.2 i/s - 181626018.23x slower
154
159
  ```
155
160
 
156
161
  ### 1,000,000 lines
@@ -158,11 +163,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
158
163
  ```
159
164
  Benchmarking with 1000001 lines of data
160
165
 
161
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
166
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
162
167
  Warming up --------------------------------------
163
168
  OSV - Hash output 1.000 i/100ms
164
169
  CSV - Hash output 1.000 i/100ms
165
170
  OSV - Array output 1.000 i/100ms
171
+ OSV - Direct Open Array output
172
+ 1.000 i/100ms
166
173
  CSV - Array output 1.000 i/100ms
167
174
  FastCSV - Array output
168
175
  1.000 i/100ms
@@ -172,27 +179,30 @@ FastCSV - Array output
172
179
  OSV - Gzipped 1.000 i/100ms
173
180
  CSV - Gzipped 1.000 i/100ms
174
181
  Calculating -------------------------------------
175
- OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
176
- CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
177
- OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
178
- CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
182
+ OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
183
+ CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
184
+ OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
185
+ OSV - Direct Open Array output
186
+ 1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
187
+ CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
179
188
  FastCSV - Array output
180
- 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
181
- OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
182
- CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
183
- FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
184
- OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
185
- CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
189
+ 0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
190
+ OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
191
+ CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
192
+ FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
193
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
194
+ CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
186
195
 
187
196
  Comparison:
188
- OSV - Array output: 1.1 i/s
189
- FastCSV - StringIO: 1.0 i/s - 1.10x slower
190
- FastCSV - Array output: 0.8 i/s - 1.49x slower
191
- OSV - Hash output: 0.6 i/s - 1.98x slower
192
- OSV - StringIO: 0.5 i/s - 2.19x slower
193
- OSV - Gzipped: 0.4 i/s - 2.61x slower
194
- CSV - Array output: 0.2 i/s - 4.86x slower
195
- CSV - StringIO: 0.1 i/s - 8.67x slower
196
- CSV - Hash output: 0.1 i/s - 9.79x slower
197
- CSV - Gzipped: 0.1 i/s - 9.91x slower
197
+ OSV - Direct Open Array output: 1.6 i/s
198
+ OSV - Array output: 1.5 i/s - 1.08x slower
199
+ FastCSV - StringIO: 0.9 i/s - 1.76x slower
200
+ OSV - StringIO: 0.6 i/s - 2.87x slower
201
+ OSV - Hash output: 0.5 i/s - 3.30x slower
202
+ OSV - Gzipped: 0.4 i/s - 3.72x slower
203
+ FastCSV - Array output: 0.3 i/s - 4.99x slower
204
+ CSV - Array output: 0.2 i/s - 8.88x slower
205
+ CSV - StringIO: 0.1 i/s - 11.55x slower
206
+ CSV - Hash output: 0.1 i/s - 14.24x slower
207
+ CSV - Gzipped: 0.1 i/s - 15.68x slower
198
208
  ```
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Rake::ExtensionTask.new("osv") do |c|
11
11
  end
12
12
 
13
13
  task :dev do
14
- ENV["RB_SYS_CARGO_PROFILE"] = "dev"
14
+ ENV["RB_SYS_CARGO_PROFILE"] = "release"
15
15
  end
16
16
 
17
17
  Rake::TestTask.new do |t|
data/ext/osv/Cargo.toml CHANGED
@@ -15,3 +15,4 @@ rb-sys = "^0.9"
15
15
  serde = { version = "1.0", features = ["derive"] }
16
16
  serde_magnus = "0.8.1"
17
17
  thiserror = "2.0"
18
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
@@ -3,18 +3,21 @@ use super::{
3
3
  parser::RecordParser,
4
4
  read_impl::ReadImpl,
5
5
  reader::RecordReader,
6
+ READ_BUFFER_SIZE,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
10
  use std::{
10
11
  fs::File,
11
- io::{self, Read},
12
+ io::{self, BufReader, Read},
12
13
  marker::PhantomData,
13
14
  os::fd::FromRawFd,
14
15
  thread,
15
16
  };
16
17
  use thiserror::Error;
17
18
 
19
+ pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
20
+
18
21
  #[derive(Error, Debug)]
19
22
  pub enum ReaderError {
20
23
  #[error("Failed to get file descriptor: {0}")]
@@ -68,7 +71,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
68
71
  delimiter: b',',
69
72
  quote_char: b'"',
70
73
  null_string: None,
71
- buffer: 1000,
74
+ buffer: BUFFER_CHANNEL_SIZE,
72
75
  flexible: false,
73
76
  flexible_default: None,
74
77
  _phantom: PhantomData,
@@ -128,7 +131,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
128
131
  }
129
132
 
130
133
  let file = unsafe { File::from_raw_fd(fd) };
131
- Ok(Box::new(file))
134
+ Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
132
135
  }
133
136
 
134
137
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -136,24 +139,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
136
139
  let file = File::open(&path)?;
137
140
 
138
141
  Ok(if path.ends_with(".gz") {
139
- Box::new(GzDecoder::new(file))
142
+ Box::new(GzDecoder::new(BufReader::with_capacity(
143
+ READ_BUFFER_SIZE,
144
+ file,
145
+ )))
140
146
  } else {
141
- Box::new(file)
147
+ Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
142
148
  })
143
149
  }
144
150
 
145
- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
151
+ fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
146
152
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
147
153
  let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
148
154
 
149
155
  if self.to_read.is_kind_of(string_io) {
150
- self.handle_string_io()
156
+ self.handle_string_io().map(|r| (r, false))
151
157
  } else if self.to_read.is_kind_of(gzip_reader_class) {
152
158
  Err(ReaderError::UnsupportedGzipReader)
153
159
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
154
- self.handle_file_descriptor()
160
+ self.handle_file_descriptor().map(|r| (r, true))
155
161
  } else {
156
- self.handle_file_path()
162
+ self.handle_file_path().map(|r| (r, false))
157
163
  }
158
164
  }
159
165
 
@@ -175,7 +181,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
175
181
 
176
182
  pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
177
183
  match self.get_reader() {
178
- Ok(readable) => self.build_multi_threaded(readable),
184
+ Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
179
185
  Err(_) => {
180
186
  let readable = self.get_single_threaded_reader()?;
181
187
  self.build_single_threaded(readable)
@@ -186,6 +192,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
186
192
  fn build_multi_threaded(
187
193
  self,
188
194
  readable: Box<dyn Read + Send + 'static>,
195
+ should_forget: bool,
189
196
  ) -> Result<RecordReader<T>, ReaderError> {
190
197
  let flexible = self.flexible || self.flexible_default.is_some();
191
198
  let mut reader = csv::ReaderBuilder::new()
@@ -204,7 +211,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
204
211
 
205
212
  let flexible_default = self.flexible_default.clone();
206
213
  let handle = thread::spawn(move || {
207
- let mut record = csv::StringRecord::new();
214
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
208
215
  while let Ok(true) = reader.read_record(&mut record) {
209
216
  let row = T::parse(
210
217
  &static_headers,
@@ -216,8 +223,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
216
223
  break;
217
224
  }
218
225
  }
219
- let file_to_forget = reader.into_inner();
220
- std::mem::forget(file_to_forget);
226
+ if should_forget {
227
+ let file_to_forget = reader.into_inner();
228
+ std::mem::forget(file_to_forget);
229
+ }
221
230
  });
222
231
 
223
232
  Ok(RecordReader {
@@ -257,30 +266,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
257
266
 
258
267
  struct RubyReader {
259
268
  inner: Value,
269
+ buffer: Option<Vec<u8>>,
270
+ offset: usize,
260
271
  }
261
272
 
262
273
  impl RubyReader {
263
274
  fn new(inner: Value) -> Self {
264
- Self { inner }
275
+ Self {
276
+ inner,
277
+ buffer: None,
278
+ offset: 0,
279
+ }
265
280
  }
266
281
  }
267
282
 
283
+ // Read the entire inner into a vector and then read future reads from that vector with offset
268
284
  impl Read for RubyReader {
269
285
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
270
- let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
286
+ // If we have an existing buffer, read from it
287
+ if let Some(buffer) = self.buffer.as_ref() {
288
+ let remaining = buffer.len() - self.offset;
289
+ let copy_size = remaining.min(buf.len());
290
+ buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
291
+ self.offset += copy_size;
292
+ return Ok(copy_size);
293
+ }
294
+
295
+ // No buffer yet - read the entire content from Ruby
296
+ let result = self.inner.funcall::<_, _, Value>("read", ());
271
297
  match result {
272
298
  Ok(data) => {
273
299
  if data.is_nil() {
274
- return Ok(0);
300
+ return Ok(0); // EOF
275
301
  }
276
302
 
277
303
  let string = RString::from_value(data).ok_or_else(|| {
278
304
  io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
279
305
  })?;
280
306
  let bytes = unsafe { string.as_slice() };
281
- let len = bytes.len().min(buf.len());
282
- buf[..len].copy_from_slice(&bytes[..len]);
283
- Ok(len)
307
+
308
+ // Store the entire content in the buffer
309
+ self.buffer = Some(bytes.to_vec());
310
+ self.offset = 0;
311
+
312
+ // Read initial chunk
313
+ let copy_size = bytes.len().min(buf.len());
314
+ buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
315
+ self.offset = copy_size;
316
+
317
+ Ok(copy_size)
284
318
  }
285
319
  Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
286
320
  }
@@ -6,4 +6,6 @@ mod reader;
6
6
  mod record;
7
7
 
8
8
  pub use builder::RecordReaderBuilder;
9
+ pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
+ pub(crate) use read_impl::READ_BUFFER_SIZE;
9
11
  pub use record::CsvRecord;
@@ -1,4 +1,5 @@
1
1
  use std::collections::HashMap;
2
+ use std::hash::BuildHasher;
2
3
 
3
4
  pub trait RecordParser {
4
5
  type Output;
@@ -11,7 +12,7 @@ pub trait RecordParser {
11
12
  ) -> Self::Output;
12
13
  }
13
14
 
14
- impl RecordParser for HashMap<&'static str, Option<String>> {
15
+ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
15
16
  type Output = Self;
16
17
 
17
18
  #[inline]
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
21
22
  null_string: Option<&str>,
22
23
  flexible_default: Option<&str>,
23
24
  ) -> Self::Output {
24
- let mut map = HashMap::with_capacity(headers.len());
25
- headers.iter().enumerate().for_each(|(i, header)| {
25
+ let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
26
+ headers.iter().enumerate().for_each(|(i, &header)| {
26
27
  let value = record.get(i).map_or_else(
27
- || flexible_default.map(|s| s.to_string()),
28
+ || flexible_default.map(ToString::to_string),
28
29
  |field| {
29
30
  if null_string == Some(field) {
30
31
  None
31
32
  } else if field.is_empty() {
32
33
  Some(String::new())
33
34
  } else {
34
- Some(field.to_string())
35
+ Some(field.into())
35
36
  }
36
37
  },
37
38
  );
38
- map.insert(*header, value);
39
+ map.insert(header, value);
39
40
  });
40
41
  map
41
42
  }
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
53
54
  ) -> Self::Output {
54
55
  let target_len = headers.len();
55
56
  let mut vec = Vec::with_capacity(target_len);
56
- vec.extend(record.iter().map(|field| {
57
- if null_string == Some(field) {
57
+ for field in record.iter() {
58
+ let value = if Some(field) == null_string {
58
59
  None
59
60
  } else if field.is_empty() {
60
61
  Some(String::new())
61
62
  } else {
62
- Some(field.to_string())
63
- }
64
- }));
63
+ Some(field.into())
64
+ };
65
+ vec.push(value);
66
+ }
65
67
 
66
- // Fill remaining slots with flexible_default if needed
67
- if let Some(default) = flexible_default {
68
- while vec.len() < target_len {
69
- vec.push(Some(default.to_string()));
68
+ if vec.len() < target_len {
69
+ if let Some(default) = flexible_default {
70
+ vec.resize_with(target_len, || Some(default.to_string()));
70
71
  }
71
72
  }
72
73
  vec
@@ -1,6 +1,8 @@
1
1
  use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use std::{io::Read, thread};
3
3
 
4
+ pub(crate) const READ_BUFFER_SIZE: usize = 8192;
5
+
4
6
  pub enum ReadImpl<T: RecordParser> {
5
7
  SingleThreaded {
6
8
  reader: csv::Reader<Box<dyn Read>>,
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
36
38
  null_string,
37
39
  flexible_default,
38
40
  } => {
39
- let mut record = csv::StringRecord::new();
41
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
40
42
  match reader.read_record(&mut record) {
41
43
  Ok(true) => Some(T::parse(
42
44
  headers,
@@ -1,13 +1,13 @@
1
1
  use magnus::{IntoValue, Ruby, Value};
2
- use std::collections::HashMap;
2
+ use std::{collections::HashMap, hash::BuildHasher};
3
3
 
4
4
  #[derive(Debug)]
5
- pub enum CsvRecord {
5
+ pub enum CsvRecord<S: BuildHasher + Default> {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<&'static str, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>, S>),
8
8
  }
9
9
 
10
- impl IntoValue for CsvRecord {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
@@ -3,11 +3,12 @@ use crate::utils::*;
3
3
  use magnus::value::ReprValue;
4
4
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
5
  use std::collections::HashMap;
6
+ use xxhash_rust::xxh3::Xxh3Builder;
6
7
 
7
8
  pub fn parse_csv(
8
9
  rb_self: Value,
9
10
  args: &[Value],
10
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
11
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
11
12
  let ruby = unsafe { Ruby::get_unchecked() };
12
13
 
13
14
  let CsvArgs {
@@ -37,18 +38,20 @@ pub fn parse_csv(
37
38
  });
38
39
  }
39
40
 
40
- let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
41
+ let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
41
42
  "hash" => Box::new(
42
- RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
43
- .has_headers(has_headers)
44
- .flexible(flexible)
45
- .flexible_default(flexible_default)
46
- .delimiter(delimiter)
47
- .quote_char(quote_char)
48
- .null_string(null_string)
49
- .buffer(buffer_size)
50
- .build()?
51
- .map(CsvRecord::Map),
43
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
44
+ &ruby, to_read,
45
+ )
46
+ .has_headers(has_headers)
47
+ .flexible(flexible)
48
+ .flexible_default(flexible_default)
49
+ .delimiter(delimiter)
50
+ .quote_char(quote_char)
51
+ .null_string(null_string)
52
+ .buffer(buffer_size)
53
+ .build()?
54
+ .map(CsvRecord::Map),
52
55
  ),
53
56
  "array" => Box::new(
54
57
  RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
@@ -88,7 +91,7 @@ struct EnumeratorArgs {
88
91
 
89
92
  fn create_enumerator(
90
93
  args: EnumeratorArgs,
91
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
94
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
92
95
  let kwargs = RHash::new();
93
96
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
94
97
  kwargs.aset(
data/ext/osv/src/utils.rs CHANGED
@@ -4,6 +4,8 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::csv::BUFFER_CHANNEL_SIZE;
8
+
7
9
  #[derive(Debug)]
8
10
  pub struct CsvArgs {
9
11
  pub to_read: Value,
@@ -81,7 +83,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
81
83
 
82
84
  let null_string = kwargs.optional.3.unwrap_or_default();
83
85
 
84
- let buffer_size = kwargs.optional.4.unwrap_or(1000);
86
+ let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
85
87
 
86
88
  let result_type = match kwargs.optional.5 {
87
89
  Some(value) => {
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.9"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.8
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko