osv 0.3.14 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 689f28c935746890aa680fd2f649076a36d6ce233d4cbf2717dc129174b593dc
4
- data.tar.gz: 45ddaa6774a9a4e9391d000b30b6e92afb8560b81821d8fec363d54283bac6d9
3
+ metadata.gz: 872cf06d1389f45f77b4eefc178cc8462ab165b833ab2c5bf4dc7f92e1c8308e
4
+ data.tar.gz: 84e6c5d0e03389966b8882a5a73f1698ddee3ed0edae24f2fd5b7f257935a98e
5
5
  SHA512:
6
- metadata.gz: 74c2052ea9cbc61ddef5d1c46abdd5e4cdf7c60c946c421e4b8da7c160ba3f3eb761842279cd9f066aa6a1aa2214d0ef9ba9ff11c46294e8e1d4ebbb95161d70
7
- data.tar.gz: 5a795e5fa6d84b39082c2754dea655cd5b4f8a00558627fb64f661a14ec32daa8ea7b31a13724291e11531d03c8c5fa1bdb928c6d9422b87fcbb5b5aba7daad5
6
+ metadata.gz: 445581447e8f5ec336da7843af715a5f5fbc298232a24f303a22eebb844f83f65ecc2e85d877a448119adae9e6a5529e377d87399a36e6f070562fa4ce0a11b7
7
+ data.tar.gz: '08f417b19b0549aa4a3db1538e4be413c5ec8faa3bd18e4c101a6fc3ea3e9496d04c30e39ea8eec9cc0cc3a38f8f83f7c2274e09c75259a26f3609620cf07a80'
data/Cargo.lock CHANGED
@@ -1,6 +1,6 @@
1
1
  # This file is automatically @generated by Cargo.
2
2
  # It is not intended for manual editing.
3
- version = 3
3
+ version = 4
4
4
 
5
5
  [[package]]
6
6
  name = "adler2"
@@ -8,6 +8,19 @@ version = "2.0.0"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
9
  checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
10
10
 
11
+ [[package]]
12
+ name = "ahash"
13
+ version = "0.8.11"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
16
+ dependencies = [
17
+ "cfg-if",
18
+ "getrandom",
19
+ "once_cell",
20
+ "version_check",
21
+ "zerocopy",
22
+ ]
23
+
11
24
  [[package]]
12
25
  name = "aho-corasick"
13
26
  version = "1.1.3"
@@ -49,6 +62,15 @@ version = "2.6.0"
49
62
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
63
  checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
51
64
 
65
+ [[package]]
66
+ name = "cc"
67
+ version = "1.2.7"
68
+ source = "registry+https://github.com/rust-lang/crates.io-index"
69
+ checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7"
70
+ dependencies = [
71
+ "shlex",
72
+ ]
73
+
52
74
  [[package]]
53
75
  name = "cexpr"
54
76
  version = "0.6.0"
@@ -127,6 +149,17 @@ version = "0.3.31"
127
149
  source = "registry+https://github.com/rust-lang/crates.io-index"
128
150
  checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
129
151
 
152
+ [[package]]
153
+ name = "getrandom"
154
+ version = "0.2.15"
155
+ source = "registry+https://github.com/rust-lang/crates.io-index"
156
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
157
+ dependencies = [
158
+ "cfg-if",
159
+ "libc",
160
+ "wasi",
161
+ ]
162
+
130
163
  [[package]]
131
164
  name = "glob"
132
165
  version = "0.3.1"
@@ -148,6 +181,26 @@ version = "1.0.14"
148
181
  source = "registry+https://github.com/rust-lang/crates.io-index"
149
182
  checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
150
183
 
184
+ [[package]]
185
+ name = "jemalloc-sys"
186
+ version = "0.5.4+5.3.0-patched"
187
+ source = "registry+https://github.com/rust-lang/crates.io-index"
188
+ checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
189
+ dependencies = [
190
+ "cc",
191
+ "libc",
192
+ ]
193
+
194
+ [[package]]
195
+ name = "jemallocator"
196
+ version = "0.5.4"
197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
198
+ checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
199
+ dependencies = [
200
+ "jemalloc-sys",
201
+ "libc",
202
+ ]
203
+
151
204
  [[package]]
152
205
  name = "kanal"
153
206
  version = "0.1.0-pre8"
@@ -186,6 +239,16 @@ dependencies = [
186
239
  "windows-targets",
187
240
  ]
188
241
 
242
+ [[package]]
243
+ name = "libmimalloc-sys"
244
+ version = "0.1.39"
245
+ source = "registry+https://github.com/rust-lang/crates.io-index"
246
+ checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
247
+ dependencies = [
248
+ "cc",
249
+ "libc",
250
+ ]
251
+
189
252
  [[package]]
190
253
  name = "lock_api"
191
254
  version = "0.4.12"
@@ -237,6 +300,15 @@ version = "2.7.4"
237
300
  source = "registry+https://github.com/rust-lang/crates.io-index"
238
301
  checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
239
302
 
303
+ [[package]]
304
+ name = "mimalloc"
305
+ version = "0.1.43"
306
+ source = "registry+https://github.com/rust-lang/crates.io-index"
307
+ checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
308
+ dependencies = [
309
+ "libmimalloc-sys",
310
+ ]
311
+
240
312
  [[package]]
241
313
  name = "minimal-lexical"
242
314
  version = "0.2.1"
@@ -262,19 +334,27 @@ dependencies = [
262
334
  "minimal-lexical",
263
335
  ]
264
336
 
337
+ [[package]]
338
+ name = "once_cell"
339
+ version = "1.20.2"
340
+ source = "registry+https://github.com/rust-lang/crates.io-index"
341
+ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
342
+
265
343
  [[package]]
266
344
  name = "osv"
267
345
  version = "0.1.0"
268
346
  dependencies = [
347
+ "ahash",
269
348
  "csv",
270
349
  "flate2",
350
+ "jemallocator",
271
351
  "kanal",
272
352
  "magnus 0.7.1",
353
+ "mimalloc",
273
354
  "rb-sys",
274
355
  "serde",
275
356
  "serde_magnus",
276
357
  "thiserror",
277
- "xxhash-rust",
278
358
  ]
279
359
 
280
360
  [[package]]
@@ -464,6 +544,18 @@ version = "1.0.14"
464
544
  source = "registry+https://github.com/rust-lang/crates.io-index"
465
545
  checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
466
546
 
547
+ [[package]]
548
+ name = "version_check"
549
+ version = "0.9.5"
550
+ source = "registry+https://github.com/rust-lang/crates.io-index"
551
+ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
552
+
553
+ [[package]]
554
+ name = "wasi"
555
+ version = "0.11.0+wasi-snapshot-preview1"
556
+ source = "registry+https://github.com/rust-lang/crates.io-index"
557
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
558
+
467
559
  [[package]]
468
560
  name = "windows-targets"
469
561
  version = "0.52.6"
@@ -529,7 +621,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
529
621
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
622
 
531
623
  [[package]]
532
- name = "xxhash-rust"
533
- version = "0.8.14"
624
+ name = "zerocopy"
625
+ version = "0.7.35"
534
626
  source = "registry+https://github.com/rust-lang/crates.io-index"
535
- checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
627
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
628
+ dependencies = [
629
+ "zerocopy-derive",
630
+ ]
631
+
632
+ [[package]]
633
+ name = "zerocopy-derive"
634
+ version = "0.7.35"
635
+ source = "registry+https://github.com/rust-lang/crates.io-index"
636
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
637
+ dependencies = [
638
+ "proc-macro2",
639
+ "quote",
640
+ "syn",
641
+ ]
data/README.md CHANGED
@@ -118,10 +118,10 @@ This library is faster than the standard Ruby CSV library. It's also faster than
118
118
 
119
119
  Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
120
120
 
121
- ### 1,000,000 lines
121
+ ### 1,000,000 records
122
122
 
123
123
  ```
124
- 🏃 Running benchmarks...
124
+ 🏃 Running benchmarks...
125
125
  Benchmarking with 3000001 lines of data
126
126
 
127
127
  ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
142
142
  FastCSV - Gzipped 1.000 i/100ms
143
143
  CSV - Gzipped 1.000 i/100ms
144
144
  Calculating -------------------------------------
145
- CSV - StringIO 0.083 (± 0.0%) i/s (12.01 s/i) - 3.000 in 36.028672s
146
- FastCSV - StringIO 0.366 (± 0.0%) i/s (2.73 s/i) - 11.000 in 30.032350s
147
- OSV - StringIO 0.522 (± 0.0%) i/s (1.92 s/i) - 16.000 in 30.655768s
148
- CSV - Hash output 0.062 (± 0.0%) i/s (16.16 s/i) - 2.000 in 32.311990s
149
- OSV - Hash output 0.273 (± 0.0%) i/s (3.66 s/i) - 9.000 in 32.924970s
150
- CSV - Array output 0.069 (± 0.0%) i/s (14.50 s/i) - 3.000 in 43.488185s
151
- OSV - Array output 0.601 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.636782s
145
+ CSV - StringIO 0.079 (± 0.0%) i/s (12.69 s/i) - 3.000 in 38.139709s
146
+ FastCSV - StringIO 0.370 (± 0.0%) i/s (2.71 s/i) - 12.000 in 32.474164s
147
+ OSV - StringIO 0.635 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.772490s
148
+ CSV - Hash output 0.058 (± 0.0%) i/s (17.11 s/i) - 2.000 in 34.212335s
149
+ OSV - Hash output 0.249 (± 0.0%) i/s (4.01 s/i) - 8.000 in 32.124319s
150
+ CSV - Array output 0.066 (± 0.0%) i/s (15.11 s/i) - 2.000 in 30.212137s
151
+ OSV - Array output 0.665 (± 0.0%) i/s (1.50 s/i) - 20.000 in 30.813986s
152
152
  FastCSV - Array output
153
- 0.356 (± 0.0%) i/s (2.81 s/i) - 11.000 in 30.871931s
153
+ 0.351 (± 0.0%) i/s (2.85 s/i) - 11.000 in 31.418786s
154
154
  OSV - Direct Open Array output
155
- 0.604 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.469190s
156
- OSV - Gzipped 0.424 (± 0.0%) i/s (2.36 s/i) - 13.000 in 30.642322s
157
- OSV - Gzipped Direct 0.636 (± 0.0%) i/s (1.57 s/i) - 20.000 in 31.424083s
158
- FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.10 s/i) - 10.000 in 30.990648s
159
- CSV - Gzipped 0.058 (± 0.0%) i/s (17.11 s/i) - 2.000 in 34.228691s
155
+ 0.713 (± 0.0%) i/s (1.40 s/i) - 22.000 in 30.938525s
156
+ OSV - Gzipped 0.506 (± 0.0%) i/s (1.98 s/i) - 16.000 in 31.709708s
157
+ OSV - Gzipped Direct 0.685 (± 0.0%) i/s (1.46 s/i) - 21.000 in 31.145435s
158
+ FastCSV - Gzipped 0.324 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.983582s
159
+ CSV - Gzipped 0.057 (± 0.0%) i/s (17.69 s/i) - 2.000 in 35.379009s
160
160
 
161
161
  Comparison:
162
- OSV - Gzipped Direct: 0.6 i/s
163
- OSV - Direct Open Array output: 0.6 i/s - 1.05x slower
164
- OSV - Array output: 0.6 i/s - 1.06x slower
165
- OSV - StringIO: 0.5 i/s - 1.22x slower
166
- OSV - Gzipped: 0.4 i/s - 1.50x slower
167
- FastCSV - StringIO: 0.4 i/s - 1.74x slower
168
- FastCSV - Array output: 0.4 i/s - 1.79x slower
169
- FastCSV - Gzipped: 0.3 i/s - 1.97x slower
170
- OSV - Hash output: 0.3 i/s - 2.33x slower
171
- CSV - StringIO: 0.1 i/s - 7.64x slower
172
- CSV - Array output: 0.1 i/s - 9.23x slower
173
- CSV - Hash output: 0.1 i/s - 10.28x slower
174
- CSV - Gzipped: 0.1 i/s - 10.89x slower
162
+ OSV - Direct Open Array output: 0.7 i/s
163
+ OSV - Gzipped Direct: 0.7 i/s - 1.04x slower
164
+ OSV - Array output: 0.7 i/s - 1.07x slower
165
+ OSV - StringIO: 0.6 i/s - 1.12x slower
166
+ OSV - Gzipped: 0.5 i/s - 1.41x slower
167
+ FastCSV - StringIO: 0.4 i/s - 1.93x slower
168
+ FastCSV - Array output: 0.4 i/s - 2.03x slower
169
+ FastCSV - Gzipped: 0.3 i/s - 2.20x slower
170
+ OSV - Hash output: 0.2 i/s - 2.86x slower
171
+ CSV - StringIO: 0.1 i/s - 9.05x slower
172
+ CSV - Array output: 0.1 i/s - 10.77x slower
173
+ CSV - Hash output: 0.1 i/s - 12.20x slower
174
+ CSV - Gzipped: 0.1 i/s - 12.61x slower
175
175
  ```
data/ext/osv/Cargo.toml CHANGED
@@ -7,6 +7,7 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
+ ahash = "0.8"
10
11
  csv = "^1.3"
11
12
  flate2 = "1.0.35"
12
13
  kanal = "0.1.0-pre8"
@@ -15,4 +16,9 @@ rb-sys = "^0.9"
15
16
  serde = { version = "1.0", features = ["derive"] }
16
17
  serde_magnus = "0.8.1"
17
18
  thiserror = "2.0"
18
- xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
19
+
20
+ [target.'cfg(target_os = "linux")'.dependencies]
21
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
22
+
23
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
24
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,13 @@
1
+ #[cfg(target_os = "linux")]
2
+ use jemallocator::Jemalloc;
3
+
4
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
5
+ use mimalloc::MiMalloc;
6
+
7
+ #[global_allocator]
8
+ #[cfg(target_os = "linux")]
9
+ static ALLOC: Jemalloc = Jemalloc;
10
+
11
+ #[global_allocator]
12
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
13
+ static ALLOC: MiMalloc = MiMalloc;
@@ -2,7 +2,8 @@ use super::{
2
2
  header_cache::{CacheError, StringCache},
3
3
  parser::RecordParser,
4
4
  record_reader::{RecordReader, READ_BUFFER_SIZE},
5
- ruby_reader::build_ruby_reader,
5
+ ruby_reader::{build_ruby_reader, SeekableRead},
6
+ ForgottenFileHandle,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -10,6 +11,7 @@ use std::{
10
11
  fs::File,
11
12
  io::{self, BufReader, Read},
12
13
  marker::PhantomData,
14
+ mem::ManuallyDrop,
13
15
  os::fd::FromRawFd,
14
16
  };
15
17
 
@@ -64,7 +66,6 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
64
66
  fn build_multi_threaded(
65
67
  self,
66
68
  readable: Box<dyn Read + Send + 'static>,
67
- should_forget: bool,
68
69
  ) -> Result<RecordReader<'static, T>, ReaderError> {
69
70
  let flexible = self.flexible || self.flexible_default.is_some();
70
71
  let mut reader = csv::ReaderBuilder::new()
@@ -84,21 +85,20 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
84
85
  self.buffer,
85
86
  self.null_string,
86
87
  self.flexible_default,
87
- should_forget,
88
88
  ))
89
89
  }
90
90
 
91
91
  pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
92
92
  if self.to_read.is_kind_of(self.ruby.class_io()) {
93
93
  let readable = self.handle_file_descriptor()?;
94
- self.build_multi_threaded(readable, true)
94
+ self.build_multi_threaded(readable)
95
95
  } else if self.to_read.is_kind_of(self.ruby.class_string()) {
96
96
  let readable = self.handle_file_path()?;
97
- self.build_multi_threaded(readable, false)
97
+ self.build_multi_threaded(readable)
98
98
  } else {
99
99
  let readable = build_ruby_reader(self.ruby, self.to_read)?;
100
-
101
- self.build_single_threaded(readable)
100
+ let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
101
+ self.build_single_threaded(buffered_reader)
102
102
  }
103
103
  }
104
104
  }
@@ -172,7 +172,11 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
172
172
  }
173
173
 
174
174
  let file = unsafe { File::from_raw_fd(fd) };
175
- Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
175
+ let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
176
+ Ok(Box::new(BufReader::with_capacity(
177
+ READ_BUFFER_SIZE,
178
+ forgotten,
179
+ )))
176
180
  }
177
181
 
178
182
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -191,9 +195,10 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
191
195
 
192
196
  fn build_single_threaded(
193
197
  self,
194
- readable: Box<dyn Read + 'a>,
198
+ readable: BufReader<Box<dyn SeekableRead>>,
195
199
  ) -> Result<RecordReader<'a, T>, ReaderError> {
196
200
  let flexible = self.flexible || self.flexible_default.is_some();
201
+
197
202
  let mut reader = csv::ReaderBuilder::new()
198
203
  .has_headers(self.has_headers)
199
204
  .delimiter(self.delimiter)
@@ -3,10 +3,11 @@ mod header_cache;
3
3
  mod parser;
4
4
  mod record;
5
5
  mod record_reader;
6
+ mod ruby_integration;
6
7
  mod ruby_reader;
7
8
 
8
9
  pub use builder::RecordReaderBuilder;
9
10
  pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
11
  pub use record::CowValue;
11
12
  pub use record::CsvRecord;
12
- pub(crate) use record_reader::READ_BUFFER_SIZE;
13
+ pub use ruby_integration::*;
@@ -30,7 +30,7 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
30
30
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
31
31
 
32
32
  let shared_empty = Cow::Borrowed("");
33
- let shared_default = flexible_default.map(|f| CowValue(f));
33
+ let shared_default = flexible_default.map(CowValue);
34
34
  headers.iter().enumerate().for_each(|(i, &header)| {
35
35
  let value = record.get(i).map_or_else(
36
36
  || shared_default.clone(),
@@ -64,7 +64,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
64
64
  let mut vec = Vec::with_capacity(target_len);
65
65
 
66
66
  let shared_empty = Cow::Borrowed("");
67
- let shared_default = flexible_default.map(|f| CowValue(f));
67
+ let shared_default = flexible_default.map(CowValue);
68
68
 
69
69
  for field in record.iter() {
70
70
  let value = if Some(field) == null_string {
@@ -7,7 +7,7 @@ pub enum CsvRecord<'a, S: BuildHasher + Default> {
7
7
  Map(HashMap<&'static str, Option<CowValue<'a>>, S>),
8
8
  }
9
9
 
10
- impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
@@ -31,7 +31,7 @@ impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
31
31
  #[derive(Debug, Clone)]
32
32
  pub struct CowValue<'a>(pub Cow<'a, str>);
33
33
 
34
- impl<'a> IntoValue for CowValue<'a> {
34
+ impl IntoValue for CowValue<'_> {
35
35
  fn into_value_with(self, handle: &Ruby) -> Value {
36
36
  self.0.into_value_with(handle)
37
37
  }
@@ -1,6 +1,7 @@
1
- use super::header_cache::StringCache;
2
1
  use super::parser::RecordParser;
2
+ use super::{header_cache::StringCache, ruby_reader::SeekableRead};
3
3
  use magnus::{Error, Ruby};
4
+ use std::io::BufReader;
4
5
  use std::{borrow::Cow, io::Read, thread};
5
6
 
6
7
  pub(crate) const READ_BUFFER_SIZE: usize = 16384;
@@ -9,9 +10,10 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
9
10
  inner: ReaderImpl<'a, T>,
10
11
  }
11
12
 
13
+ #[allow(clippy::large_enum_variant)]
12
14
  enum ReaderImpl<'a, T: RecordParser<'a>> {
13
15
  SingleThreaded {
14
- reader: csv::Reader<Box<dyn Read + 'a>>,
16
+ reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
15
17
  headers: Vec<&'static str>,
16
18
  null_string: Option<String>,
17
19
  flexible_default: Option<Cow<'a, str>>,
@@ -48,7 +50,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
48
50
  }
49
51
 
50
52
  pub(crate) fn new_single_threaded(
51
- reader: csv::Reader<Box<dyn Read + 'a>>,
53
+ reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
52
54
  headers: Vec<&'static str>,
53
55
  null_string: Option<String>,
54
56
  flexible_default: Option<&'a str>,
@@ -59,7 +61,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
59
61
  reader,
60
62
  headers,
61
63
  null_string,
62
- flexible_default: flexible_default.map(|s| Cow::Borrowed(s)),
64
+ flexible_default: flexible_default.map(Cow::Borrowed),
63
65
  string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
64
66
  },
65
67
  }
@@ -73,7 +75,6 @@ impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
73
75
  buffer_size: usize,
74
76
  null_string: Option<String>,
75
77
  flexible_default: Option<&'static str>,
76
- should_forget: bool,
77
78
  ) -> Self {
78
79
  let (sender, receiver) = kanal::bounded(buffer_size);
79
80
  let headers_for_thread = headers.clone();
@@ -86,16 +87,12 @@ impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
86
87
  &headers_for_thread,
87
88
  &record,
88
89
  null_string.as_deref(),
89
- flexible_default.map(|s| Cow::Borrowed(s)),
90
+ flexible_default.map(Cow::Borrowed),
90
91
  );
91
92
  if sender.send(row).is_err() {
92
93
  break;
93
94
  }
94
95
  }
95
- if should_forget {
96
- let file_to_forget = reader.into_inner();
97
- std::mem::forget(file_to_forget);
98
- }
99
96
  });
100
97
 
101
98
  Self {
@@ -134,7 +131,7 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
134
131
  } => match reader.read_record(string_record) {
135
132
  Ok(true) => Some(T::parse(
136
133
  headers,
137
- &string_record,
134
+ string_record,
138
135
  null_string.as_deref(),
139
136
  flexible_default.clone(),
140
137
  )),
@@ -0,0 +1,30 @@
1
+ use std::{fs::File, io, mem::ManuallyDrop};
2
+
3
/// A readable [`File`] wrapper whose inner file is never dropped automatically.
///
/// The file is held in [`ManuallyDrop`], so dropping a `ForgottenFileHandle` does not run
/// `File`'s destructor and therefore does not close the underlying descriptor.
/// NOTE(review): presumably used for descriptors that are owned by the Ruby side — confirm
/// against the call sites.
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);

/// Every method forwards straight to the wrapped [`File`]'s own `Read` implementation.
impl io::Read for ForgottenFileHandle {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        io::Read::read(&mut *self.0, buf)
    }

    fn read_vectored(&mut self, bufs: &mut [io::IoSliceMut<'_>]) -> io::Result<usize> {
        io::Read::read_vectored(&mut *self.0, bufs)
    }

    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        io::Read::read_to_end(&mut *self.0, buf)
    }

    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
        io::Read::read_to_string(&mut *self.0, buf)
    }
}
@@ -1,43 +1,91 @@
1
- use super::READ_BUFFER_SIZE;
2
1
  use magnus::{
3
2
  value::{Opaque, ReprValue},
4
3
  RClass, RString, Ruby, Value,
5
4
  };
6
- use std::io::{self, Read};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
7
6
  use std::sync::OnceLock;
8
7
 
9
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
10
9
 
11
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
12
11
  /// and provide a standard Read implementation for them.
13
- pub struct RubyReader<'a, T> {
14
- #[allow(unused)]
15
- ruby: &'a Ruby,
12
+ pub struct RubyReader<T> {
16
13
  inner: T,
17
- buffer: Option<Vec<u8>>,
18
14
  offset: usize,
19
- // Number of bytes that have been read into the buffer
20
- // Used as an upper bound for offset
21
- buffered_bytes: usize,
22
15
  }
23
16
 
24
- pub fn build_ruby_reader<'a>(
25
- ruby: &'a Ruby,
17
+ pub trait SeekableRead: std::io::Read + Seek {}
18
+ impl SeekableRead for RubyReader<Value> {}
19
+ impl SeekableRead for RubyReader<RString> {}
20
+
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
26
23
  input: Value,
27
- ) -> Result<Box<dyn Read + 'a>, magnus::Error> {
24
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
28
25
  if RubyReader::is_string_io(ruby, &input) {
29
26
  RubyReader::from_string_io(ruby, input)
30
27
  } else if RubyReader::is_io_like(&input) {
31
- RubyReader::from_io(ruby, input)
28
+ RubyReader::from_io(input)
32
29
  } else {
33
- RubyReader::from_string_like(ruby, input)
30
+ RubyReader::from_string_like(input)
34
31
  }
35
32
  }
36
33
 
37
- impl<'a> RubyReader<'a, Value> {
38
- fn from_io(ruby: &'a Ruby, input: Value) -> Result<Box<dyn Read + 'a>, magnus::Error> {
34
+ impl Seek for RubyReader<Value> {
35
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
40
+ };
41
+
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
50
+
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
54
+
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
59
+
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
69
+ }
70
+ }
71
+
72
+ impl Seek for RubyReader<RString> {
73
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
74
+ match pos {
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => {
78
+ self.offset = self.inner.len() - offset as usize
79
+ }
80
+ }
81
+ Ok(self.offset as u64)
82
+ }
83
+ }
84
+
85
+ impl RubyReader<Value> {
86
+ fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
39
87
  if Self::is_io_like(&input) {
40
- Ok(Box::new(Self::from_io_like(ruby, input)))
88
+ Ok(Box::new(Self::from_io_like(input)))
41
89
  } else {
42
90
  Err(magnus::Error::new(
43
91
  magnus::exception::type_error(),
@@ -50,70 +98,19 @@ impl<'a> RubyReader<'a, Value> {
50
98
  input.respond_to("read", false).unwrap_or(false)
51
99
  }
52
100
 
53
- fn from_io_like(ruby: &'a Ruby, input: Value) -> Self {
101
+ fn from_io_like(input: Value) -> Self {
54
102
  Self {
55
- ruby,
56
103
  inner: input,
57
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
58
104
  offset: 0,
59
- buffered_bytes: 0,
60
105
  }
61
106
  }
62
-
63
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
64
- if let Some(from_buf) = &self.buffer {
65
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
66
- if self.offset < self.buffered_bytes {
67
- let remaining = self.buffered_bytes - self.offset;
68
- let copy_size = remaining.min(to_buf.len());
69
- to_buf[..copy_size]
70
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
71
- self.offset += copy_size;
72
- Some(Ok(copy_size))
73
- } else {
74
- None
75
- }
76
- } else {
77
- None
78
- }
79
- }
80
-
81
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
82
- let buffer = self.buffer.as_mut().unwrap();
83
- let result = self
84
- .inner
85
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
86
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
87
-
88
- if result.is_nil() {
89
- return Ok(0); // EOF
90
- }
91
-
92
- let bytes = unsafe { result.as_slice() };
93
-
94
- // Update internal buffer
95
- let bytes_len = bytes.len();
96
- if bytes_len == 0 {
97
- return Ok(0);
98
- }
99
-
100
- // Only copy what we actually read
101
- buffer[..bytes_len].copy_from_slice(bytes);
102
- self.buffered_bytes = bytes_len;
103
-
104
- // Copy to output buffer
105
- let copy_size = bytes_len.min(buf.len());
106
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
107
- self.offset = copy_size;
108
- Ok(copy_size)
109
- }
110
107
  }
111
108
 
112
- impl<'a> RubyReader<'a, RString> {
109
+ impl RubyReader<RString> {
113
110
  pub fn from_string_io(
114
- ruby: &'a Ruby,
111
+ ruby: &Ruby,
115
112
  input: Value,
116
- ) -> Result<Box<dyn Read + 'a>, magnus::Error> {
113
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
117
114
  if !Self::is_string_io(ruby, &input) {
118
115
  return Err(magnus::Error::new(
119
116
  magnus::exception::type_error(),
@@ -123,11 +120,8 @@ impl<'a> RubyReader<'a, RString> {
123
120
 
124
121
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
125
122
  Ok(Box::new(Self {
126
- ruby,
127
123
  inner: string_content,
128
- buffer: None,
129
124
  offset: 0,
130
- buffered_bytes: 0,
131
125
  }))
132
126
  }
133
127
 
@@ -139,33 +133,32 @@ impl<'a> RubyReader<'a, RString> {
139
133
  input.is_kind_of(ruby.get_inner(*string_io_class))
140
134
  }
141
135
 
142
- fn from_string_like(ruby: &'a Ruby, input: Value) -> Result<Box<dyn Read + 'a>, magnus::Error> {
136
+ fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
143
137
  // Try calling `to_str`, and if that fails, try `to_s`
144
138
  let string_content = input
145
139
  .funcall::<_, _, RString>("to_str", ())
146
140
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
147
141
  Ok(Box::new(Self {
148
- ruby,
149
142
  inner: string_content,
150
- buffer: None,
151
143
  offset: 0,
152
- buffered_bytes: 0,
153
144
  }))
154
145
  }
155
146
  }
156
147
 
157
- impl<'a> Read for RubyReader<'a, Value> {
158
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
159
- if let Some(result) = self.read_from_buffer(buf) {
160
- result
161
- } else {
162
- // If the buffer is empty, read from Ruby
163
- self.read_from_ruby(buf)
164
- }
148
+ impl Read for RubyReader<Value> {
149
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
150
+ let bytes = self
151
+ .inner
152
+ .funcall::<_, _, RString>("read", (buf.len(),))
153
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
154
+
155
+ buf.write_all(unsafe { bytes.as_slice() })?;
156
+
157
+ Ok(bytes.len())
165
158
  }
166
159
  }
167
160
 
168
- impl<'a> Read for RubyReader<'a, RString> {
161
+ impl Read for RubyReader<RString> {
169
162
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
170
163
  let string_buffer = unsafe { self.inner.as_slice() };
171
164
  if self.offset >= string_buffer.len() {
data/ext/osv/src/lib.rs CHANGED
@@ -1,3 +1,4 @@
1
+ mod allocator;
1
2
  mod csv;
2
3
  mod reader;
3
4
  mod utils;
@@ -1,19 +1,19 @@
1
1
  use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder};
2
2
  use crate::utils::*;
3
+ use ahash::RandomState;
3
4
  use csv::Trim;
4
5
  use magnus::value::ReprValue;
5
6
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
6
7
  use std::collections::HashMap;
7
- use xxhash_rust::xxh3::Xxh3Builder;
8
8
 
9
- pub fn parse_csv<'a>(
9
+ pub fn parse_csv(
10
10
  rb_self: Value,
11
11
  args: &[Value],
12
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, Xxh3Builder>>>>, Error> {
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
13
13
  let original = unsafe { Ruby::get_unchecked() };
14
14
  let ruby: &'static Ruby = Box::leak(Box::new(original));
15
15
 
16
- let CsvArgs {
16
+ let ReadCsvArgs {
17
17
  to_read,
18
18
  has_headers,
19
19
  delimiter,
@@ -24,7 +24,7 @@ pub fn parse_csv<'a>(
24
24
  flexible,
25
25
  flexible_default,
26
26
  trim,
27
- } = parse_csv_args(&ruby, args)?;
27
+ } = parse_read_csv_args(ruby, args)?;
28
28
 
29
29
  let flexible_default: &'static Option<String> = Box::leak(Box::new(flexible_default));
30
30
  let leaked_flexible_default: &'static Option<&str> =
@@ -51,11 +51,11 @@ pub fn parse_csv<'a>(
51
51
  });
52
52
  }
53
53
 
54
- let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
54
+ let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type.as_str() {
55
55
  "hash" => {
56
56
  let builder = RecordReaderBuilder::<
57
- HashMap<&'static str, Option<CowValue<'static>>, Xxh3Builder>,
58
- >::new(&ruby, to_read)
57
+ HashMap<&'static str, Option<CowValue<'static>>, RandomState>,
58
+ >::new(ruby, to_read)
59
59
  .has_headers(has_headers)
60
60
  .flexible(flexible)
61
61
  .flexible_default(flexible_default.as_deref())
@@ -68,7 +68,7 @@ pub fn parse_csv<'a>(
68
68
  Box::new(builder.build_threaded()?.map(CsvRecord::Map))
69
69
  }
70
70
  "array" => Box::new(
71
- RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(&ruby, to_read)
71
+ RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(ruby, to_read)
72
72
  .has_headers(has_headers)
73
73
  .flexible(flexible)
74
74
  .flexible_default(flexible_default.as_deref())
@@ -107,7 +107,7 @@ struct EnumeratorArgs {
107
107
 
108
108
  fn create_enumerator(
109
109
  args: EnumeratorArgs,
110
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, Xxh3Builder>>>>, Error> {
110
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
111
111
  let kwargs = RHash::new();
112
112
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
113
113
  kwargs.aset(
data/ext/osv/src/utils.rs CHANGED
@@ -13,12 +13,12 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
13
13
  RString::from_value(value)
14
14
  .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
15
15
  .to_string()
16
- .map(|s| Some(s))
16
+ .map(Some)
17
17
  } else if value.is_kind_of(ruby.class_symbol()) {
18
18
  Symbol::from_value(value)
19
19
  .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
20
20
  .funcall("to_s", ())
21
- .map(|s| Some(s))
21
+ .map(Some)
22
22
  } else {
23
23
  Err(Error::new(
24
24
  magnus::exception::type_error(),
@@ -28,7 +28,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
28
28
  }
29
29
 
30
30
  #[derive(Debug)]
31
- pub struct CsvArgs {
31
+ pub struct ReadCsvArgs {
32
32
  pub to_read: Value,
33
33
  pub has_headers: bool,
34
34
  pub delimiter: u8,
@@ -42,7 +42,7 @@ pub struct CsvArgs {
42
42
  }
43
43
 
44
44
  /// Parse common arguments for CSV parsing
45
- pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
45
+ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, Error> {
46
46
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
47
47
  let (to_read,) = parsed_args.required;
48
48
 
@@ -166,7 +166,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
166
166
  None => csv::Trim::None,
167
167
  };
168
168
 
169
- Ok(CsvArgs {
169
+ Ok(ReadCsvArgs {
170
170
  to_read,
171
171
  has_headers,
172
172
  delimiter,
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.15"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-02 00:00:00.000000000 Z
11
+ date: 2025-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -59,12 +59,14 @@ files:
59
59
  - Rakefile
60
60
  - ext/osv/Cargo.toml
61
61
  - ext/osv/extconf.rb
62
+ - ext/osv/src/allocator.rs
62
63
  - ext/osv/src/csv/builder.rs
63
64
  - ext/osv/src/csv/header_cache.rs
64
65
  - ext/osv/src/csv/mod.rs
65
66
  - ext/osv/src/csv/parser.rs
66
67
  - ext/osv/src/csv/record.rs
67
68
  - ext/osv/src/csv/record_reader.rs
69
+ - ext/osv/src/csv/ruby_integration.rs
68
70
  - ext/osv/src/csv/ruby_reader.rs
69
71
  - ext/osv/src/lib.rs
70
72
  - ext/osv/src/reader.rs