osv 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 689f28c935746890aa680fd2f649076a36d6ce233d4cbf2717dc129174b593dc
4
- data.tar.gz: 45ddaa6774a9a4e9391d000b30b6e92afb8560b81821d8fec363d54283bac6d9
3
+ metadata.gz: 91401989a8532162a9731fed3cb07661c0676105f77465da23f9a267773e7651
4
+ data.tar.gz: aeba48f1338a4160044e8c7264f80eb065d950567288bded39acf5d9bc593d7b
5
5
  SHA512:
6
- metadata.gz: 74c2052ea9cbc61ddef5d1c46abdd5e4cdf7c60c946c421e4b8da7c160ba3f3eb761842279cd9f066aa6a1aa2214d0ef9ba9ff11c46294e8e1d4ebbb95161d70
7
- data.tar.gz: 5a795e5fa6d84b39082c2754dea655cd5b4f8a00558627fb64f661a14ec32daa8ea7b31a13724291e11531d03c8c5fa1bdb928c6d9422b87fcbb5b5aba7daad5
6
+ metadata.gz: 8d2ea3f724a6f7af317bb1ae865513c15f2ef0e475b070e7f9ae2e1b4155b2d82090387beb0c6a2e5cb8664b1f6dd0cf61e6ad9545957bc3ada1a3e87758b1ee
7
+ data.tar.gz: 0eaa86241092c14f4c2973d74e65877b7f3f87487a2681b9a094054f98db759772bcf012ec2f4fa073bd16f2b02927212b13afec484f84daf764d3b3e0811b6b
data/Cargo.lock CHANGED
@@ -1,6 +1,6 @@
1
1
  # This file is automatically @generated by Cargo.
2
2
  # It is not intended for manual editing.
3
- version = 3
3
+ version = 4
4
4
 
5
5
  [[package]]
6
6
  name = "adler2"
@@ -8,6 +8,19 @@ version = "2.0.0"
8
8
  source = "registry+https://github.com/rust-lang/crates.io-index"
9
9
  checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
10
10
 
11
+ [[package]]
12
+ name = "ahash"
13
+ version = "0.8.11"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
16
+ dependencies = [
17
+ "cfg-if",
18
+ "getrandom",
19
+ "once_cell",
20
+ "version_check",
21
+ "zerocopy",
22
+ ]
23
+
11
24
  [[package]]
12
25
  name = "aho-corasick"
13
26
  version = "1.1.3"
@@ -32,7 +45,7 @@ dependencies = [
32
45
  "bitflags",
33
46
  "cexpr",
34
47
  "clang-sys",
35
- "itertools",
48
+ "itertools 0.12.1",
36
49
  "lazy_static",
37
50
  "lazycell",
38
51
  "proc-macro2",
@@ -49,6 +62,15 @@ version = "2.6.0"
49
62
  source = "registry+https://github.com/rust-lang/crates.io-index"
50
63
  checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
51
64
 
65
+ [[package]]
66
+ name = "cc"
67
+ version = "1.2.7"
68
+ source = "registry+https://github.com/rust-lang/crates.io-index"
69
+ checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7"
70
+ dependencies = [
71
+ "shlex",
72
+ ]
73
+
52
74
  [[package]]
53
75
  name = "cexpr"
54
76
  version = "0.6.0"
@@ -127,6 +149,17 @@ version = "0.3.31"
127
149
  source = "registry+https://github.com/rust-lang/crates.io-index"
128
150
  checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
129
151
 
152
+ [[package]]
153
+ name = "getrandom"
154
+ version = "0.2.15"
155
+ source = "registry+https://github.com/rust-lang/crates.io-index"
156
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
157
+ dependencies = [
158
+ "cfg-if",
159
+ "libc",
160
+ "wasi",
161
+ ]
162
+
130
163
  [[package]]
131
164
  name = "glob"
132
165
  version = "0.3.1"
@@ -142,12 +175,41 @@ dependencies = [
142
175
  "either",
143
176
  ]
144
177
 
178
+ [[package]]
179
+ name = "itertools"
180
+ version = "0.14.0"
181
+ source = "registry+https://github.com/rust-lang/crates.io-index"
182
+ checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
183
+ dependencies = [
184
+ "either",
185
+ ]
186
+
145
187
  [[package]]
146
188
  name = "itoa"
147
189
  version = "1.0.14"
148
190
  source = "registry+https://github.com/rust-lang/crates.io-index"
149
191
  checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
150
192
 
193
+ [[package]]
194
+ name = "jemalloc-sys"
195
+ version = "0.5.4+5.3.0-patched"
196
+ source = "registry+https://github.com/rust-lang/crates.io-index"
197
+ checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
198
+ dependencies = [
199
+ "cc",
200
+ "libc",
201
+ ]
202
+
203
+ [[package]]
204
+ name = "jemallocator"
205
+ version = "0.5.4"
206
+ source = "registry+https://github.com/rust-lang/crates.io-index"
207
+ checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
208
+ dependencies = [
209
+ "jemalloc-sys",
210
+ "libc",
211
+ ]
212
+
151
213
  [[package]]
152
214
  name = "kanal"
153
215
  version = "0.1.0-pre8"
@@ -186,6 +248,16 @@ dependencies = [
186
248
  "windows-targets",
187
249
  ]
188
250
 
251
+ [[package]]
252
+ name = "libmimalloc-sys"
253
+ version = "0.1.39"
254
+ source = "registry+https://github.com/rust-lang/crates.io-index"
255
+ checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
256
+ dependencies = [
257
+ "cc",
258
+ "libc",
259
+ ]
260
+
189
261
  [[package]]
190
262
  name = "lock_api"
191
263
  version = "0.4.12"
@@ -237,6 +309,15 @@ version = "2.7.4"
237
309
  source = "registry+https://github.com/rust-lang/crates.io-index"
238
310
  checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
239
311
 
312
+ [[package]]
313
+ name = "mimalloc"
314
+ version = "0.1.43"
315
+ source = "registry+https://github.com/rust-lang/crates.io-index"
316
+ checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
317
+ dependencies = [
318
+ "libmimalloc-sys",
319
+ ]
320
+
240
321
  [[package]]
241
322
  name = "minimal-lexical"
242
323
  version = "0.2.1"
@@ -262,19 +343,28 @@ dependencies = [
262
343
  "minimal-lexical",
263
344
  ]
264
345
 
346
+ [[package]]
347
+ name = "once_cell"
348
+ version = "1.20.2"
349
+ source = "registry+https://github.com/rust-lang/crates.io-index"
350
+ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
351
+
265
352
  [[package]]
266
353
  name = "osv"
267
354
  version = "0.1.0"
268
355
  dependencies = [
356
+ "ahash",
269
357
  "csv",
270
358
  "flate2",
359
+ "itertools 0.14.0",
360
+ "jemallocator",
271
361
  "kanal",
272
362
  "magnus 0.7.1",
363
+ "mimalloc",
273
364
  "rb-sys",
274
365
  "serde",
275
366
  "serde_magnus",
276
367
  "thiserror",
277
- "xxhash-rust",
278
368
  ]
279
369
 
280
370
  [[package]]
@@ -464,6 +554,18 @@ version = "1.0.14"
464
554
  source = "registry+https://github.com/rust-lang/crates.io-index"
465
555
  checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
466
556
 
557
+ [[package]]
558
+ name = "version_check"
559
+ version = "0.9.5"
560
+ source = "registry+https://github.com/rust-lang/crates.io-index"
561
+ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
562
+
563
+ [[package]]
564
+ name = "wasi"
565
+ version = "0.11.0+wasi-snapshot-preview1"
566
+ source = "registry+https://github.com/rust-lang/crates.io-index"
567
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
568
+
467
569
  [[package]]
468
570
  name = "windows-targets"
469
571
  version = "0.52.6"
@@ -529,7 +631,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
529
631
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
632
 
531
633
  [[package]]
532
- name = "xxhash-rust"
533
- version = "0.8.14"
634
+ name = "zerocopy"
635
+ version = "0.7.35"
534
636
  source = "registry+https://github.com/rust-lang/crates.io-index"
535
- checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
637
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
638
+ dependencies = [
639
+ "zerocopy-derive",
640
+ ]
641
+
642
+ [[package]]
643
+ name = "zerocopy-derive"
644
+ version = "0.7.35"
645
+ source = "registry+https://github.com/rust-lang/crates.io-index"
646
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
647
+ dependencies = [
648
+ "proc-macro2",
649
+ "quote",
650
+ "syn",
651
+ ]
data/README.md CHANGED
@@ -118,7 +118,7 @@ This library is faster than the standard Ruby CSV library. It's also faster than
118
118
 
119
119
  Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
120
120
 
121
- ### 1,000,000 lines
121
+ ### 1,000,000 records
122
122
 
123
123
  ```
124
124
  🏃 Running benchmarks...
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
142
142
  FastCSV - Gzipped 1.000 i/100ms
143
143
  CSV - Gzipped 1.000 i/100ms
144
144
  Calculating -------------------------------------
145
- CSV - StringIO 0.083 (± 0.0%) i/s (12.01 s/i) - 3.000 in 36.028672s
146
- FastCSV - StringIO 0.366 (± 0.0%) i/s (2.73 s/i) - 11.000 in 30.032350s
147
- OSV - StringIO 0.522 (± 0.0%) i/s (1.92 s/i) - 16.000 in 30.655768s
148
- CSV - Hash output 0.062 (± 0.0%) i/s (16.16 s/i) - 2.000 in 32.311990s
149
- OSV - Hash output 0.273 (± 0.0%) i/s (3.66 s/i) - 9.000 in 32.924970s
150
- CSV - Array output 0.069 (± 0.0%) i/s (14.50 s/i) - 3.000 in 43.488185s
151
- OSV - Array output 0.601 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.636782s
145
+ CSV - StringIO 0.080 (± 0.0%) i/s (12.43 s/i) - 3.000 in 37.301114s
146
+ FastCSV - StringIO 0.368 (± 0.0%) i/s (2.72 s/i) - 12.000 in 32.619020s
147
+ OSV - StringIO 0.699 (± 0.0%) i/s (1.43 s/i) - 21.000 in 30.091225s
148
+ CSV - Hash output 0.059 (± 0.0%) i/s (16.95 s/i) - 2.000 in 33.908533s
149
+ OSV - Hash output 0.329 (± 0.0%) i/s (3.04 s/i) - 10.000 in 30.551275s
150
+ CSV - Array output 0.066 (± 0.0%) i/s (15.18 s/i) - 2.000 in 30.357327s
151
+ OSV - Array output 0.632 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.150113s
152
152
  FastCSV - Array output
153
- 0.356 (± 0.0%) i/s (2.81 s/i) - 11.000 in 30.871931s
153
+ 0.350 (± 0.0%) i/s (2.86 s/i) - 11.000 in 31.477268s
154
154
  OSV - Direct Open Array output
155
- 0.604 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.469190s
156
- OSV - Gzipped 0.424 (± 0.0%) i/s (2.36 s/i) - 13.000 in 30.642322s
157
- OSV - Gzipped Direct 0.636 (± 0.0%) i/s (1.57 s/i) - 20.000 in 31.424083s
158
- FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.10 s/i) - 10.000 in 30.990648s
159
- CSV - Gzipped 0.058 (± 0.0%) i/s (17.11 s/i) - 2.000 in 34.228691s
155
+ 0.641 (± 0.0%) i/s (1.56 s/i) - 20.000 in 31.275201s
156
+ OSV - Gzipped 0.530 (± 0.0%) i/s (1.89 s/i) - 16.000 in 30.183753s
157
+ OSV - Gzipped Direct 0.727 (± 0.0%) i/s (1.37 s/i) - 22.000 in 30.283991s
158
+ FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.949600s
159
+ CSV - Gzipped 0.056 (± 0.0%) i/s (17.72 s/i) - 2.000 in 35.440473s
160
160
 
161
161
  Comparison:
162
- OSV - Gzipped Direct: 0.6 i/s
163
- OSV - Direct Open Array output: 0.6 i/s - 1.05x slower
164
- OSV - Array output: 0.6 i/s - 1.06x slower
165
- OSV - StringIO: 0.5 i/s - 1.22x slower
166
- OSV - Gzipped: 0.4 i/s - 1.50x slower
167
- FastCSV - StringIO: 0.4 i/s - 1.74x slower
168
- FastCSV - Array output: 0.4 i/s - 1.79x slower
169
- FastCSV - Gzipped: 0.3 i/s - 1.97x slower
170
- OSV - Hash output: 0.3 i/s - 2.33x slower
171
- CSV - StringIO: 0.1 i/s - 7.64x slower
172
- CSV - Array output: 0.1 i/s - 9.23x slower
173
- CSV - Hash output: 0.1 i/s - 10.28x slower
174
- CSV - Gzipped: 0.1 i/s - 10.89x slower
162
+ OSV - Gzipped Direct: 0.7 i/s
163
+ OSV - StringIO: 0.7 i/s - 1.04x slower
164
+ OSV - Direct Open Array output: 0.6 i/s - 1.14x slower
165
+ OSV - Array output: 0.6 i/s - 1.15x slower
166
+ OSV - Gzipped: 0.5 i/s - 1.37x slower
167
+ FastCSV - StringIO: 0.4 i/s - 1.98x slower
168
+ FastCSV - Array output: 0.3 i/s - 2.08x slower
169
+ OSV - Hash output: 0.3 i/s - 2.21x slower
170
+ FastCSV - Gzipped: 0.3 i/s - 2.25x slower
171
+ CSV - StringIO: 0.1 i/s - 9.04x slower
172
+ CSV - Array output: 0.1 i/s - 11.04x slower
173
+ CSV - Hash output: 0.1 i/s - 12.33x slower
174
+ CSV - Gzipped: 0.1 i/s - 12.89x slower
175
175
  ```
data/ext/osv/Cargo.toml CHANGED
@@ -7,6 +7,7 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
+ ahash = "0.8"
10
11
  csv = "^1.3"
11
12
  flate2 = "1.0.35"
12
13
  kanal = "0.1.0-pre8"
@@ -15,4 +16,10 @@ rb-sys = "^0.9"
15
16
  serde = { version = "1.0", features = ["derive"] }
16
17
  serde_magnus = "0.8.1"
17
18
  thiserror = "2.0"
18
- xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
19
+ itertools = "^0.14"
20
+
21
+ [target.'cfg(target_os = "linux")'.dependencies]
22
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
23
+
24
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
25
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,13 @@
1
+ #[cfg(target_os = "linux")]
2
+ use jemallocator::Jemalloc;
3
+
4
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
5
+ use mimalloc::MiMalloc;
6
+
7
+ #[global_allocator]
8
+ #[cfg(target_os = "linux")]
9
+ static ALLOC: Jemalloc = Jemalloc;
10
+
11
+ #[global_allocator]
12
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
13
+ static ALLOC: MiMalloc = MiMalloc;
@@ -2,7 +2,8 @@ use super::{
2
2
  header_cache::{CacheError, StringCache},
3
3
  parser::RecordParser,
4
4
  record_reader::{RecordReader, READ_BUFFER_SIZE},
5
- ruby_reader::build_ruby_reader,
5
+ ruby_reader::{build_ruby_reader, SeekableRead},
6
+ ForgottenFileHandle,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -10,6 +11,7 @@ use std::{
10
11
  fs::File,
11
12
  io::{self, BufReader, Read},
12
13
  marker::PhantomData,
14
+ mem::ManuallyDrop,
13
15
  os::fd::FromRawFd,
14
16
  };
15
17
 
@@ -64,7 +66,6 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
64
66
  fn build_multi_threaded(
65
67
  self,
66
68
  readable: Box<dyn Read + Send + 'static>,
67
- should_forget: bool,
68
69
  ) -> Result<RecordReader<'static, T>, ReaderError> {
69
70
  let flexible = self.flexible || self.flexible_default.is_some();
70
71
  let mut reader = csv::ReaderBuilder::new()
@@ -84,21 +85,20 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
84
85
  self.buffer,
85
86
  self.null_string,
86
87
  self.flexible_default,
87
- should_forget,
88
88
  ))
89
89
  }
90
90
 
91
91
  pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
92
92
  if self.to_read.is_kind_of(self.ruby.class_io()) {
93
93
  let readable = self.handle_file_descriptor()?;
94
- self.build_multi_threaded(readable, true)
94
+ self.build_multi_threaded(readable)
95
95
  } else if self.to_read.is_kind_of(self.ruby.class_string()) {
96
96
  let readable = self.handle_file_path()?;
97
- self.build_multi_threaded(readable, false)
97
+ self.build_multi_threaded(readable)
98
98
  } else {
99
99
  let readable = build_ruby_reader(self.ruby, self.to_read)?;
100
-
101
- self.build_single_threaded(readable)
100
+ let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
101
+ self.build_single_threaded(buffered_reader)
102
102
  }
103
103
  }
104
104
  }
@@ -172,7 +172,11 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
172
172
  }
173
173
 
174
174
  let file = unsafe { File::from_raw_fd(fd) };
175
- Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
175
+ let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
176
+ Ok(Box::new(BufReader::with_capacity(
177
+ READ_BUFFER_SIZE,
178
+ forgotten,
179
+ )))
176
180
  }
177
181
 
178
182
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -191,9 +195,10 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
191
195
 
192
196
  fn build_single_threaded(
193
197
  self,
194
- readable: Box<dyn Read + 'a>,
198
+ readable: BufReader<Box<dyn SeekableRead>>,
195
199
  ) -> Result<RecordReader<'a, T>, ReaderError> {
196
200
  let flexible = self.flexible || self.flexible_default.is_some();
201
+
197
202
  let mut reader = csv::ReaderBuilder::new()
198
203
  .has_headers(self.has_headers)
199
204
  .delimiter(self.delimiter)
@@ -1,3 +1,4 @@
1
+ use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
1
2
  /// This module exists to avoid cloning header keys in returned HashMaps.
2
3
  /// Since the underlying RString creation already involves cloning,
3
4
  /// this caching layer aims to reduce redundant allocations.
@@ -6,7 +7,7 @@
6
7
  /// so this optimization could be removed if any issues arise.
7
8
  use std::{
8
9
  collections::HashMap,
9
- sync::{atomic::AtomicU32, LazyLock, Mutex},
10
+ sync::{atomic::AtomicU32, atomic::Ordering, LazyLock, Mutex},
10
11
  };
11
12
  use thiserror::Error;
12
13
 
@@ -16,64 +17,116 @@ pub enum CacheError {
16
17
  LockError(String),
17
18
  }
18
19
 
19
- static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
20
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
20
21
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
21
22
 
22
23
  pub struct StringCache;
23
24
 
25
+ #[derive(Copy, Clone)]
26
+ pub struct StringCacheKey(Opaque<FString>, &'static str);
27
+
28
+ impl StringCacheKey {
29
+ pub fn new(string: &str) -> Self {
30
+ let rstr = RString::new(string);
31
+ let fstr = rstr.to_interned_str();
32
+ Self(Opaque::from(fstr), fstr.as_str().unwrap())
33
+ }
34
+ }
35
+
36
+ impl AsRef<str> for StringCacheKey {
37
+ fn as_ref(&self) -> &'static str {
38
+ self.1
39
+ }
40
+ }
41
+
42
+ impl IntoValue for StringCacheKey {
43
+ fn into_value_with(self, handle: &Ruby) -> Value {
44
+ handle.into_value(self.0)
45
+ }
46
+ }
47
+
48
+ impl std::fmt::Debug for StringCacheKey {
49
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
50
+ self.1.fmt(f)
51
+ }
52
+ }
53
+
54
+ impl PartialEq for StringCacheKey {
55
+ fn eq(&self, other: &Self) -> bool {
56
+ self.1 == other.1
57
+ }
58
+ }
59
+
60
+ impl std::cmp::Eq for StringCacheKey {}
61
+
62
+ impl std::hash::Hash for StringCacheKey {
63
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
64
+ self.1.hash(state);
65
+ }
66
+ }
67
+
24
68
  impl StringCache {
25
69
  #[allow(dead_code)]
26
- pub fn intern(string: String) -> Result<&'static str, CacheError> {
70
+ pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
27
71
  let mut cache = STRING_CACHE
28
72
  .lock()
29
73
  .map_err(|e| CacheError::LockError(e.to_string()))?;
30
74
 
31
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
32
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
33
- Ok(existing)
75
+ if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
76
+ counter.fetch_add(1, Ordering::Relaxed);
77
+ Ok(*interned_string)
34
78
  } else {
79
+ let interned = StringCacheKey::new(string.as_str());
35
80
  let leaked = Box::leak(string.into_boxed_str());
36
- cache.insert(leaked, AtomicU32::new(1));
37
- Ok(leaked)
81
+ cache.insert(leaked, (interned, AtomicU32::new(1)));
82
+ Ok(interned)
38
83
  }
39
84
  }
40
85
 
41
- pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
86
+ pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
42
87
  let mut cache = STRING_CACHE
43
88
  .lock()
44
89
  .map_err(|e| CacheError::LockError(e.to_string()))?;
45
90
 
46
- let mut result = Vec::with_capacity(strings.len());
91
+ let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
47
92
  for string in strings {
48
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
49
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
50
- result.push(existing);
93
+ if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
94
+ counter.fetch_add(1, Ordering::Relaxed);
95
+ result.push(*interned_string);
51
96
  } else {
97
+ let interned = StringCacheKey::new(&string);
52
98
  let leaked = Box::leak(string.clone().into_boxed_str());
53
- cache.insert(leaked, AtomicU32::new(1));
54
- result.push(leaked);
99
+ cache.insert(leaked, (interned, AtomicU32::new(1)));
100
+ result.push(interned);
55
101
  }
56
102
  }
57
103
  Ok(result)
58
104
  }
59
105
 
60
- pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
106
+ pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
61
107
  let mut cache = STRING_CACHE
62
108
  .lock()
63
109
  .map_err(|e| CacheError::LockError(e.to_string()))?;
64
110
 
65
- for header in headers {
66
- if let Some(count) = cache.get(header) {
67
- // Returns the previous value of the counter
68
- let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
69
- if was == 1 {
70
- cache.remove(header);
71
- let ptr = *header as *const str as *mut str;
72
- unsafe {
73
- let _ = Box::from_raw(ptr);
111
+ let to_remove: Vec<_> = headers
112
+ .iter()
113
+ .filter_map(|header| {
114
+ let key = header.as_ref();
115
+ if let Some((_, (_, counter))) = cache.get_key_value(key) {
116
+ let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
117
+ if prev_count == 1 {
118
+ Some(key)
119
+ } else {
120
+ None
74
121
  }
122
+ } else {
123
+ None
75
124
  }
76
- }
125
+ })
126
+ .collect();
127
+
128
+ for key in to_remove {
129
+ cache.remove(key);
77
130
  }
78
131
 
79
132
  Ok(())
@@ -3,10 +3,12 @@ mod header_cache;
3
3
  mod parser;
4
4
  mod record;
5
5
  mod record_reader;
6
+ mod ruby_integration;
6
7
  mod ruby_reader;
7
8
 
8
9
  pub use builder::RecordReaderBuilder;
9
10
  pub(crate) use builder::BUFFER_CHANNEL_SIZE;
11
+ pub use header_cache::StringCacheKey;
10
12
  pub use record::CowValue;
11
13
  pub use record::CsvRecord;
12
- pub(crate) use record_reader::READ_BUFFER_SIZE;
14
+ pub use ruby_integration::*;
@@ -2,13 +2,14 @@ use std::borrow::Cow;
2
2
  use std::collections::HashMap;
3
3
  use std::hash::BuildHasher;
4
4
 
5
+ use super::header_cache::StringCacheKey;
5
6
  use super::CowValue;
6
7
 
7
8
  pub trait RecordParser<'a> {
8
9
  type Output: 'a;
9
10
 
10
11
  fn parse(
11
- headers: &[&'static str],
12
+ headers: &[StringCacheKey],
12
13
  record: &csv::StringRecord,
13
14
  null_string: Option<&str>,
14
15
  flexible_default: Option<Cow<'a, str>>,
@@ -16,13 +17,13 @@ pub trait RecordParser<'a> {
16
17
  }
17
18
 
18
19
  impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
19
- for HashMap<&'static str, Option<CowValue<'a>>, S>
20
+ for HashMap<StringCacheKey, Option<CowValue<'a>>, S>
20
21
  {
21
22
  type Output = Self;
22
23
 
23
24
  #[inline]
24
25
  fn parse(
25
- headers: &[&'static str],
26
+ headers: &[StringCacheKey],
26
27
  record: &csv::StringRecord,
27
28
  null_string: Option<&str>,
28
29
  flexible_default: Option<Cow<'a, str>>,
@@ -30,8 +31,8 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
30
31
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
31
32
 
32
33
  let shared_empty = Cow::Borrowed("");
33
- let shared_default = flexible_default.map(|f| CowValue(f));
34
- headers.iter().enumerate().for_each(|(i, &header)| {
34
+ let shared_default = flexible_default.map(CowValue);
35
+ headers.iter().enumerate().for_each(|(i, ref header)| {
35
36
  let value = record.get(i).map_or_else(
36
37
  || shared_default.clone(),
37
38
  |field| {
@@ -44,7 +45,7 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
44
45
  }
45
46
  },
46
47
  );
47
- map.insert(header, value);
48
+ map.insert((*header).clone(), value);
48
49
  });
49
50
  map
50
51
  }
@@ -55,7 +56,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
55
56
 
56
57
  #[inline]
57
58
  fn parse(
58
- headers: &[&'static str],
59
+ headers: &[StringCacheKey],
59
60
  record: &csv::StringRecord,
60
61
  null_string: Option<&str>,
61
62
  flexible_default: Option<Cow<'a, str>>,
@@ -64,7 +65,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
64
65
  let mut vec = Vec::with_capacity(target_len);
65
66
 
66
67
  let shared_empty = Cow::Borrowed("");
67
- let shared_default = flexible_default.map(|f| CowValue(f));
68
+ let shared_default = flexible_default.map(CowValue);
68
69
 
69
70
  for field in record.iter() {
70
71
  let value = if Some(field) == null_string {
@@ -1,13 +1,16 @@
1
- use magnus::{IntoValue, Ruby, Value};
1
+ use itertools::Itertools;
2
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
2
3
  use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
3
4
 
5
+ use super::StringCacheKey;
6
+
4
7
  #[derive(Debug)]
5
8
  pub enum CsvRecord<'a, S: BuildHasher + Default> {
6
9
  Vec(Vec<Option<CowValue<'a>>>),
7
- Map(HashMap<&'static str, Option<CowValue<'a>>, S>),
10
+ Map(HashMap<StringCacheKey, Option<CowValue<'a>>, S>),
8
11
  }
9
12
 
10
- impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
13
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
11
14
  #[inline]
12
15
  fn into_value_with(self, handle: &Ruby) -> Value {
13
16
  match self {
@@ -19,9 +22,23 @@ impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
19
22
  CsvRecord::Map(map) => {
20
23
  // Pre-allocate the hash with the known size
21
24
  let hash = handle.hash_new_capa(map.len());
22
- map.into_iter()
23
- .try_for_each(|(k, v)| hash.aset(k, v))
24
- .unwrap();
25
+
26
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
27
+ let mut i = 0;
28
+
29
+ for chunk in &map.into_iter().chunks(128) {
30
+ for (k, v) in chunk {
31
+ values[i] = handle.into_value(k);
32
+ values[i + 1] = handle.into_value(v);
33
+ i += 2;
34
+ }
35
+ hash.bulk_insert(&values[..i]).unwrap();
36
+
37
+ // Zero out used values
38
+ values[..i].fill(handle.qnil().as_value());
39
+ i = 0;
40
+ }
41
+
25
42
  hash.into_value_with(handle)
26
43
  }
27
44
  }
@@ -31,7 +48,7 @@ impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
31
48
  #[derive(Debug, Clone)]
32
49
  pub struct CowValue<'a>(pub Cow<'a, str>);
33
50
 
34
- impl<'a> IntoValue for CowValue<'a> {
51
+ impl IntoValue for CowValue<'_> {
35
52
  fn into_value_with(self, handle: &Ruby) -> Value {
36
53
  self.0.into_value_with(handle)
37
54
  }
@@ -1,6 +1,8 @@
1
- use super::header_cache::StringCache;
1
+ use super::header_cache::StringCacheKey;
2
2
  use super::parser::RecordParser;
3
+ use super::{header_cache::StringCache, ruby_reader::SeekableRead};
3
4
  use magnus::{Error, Ruby};
5
+ use std::io::BufReader;
4
6
  use std::{borrow::Cow, io::Read, thread};
5
7
 
6
8
  pub(crate) const READ_BUFFER_SIZE: usize = 16384;
@@ -9,16 +11,17 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
9
11
  inner: ReaderImpl<'a, T>,
10
12
  }
11
13
 
14
+ #[allow(clippy::large_enum_variant)]
12
15
  enum ReaderImpl<'a, T: RecordParser<'a>> {
13
16
  SingleThreaded {
14
- reader: csv::Reader<Box<dyn Read + 'a>>,
15
- headers: Vec<&'static str>,
17
+ reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
18
+ headers: Vec<StringCacheKey>,
16
19
  null_string: Option<String>,
17
20
  flexible_default: Option<Cow<'a, str>>,
18
21
  string_record: csv::StringRecord,
19
22
  },
20
23
  MultiThreaded {
21
- headers: Vec<&'static str>,
24
+ headers: Vec<StringCacheKey>,
22
25
  receiver: kanal::Receiver<T::Output>,
23
26
  handle: Option<thread::JoinHandle<()>>,
24
27
  },
@@ -48,8 +51,8 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
48
51
  }
49
52
 
50
53
  pub(crate) fn new_single_threaded(
51
- reader: csv::Reader<Box<dyn Read + 'a>>,
52
- headers: Vec<&'static str>,
54
+ reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
55
+ headers: Vec<StringCacheKey>,
53
56
  null_string: Option<String>,
54
57
  flexible_default: Option<&'a str>,
55
58
  ) -> Self {
@@ -59,7 +62,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
59
62
  reader,
60
63
  headers,
61
64
  null_string,
62
- flexible_default: flexible_default.map(|s| Cow::Borrowed(s)),
65
+ flexible_default: flexible_default.map(Cow::Borrowed),
63
66
  string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
64
67
  },
65
68
  }
@@ -69,11 +72,10 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
69
72
  impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
70
73
  pub(crate) fn new_multi_threaded(
71
74
  mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
72
- headers: Vec<&'static str>,
75
+ headers: Vec<StringCacheKey>,
73
76
  buffer_size: usize,
74
77
  null_string: Option<String>,
75
78
  flexible_default: Option<&'static str>,
76
- should_forget: bool,
77
79
  ) -> Self {
78
80
  let (sender, receiver) = kanal::bounded(buffer_size);
79
81
  let headers_for_thread = headers.clone();
@@ -86,16 +88,12 @@ impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
86
88
  &headers_for_thread,
87
89
  &record,
88
90
  null_string.as_deref(),
89
- flexible_default.map(|s| Cow::Borrowed(s)),
91
+ flexible_default.map(Cow::Borrowed),
90
92
  );
91
93
  if sender.send(row).is_err() {
92
94
  break;
93
95
  }
94
96
  }
95
- if should_forget {
96
- let file_to_forget = reader.into_inner();
97
- std::mem::forget(file_to_forget);
98
- }
99
97
  });
100
98
 
101
99
  Self {
@@ -134,7 +132,7 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
134
132
  } => match reader.read_record(string_record) {
135
133
  Ok(true) => Some(T::parse(
136
134
  headers,
137
- &string_record,
135
+ string_record,
138
136
  null_string.as_deref(),
139
137
  flexible_default.clone(),
140
138
  )),
@@ -165,10 +163,10 @@ impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
165
163
  if let Some(handle) = handle.take() {
166
164
  let _ = handle.join();
167
165
  }
168
- let _ = StringCache::clear(headers);
166
+ let _ = StringCache::clear(&headers);
169
167
  }
170
168
  ReaderImpl::SingleThreaded { headers, .. } => {
171
- let _ = StringCache::clear(headers);
169
+ let _ = StringCache::clear(&headers);
172
170
  }
173
171
  }
174
172
  }
@@ -0,0 +1,30 @@
1
+ use std::{fs::File, io, mem::ManuallyDrop};
2
+
3
+ pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
4
+
5
+ impl std::io::Read for ForgottenFileHandle {
6
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
7
+ self.0.read(buf)
8
+ }
9
+
10
+ fn read_vectored(&mut self, bufs: &mut [std::io::IoSliceMut<'_>]) -> io::Result<usize> {
11
+ self.0.read_vectored(bufs)
12
+ }
13
+
14
+ // fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
15
+ // self.0.read_buf(cursor)
16
+ // }
17
+
18
+ // #[inline]
19
+ // fn is_read_vectored(&self) -> bool {
20
+ // self.0.is_read_vectored()
21
+ // }
22
+
23
+ fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
24
+ self.0.read_to_end(buf)
25
+ }
26
+
27
+ fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
28
+ self.0.read_to_string(buf)
29
+ }
30
+ }
@@ -1,43 +1,89 @@
1
- use super::READ_BUFFER_SIZE;
2
1
  use magnus::{
3
2
  value::{Opaque, ReprValue},
4
3
  RClass, RString, Ruby, Value,
5
4
  };
6
- use std::io::{self, Read};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
7
6
  use std::sync::OnceLock;
8
7
 
9
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
10
9
 
11
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
12
11
  /// and provide a standard Read implementation for them.
13
- pub struct RubyReader<'a, T> {
14
- #[allow(unused)]
15
- ruby: &'a Ruby,
12
+ pub struct RubyReader<T> {
16
13
  inner: T,
17
- buffer: Option<Vec<u8>>,
18
14
  offset: usize,
19
- // Number of bytes that have been read into the buffer
20
- // Used as an upper bound for offset
21
- buffered_bytes: usize,
22
15
  }
23
16
 
24
- pub fn build_ruby_reader<'a>(
25
- ruby: &'a Ruby,
17
+ pub trait SeekableRead: std::io::Read + Seek {}
18
+ impl SeekableRead for RubyReader<Value> {}
19
+ impl SeekableRead for RubyReader<RString> {}
20
+
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
26
23
  input: Value,
27
- ) -> Result<Box<dyn Read + 'a>, magnus::Error> {
24
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
28
25
  if RubyReader::is_string_io(ruby, &input) {
29
26
  RubyReader::from_string_io(ruby, input)
30
27
  } else if RubyReader::is_io_like(&input) {
31
- RubyReader::from_io(ruby, input)
28
+ RubyReader::from_io(input)
32
29
  } else {
33
- RubyReader::from_string_like(ruby, input)
30
+ RubyReader::from_string_like(input)
31
+ }
32
+ }
33
+
34
+ impl Seek for RubyReader<Value> {
35
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
40
+ };
41
+
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
50
+
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
54
+
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
59
+
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
69
+ }
70
+ }
71
+
72
+ impl Seek for RubyReader<RString> {
73
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
74
+ match pos {
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
78
+ }
79
+ Ok(self.offset as u64)
34
80
  }
35
81
  }
36
82
 
37
- impl<'a> RubyReader<'a, Value> {
38
- fn from_io(ruby: &'a Ruby, input: Value) -> Result<Box<dyn Read + 'a>, magnus::Error> {
83
+ impl RubyReader<Value> {
84
+ fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
39
85
  if Self::is_io_like(&input) {
40
- Ok(Box::new(Self::from_io_like(ruby, input)))
86
+ Ok(Box::new(Self::from_io_like(input)))
41
87
  } else {
42
88
  Err(magnus::Error::new(
43
89
  magnus::exception::type_error(),
@@ -50,70 +96,19 @@ impl<'a> RubyReader<'a, Value> {
50
96
  input.respond_to("read", false).unwrap_or(false)
51
97
  }
52
98
 
53
- fn from_io_like(ruby: &'a Ruby, input: Value) -> Self {
99
+ fn from_io_like(input: Value) -> Self {
54
100
  Self {
55
- ruby,
56
101
  inner: input,
57
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
58
102
  offset: 0,
59
- buffered_bytes: 0,
60
103
  }
61
104
  }
62
-
63
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
64
- if let Some(from_buf) = &self.buffer {
65
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
66
- if self.offset < self.buffered_bytes {
67
- let remaining = self.buffered_bytes - self.offset;
68
- let copy_size = remaining.min(to_buf.len());
69
- to_buf[..copy_size]
70
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
71
- self.offset += copy_size;
72
- Some(Ok(copy_size))
73
- } else {
74
- None
75
- }
76
- } else {
77
- None
78
- }
79
- }
80
-
81
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
82
- let buffer = self.buffer.as_mut().unwrap();
83
- let result = self
84
- .inner
85
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
86
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
87
-
88
- if result.is_nil() {
89
- return Ok(0); // EOF
90
- }
91
-
92
- let bytes = unsafe { result.as_slice() };
93
-
94
- // Update internal buffer
95
- let bytes_len = bytes.len();
96
- if bytes_len == 0 {
97
- return Ok(0);
98
- }
99
-
100
- // Only copy what we actually read
101
- buffer[..bytes_len].copy_from_slice(bytes);
102
- self.buffered_bytes = bytes_len;
103
-
104
- // Copy to output buffer
105
- let copy_size = bytes_len.min(buf.len());
106
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
107
- self.offset = copy_size;
108
- Ok(copy_size)
109
- }
110
105
  }
111
106
 
112
- impl<'a> RubyReader<'a, RString> {
107
+ impl RubyReader<RString> {
113
108
  pub fn from_string_io(
114
- ruby: &'a Ruby,
109
+ ruby: &Ruby,
115
110
  input: Value,
116
- ) -> Result<Box<dyn Read + 'a>, magnus::Error> {
111
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
117
112
  if !Self::is_string_io(ruby, &input) {
118
113
  return Err(magnus::Error::new(
119
114
  magnus::exception::type_error(),
@@ -123,11 +118,8 @@ impl<'a> RubyReader<'a, RString> {
123
118
 
124
119
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
125
120
  Ok(Box::new(Self {
126
- ruby,
127
121
  inner: string_content,
128
- buffer: None,
129
122
  offset: 0,
130
- buffered_bytes: 0,
131
123
  }))
132
124
  }
133
125
 
@@ -139,33 +131,32 @@ impl<'a> RubyReader<'a, RString> {
139
131
  input.is_kind_of(ruby.get_inner(*string_io_class))
140
132
  }
141
133
 
142
- fn from_string_like(ruby: &'a Ruby, input: Value) -> Result<Box<dyn Read + 'a>, magnus::Error> {
134
+ fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
143
135
  // Try calling `to_str`, and if that fails, try `to_s`
144
136
  let string_content = input
145
137
  .funcall::<_, _, RString>("to_str", ())
146
138
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
147
139
  Ok(Box::new(Self {
148
- ruby,
149
140
  inner: string_content,
150
- buffer: None,
151
141
  offset: 0,
152
- buffered_bytes: 0,
153
142
  }))
154
143
  }
155
144
  }
156
145
 
157
- impl<'a> Read for RubyReader<'a, Value> {
158
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
159
- if let Some(result) = self.read_from_buffer(buf) {
160
- result
161
- } else {
162
- // If the buffer is empty, read from Ruby
163
- self.read_from_ruby(buf)
164
- }
146
+ impl Read for RubyReader<Value> {
147
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
148
+ let bytes = self
149
+ .inner
150
+ .funcall::<_, _, RString>("read", (buf.len(),))
151
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
152
+
153
+ buf.write_all(unsafe { bytes.as_slice() })?;
154
+
155
+ Ok(bytes.len())
165
156
  }
166
157
  }
167
158
 
168
- impl<'a> Read for RubyReader<'a, RString> {
159
+ impl Read for RubyReader<RString> {
169
160
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
170
161
  let string_buffer = unsafe { self.inner.as_slice() };
171
162
  if self.offset >= string_buffer.len() {
data/ext/osv/src/lib.rs CHANGED
@@ -1,3 +1,4 @@
1
+ mod allocator;
1
2
  mod csv;
2
3
  mod reader;
3
4
  mod utils;
@@ -1,19 +1,19 @@
1
- use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder};
1
+ use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder, StringCacheKey};
2
2
  use crate::utils::*;
3
+ use ahash::RandomState;
3
4
  use csv::Trim;
4
5
  use magnus::value::ReprValue;
5
6
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
6
7
  use std::collections::HashMap;
7
- use xxhash_rust::xxh3::Xxh3Builder;
8
8
 
9
- pub fn parse_csv<'a>(
9
+ pub fn parse_csv(
10
10
  rb_self: Value,
11
11
  args: &[Value],
12
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, Xxh3Builder>>>>, Error> {
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
13
13
  let original = unsafe { Ruby::get_unchecked() };
14
14
  let ruby: &'static Ruby = Box::leak(Box::new(original));
15
15
 
16
- let CsvArgs {
16
+ let ReadCsvArgs {
17
17
  to_read,
18
18
  has_headers,
19
19
  delimiter,
@@ -24,7 +24,7 @@ pub fn parse_csv<'a>(
24
24
  flexible,
25
25
  flexible_default,
26
26
  trim,
27
- } = parse_csv_args(&ruby, args)?;
27
+ } = parse_read_csv_args(ruby, args)?;
28
28
 
29
29
  let flexible_default: &'static Option<String> = Box::leak(Box::new(flexible_default));
30
30
  let leaked_flexible_default: &'static Option<&str> =
@@ -51,11 +51,11 @@ pub fn parse_csv<'a>(
51
51
  });
52
52
  }
53
53
 
54
- let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
54
+ let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type.as_str() {
55
55
  "hash" => {
56
56
  let builder = RecordReaderBuilder::<
57
- HashMap<&'static str, Option<CowValue<'static>>, Xxh3Builder>,
58
- >::new(&ruby, to_read)
57
+ HashMap<StringCacheKey, Option<CowValue<'static>>, RandomState>,
58
+ >::new(ruby, to_read)
59
59
  .has_headers(has_headers)
60
60
  .flexible(flexible)
61
61
  .flexible_default(flexible_default.as_deref())
@@ -68,7 +68,7 @@ pub fn parse_csv<'a>(
68
68
  Box::new(builder.build_threaded()?.map(CsvRecord::Map))
69
69
  }
70
70
  "array" => Box::new(
71
- RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(&ruby, to_read)
71
+ RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(ruby, to_read)
72
72
  .has_headers(has_headers)
73
73
  .flexible(flexible)
74
74
  .flexible_default(flexible_default.as_deref())
@@ -107,7 +107,7 @@ struct EnumeratorArgs {
107
107
 
108
108
  fn create_enumerator(
109
109
  args: EnumeratorArgs,
110
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, Xxh3Builder>>>>, Error> {
110
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
111
111
  let kwargs = RHash::new();
112
112
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
113
113
  kwargs.aset(
data/ext/osv/src/utils.rs CHANGED
@@ -13,12 +13,12 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
13
13
  RString::from_value(value)
14
14
  .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
15
15
  .to_string()
16
- .map(|s| Some(s))
16
+ .map(Some)
17
17
  } else if value.is_kind_of(ruby.class_symbol()) {
18
18
  Symbol::from_value(value)
19
19
  .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
20
20
  .funcall("to_s", ())
21
- .map(|s| Some(s))
21
+ .map(Some)
22
22
  } else {
23
23
  Err(Error::new(
24
24
  magnus::exception::type_error(),
@@ -28,7 +28,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
28
28
  }
29
29
 
30
30
  #[derive(Debug)]
31
- pub struct CsvArgs {
31
+ pub struct ReadCsvArgs {
32
32
  pub to_read: Value,
33
33
  pub has_headers: bool,
34
34
  pub delimiter: u8,
@@ -42,7 +42,7 @@ pub struct CsvArgs {
42
42
  }
43
43
 
44
44
  /// Parse common arguments for CSV parsing
45
- pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
45
+ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, Error> {
46
46
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
47
47
  let (to_read,) = parsed_args.required;
48
48
 
@@ -166,7 +166,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
166
166
  None => csv::Trim::None,
167
167
  };
168
168
 
169
- Ok(CsvArgs {
169
+ Ok(ReadCsvArgs {
170
170
  to_read,
171
171
  has_headers,
172
172
  delimiter,
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.16"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-02 00:00:00.000000000 Z
11
+ date: 2025-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -59,12 +59,14 @@ files:
59
59
  - Rakefile
60
60
  - ext/osv/Cargo.toml
61
61
  - ext/osv/extconf.rb
62
+ - ext/osv/src/allocator.rs
62
63
  - ext/osv/src/csv/builder.rs
63
64
  - ext/osv/src/csv/header_cache.rs
64
65
  - ext/osv/src/csv/mod.rs
65
66
  - ext/osv/src/csv/parser.rs
66
67
  - ext/osv/src/csv/record.rs
67
68
  - ext/osv/src/csv/record_reader.rs
69
+ - ext/osv/src/csv/ruby_integration.rs
68
70
  - ext/osv/src/csv/ruby_reader.rs
69
71
  - ext/osv/src/lib.rs
70
72
  - ext/osv/src/reader.rs