osv 0.3.14 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +122 -6
- data/README.md +27 -27
- data/ext/osv/Cargo.toml +8 -1
- data/ext/osv/src/allocator.rs +13 -0
- data/ext/osv/src/csv/builder.rs +14 -9
- data/ext/osv/src/csv/header_cache.rs +79 -26
- data/ext/osv/src/csv/mod.rs +3 -1
- data/ext/osv/src/csv/parser.rs +9 -8
- data/ext/osv/src/csv/record.rs +24 -7
- data/ext/osv/src/csv/record_reader.rs +15 -17
- data/ext/osv/src/csv/ruby_integration.rs +30 -0
- data/ext/osv/src/csv/ruby_reader.rs +79 -88
- data/ext/osv/src/lib.rs +1 -0
- data/ext/osv/src/reader.rs +11 -11
- data/ext/osv/src/utils.rs +5 -5
- data/lib/osv/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91401989a8532162a9731fed3cb07661c0676105f77465da23f9a267773e7651
|
4
|
+
data.tar.gz: aeba48f1338a4160044e8c7264f80eb065d950567288bded39acf5d9bc593d7b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d2ea3f724a6f7af317bb1ae865513c15f2ef0e475b070e7f9ae2e1b4155b2d82090387beb0c6a2e5cb8664b1f6dd0cf61e6ad9545957bc3ada1a3e87758b1ee
|
7
|
+
data.tar.gz: 0eaa86241092c14f4c2973d74e65877b7f3f87487a2681b9a094054f98db759772bcf012ec2f4fa073bd16f2b02927212b13afec484f84daf764d3b3e0811b6b
|
data/Cargo.lock
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# This file is automatically @generated by Cargo.
|
2
2
|
# It is not intended for manual editing.
|
3
|
-
version =
|
3
|
+
version = 4
|
4
4
|
|
5
5
|
[[package]]
|
6
6
|
name = "adler2"
|
@@ -8,6 +8,19 @@ version = "2.0.0"
|
|
8
8
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
9
|
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
10
10
|
|
11
|
+
[[package]]
|
12
|
+
name = "ahash"
|
13
|
+
version = "0.8.11"
|
14
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
15
|
+
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
16
|
+
dependencies = [
|
17
|
+
"cfg-if",
|
18
|
+
"getrandom",
|
19
|
+
"once_cell",
|
20
|
+
"version_check",
|
21
|
+
"zerocopy",
|
22
|
+
]
|
23
|
+
|
11
24
|
[[package]]
|
12
25
|
name = "aho-corasick"
|
13
26
|
version = "1.1.3"
|
@@ -32,7 +45,7 @@ dependencies = [
|
|
32
45
|
"bitflags",
|
33
46
|
"cexpr",
|
34
47
|
"clang-sys",
|
35
|
-
"itertools",
|
48
|
+
"itertools 0.12.1",
|
36
49
|
"lazy_static",
|
37
50
|
"lazycell",
|
38
51
|
"proc-macro2",
|
@@ -49,6 +62,15 @@ version = "2.6.0"
|
|
49
62
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
63
|
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
51
64
|
|
65
|
+
[[package]]
|
66
|
+
name = "cc"
|
67
|
+
version = "1.2.7"
|
68
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
69
|
+
checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7"
|
70
|
+
dependencies = [
|
71
|
+
"shlex",
|
72
|
+
]
|
73
|
+
|
52
74
|
[[package]]
|
53
75
|
name = "cexpr"
|
54
76
|
version = "0.6.0"
|
@@ -127,6 +149,17 @@ version = "0.3.31"
|
|
127
149
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
128
150
|
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
129
151
|
|
152
|
+
[[package]]
|
153
|
+
name = "getrandom"
|
154
|
+
version = "0.2.15"
|
155
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
156
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
157
|
+
dependencies = [
|
158
|
+
"cfg-if",
|
159
|
+
"libc",
|
160
|
+
"wasi",
|
161
|
+
]
|
162
|
+
|
130
163
|
[[package]]
|
131
164
|
name = "glob"
|
132
165
|
version = "0.3.1"
|
@@ -142,12 +175,41 @@ dependencies = [
|
|
142
175
|
"either",
|
143
176
|
]
|
144
177
|
|
178
|
+
[[package]]
|
179
|
+
name = "itertools"
|
180
|
+
version = "0.14.0"
|
181
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
182
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
183
|
+
dependencies = [
|
184
|
+
"either",
|
185
|
+
]
|
186
|
+
|
145
187
|
[[package]]
|
146
188
|
name = "itoa"
|
147
189
|
version = "1.0.14"
|
148
190
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
149
191
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
150
192
|
|
193
|
+
[[package]]
|
194
|
+
name = "jemalloc-sys"
|
195
|
+
version = "0.5.4+5.3.0-patched"
|
196
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
197
|
+
checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
|
198
|
+
dependencies = [
|
199
|
+
"cc",
|
200
|
+
"libc",
|
201
|
+
]
|
202
|
+
|
203
|
+
[[package]]
|
204
|
+
name = "jemallocator"
|
205
|
+
version = "0.5.4"
|
206
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
207
|
+
checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
|
208
|
+
dependencies = [
|
209
|
+
"jemalloc-sys",
|
210
|
+
"libc",
|
211
|
+
]
|
212
|
+
|
151
213
|
[[package]]
|
152
214
|
name = "kanal"
|
153
215
|
version = "0.1.0-pre8"
|
@@ -186,6 +248,16 @@ dependencies = [
|
|
186
248
|
"windows-targets",
|
187
249
|
]
|
188
250
|
|
251
|
+
[[package]]
|
252
|
+
name = "libmimalloc-sys"
|
253
|
+
version = "0.1.39"
|
254
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
255
|
+
checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
|
256
|
+
dependencies = [
|
257
|
+
"cc",
|
258
|
+
"libc",
|
259
|
+
]
|
260
|
+
|
189
261
|
[[package]]
|
190
262
|
name = "lock_api"
|
191
263
|
version = "0.4.12"
|
@@ -237,6 +309,15 @@ version = "2.7.4"
|
|
237
309
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
238
310
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
239
311
|
|
312
|
+
[[package]]
|
313
|
+
name = "mimalloc"
|
314
|
+
version = "0.1.43"
|
315
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
316
|
+
checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
|
317
|
+
dependencies = [
|
318
|
+
"libmimalloc-sys",
|
319
|
+
]
|
320
|
+
|
240
321
|
[[package]]
|
241
322
|
name = "minimal-lexical"
|
242
323
|
version = "0.2.1"
|
@@ -262,19 +343,28 @@ dependencies = [
|
|
262
343
|
"minimal-lexical",
|
263
344
|
]
|
264
345
|
|
346
|
+
[[package]]
|
347
|
+
name = "once_cell"
|
348
|
+
version = "1.20.2"
|
349
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
350
|
+
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
351
|
+
|
265
352
|
[[package]]
|
266
353
|
name = "osv"
|
267
354
|
version = "0.1.0"
|
268
355
|
dependencies = [
|
356
|
+
"ahash",
|
269
357
|
"csv",
|
270
358
|
"flate2",
|
359
|
+
"itertools 0.14.0",
|
360
|
+
"jemallocator",
|
271
361
|
"kanal",
|
272
362
|
"magnus 0.7.1",
|
363
|
+
"mimalloc",
|
273
364
|
"rb-sys",
|
274
365
|
"serde",
|
275
366
|
"serde_magnus",
|
276
367
|
"thiserror",
|
277
|
-
"xxhash-rust",
|
278
368
|
]
|
279
369
|
|
280
370
|
[[package]]
|
@@ -464,6 +554,18 @@ version = "1.0.14"
|
|
464
554
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
465
555
|
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
466
556
|
|
557
|
+
[[package]]
|
558
|
+
name = "version_check"
|
559
|
+
version = "0.9.5"
|
560
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
561
|
+
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
562
|
+
|
563
|
+
[[package]]
|
564
|
+
name = "wasi"
|
565
|
+
version = "0.11.0+wasi-snapshot-preview1"
|
566
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
+
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
568
|
+
|
467
569
|
[[package]]
|
468
570
|
name = "windows-targets"
|
469
571
|
version = "0.52.6"
|
@@ -529,7 +631,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
529
631
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
530
632
|
|
531
633
|
[[package]]
|
532
|
-
name = "
|
533
|
-
version = "0.
|
634
|
+
name = "zerocopy"
|
635
|
+
version = "0.7.35"
|
534
636
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
535
|
-
checksum = "
|
637
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
638
|
+
dependencies = [
|
639
|
+
"zerocopy-derive",
|
640
|
+
]
|
641
|
+
|
642
|
+
[[package]]
|
643
|
+
name = "zerocopy-derive"
|
644
|
+
version = "0.7.35"
|
645
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
646
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
647
|
+
dependencies = [
|
648
|
+
"proc-macro2",
|
649
|
+
"quote",
|
650
|
+
"syn",
|
651
|
+
]
|
data/README.md
CHANGED
@@ -118,7 +118,7 @@ This library is faster than the standard Ruby CSV library. It's also faster than
|
|
118
118
|
|
119
119
|
Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
|
120
120
|
|
121
|
-
### 1,000,000
|
121
|
+
### 1,000,000 records
|
122
122
|
|
123
123
|
```
|
124
124
|
🏃 Running benchmarks...
|
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
|
|
142
142
|
FastCSV - Gzipped 1.000 i/100ms
|
143
143
|
CSV - Gzipped 1.000 i/100ms
|
144
144
|
Calculating -------------------------------------
|
145
|
-
CSV - StringIO 0.
|
146
|
-
FastCSV - StringIO 0.
|
147
|
-
OSV - StringIO 0.
|
148
|
-
CSV - Hash output 0.
|
149
|
-
OSV - Hash output 0.
|
150
|
-
CSV - Array output 0.
|
151
|
-
OSV - Array output 0.
|
145
|
+
CSV - StringIO 0.080 (± 0.0%) i/s (12.43 s/i) - 3.000 in 37.301114s
|
146
|
+
FastCSV - StringIO 0.368 (± 0.0%) i/s (2.72 s/i) - 12.000 in 32.619020s
|
147
|
+
OSV - StringIO 0.699 (± 0.0%) i/s (1.43 s/i) - 21.000 in 30.091225s
|
148
|
+
CSV - Hash output 0.059 (± 0.0%) i/s (16.95 s/i) - 2.000 in 33.908533s
|
149
|
+
OSV - Hash output 0.329 (± 0.0%) i/s (3.04 s/i) - 10.000 in 30.551275s
|
150
|
+
CSV - Array output 0.066 (± 0.0%) i/s (15.18 s/i) - 2.000 in 30.357327s
|
151
|
+
OSV - Array output 0.632 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.150113s
|
152
152
|
FastCSV - Array output
|
153
|
-
0.
|
153
|
+
0.350 (± 0.0%) i/s (2.86 s/i) - 11.000 in 31.477268s
|
154
154
|
OSV - Direct Open Array output
|
155
|
-
0.
|
156
|
-
OSV - Gzipped 0.
|
157
|
-
OSV - Gzipped Direct 0.
|
158
|
-
FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.
|
159
|
-
CSV - Gzipped 0.
|
155
|
+
0.641 (± 0.0%) i/s (1.56 s/i) - 20.000 in 31.275201s
|
156
|
+
OSV - Gzipped 0.530 (± 0.0%) i/s (1.89 s/i) - 16.000 in 30.183753s
|
157
|
+
OSV - Gzipped Direct 0.727 (± 0.0%) i/s (1.37 s/i) - 22.000 in 30.283991s
|
158
|
+
FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.949600s
|
159
|
+
CSV - Gzipped 0.056 (± 0.0%) i/s (17.72 s/i) - 2.000 in 35.440473s
|
160
160
|
|
161
161
|
Comparison:
|
162
|
-
OSV - Gzipped Direct: 0.
|
163
|
-
OSV -
|
164
|
-
|
165
|
-
|
166
|
-
OSV - Gzipped: 0.
|
167
|
-
FastCSV - StringIO: 0.4 i/s - 1.
|
168
|
-
FastCSV - Array output: 0.
|
169
|
-
|
170
|
-
|
171
|
-
CSV - StringIO: 0.1 i/s -
|
172
|
-
CSV - Array output: 0.1 i/s -
|
173
|
-
CSV - Hash output: 0.1 i/s -
|
174
|
-
CSV - Gzipped: 0.1 i/s -
|
162
|
+
OSV - Gzipped Direct: 0.7 i/s
|
163
|
+
OSV - StringIO: 0.7 i/s - 1.04x slower
|
164
|
+
OSV - Direct Open Array output: 0.6 i/s - 1.14x slower
|
165
|
+
OSV - Array output: 0.6 i/s - 1.15x slower
|
166
|
+
OSV - Gzipped: 0.5 i/s - 1.37x slower
|
167
|
+
FastCSV - StringIO: 0.4 i/s - 1.98x slower
|
168
|
+
FastCSV - Array output: 0.3 i/s - 2.08x slower
|
169
|
+
OSV - Hash output: 0.3 i/s - 2.21x slower
|
170
|
+
FastCSV - Gzipped: 0.3 i/s - 2.25x slower
|
171
|
+
CSV - StringIO: 0.1 i/s - 9.04x slower
|
172
|
+
CSV - Array output: 0.1 i/s - 11.04x slower
|
173
|
+
CSV - Hash output: 0.1 i/s - 12.33x slower
|
174
|
+
CSV - Gzipped: 0.1 i/s - 12.89x slower
|
175
175
|
```
|
data/ext/osv/Cargo.toml
CHANGED
@@ -7,6 +7,7 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
+
ahash = "0.8"
|
10
11
|
csv = "^1.3"
|
11
12
|
flate2 = "1.0.35"
|
12
13
|
kanal = "0.1.0-pre8"
|
@@ -15,4 +16,10 @@ rb-sys = "^0.9"
|
|
15
16
|
serde = { version = "1.0", features = ["derive"] }
|
16
17
|
serde_magnus = "0.8.1"
|
17
18
|
thiserror = "2.0"
|
18
|
-
|
19
|
+
itertools = "^0.14"
|
20
|
+
|
21
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
22
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
23
|
+
|
24
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
25
|
+
mimalloc = { version = "0.1", default-features = false }
|
data/ext/osv/src/allocator.rs
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#[cfg(target_os = "linux")]
|
2
|
+
use jemallocator::Jemalloc;
|
3
|
+
|
4
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
5
|
+
use mimalloc::MiMalloc;
|
6
|
+
|
7
|
+
#[global_allocator]
|
8
|
+
#[cfg(target_os = "linux")]
|
9
|
+
static ALLOC: Jemalloc = Jemalloc;
|
10
|
+
|
11
|
+
#[global_allocator]
|
12
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
13
|
+
static ALLOC: MiMalloc = MiMalloc;
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -2,7 +2,8 @@ use super::{
|
|
2
2
|
header_cache::{CacheError, StringCache},
|
3
3
|
parser::RecordParser,
|
4
4
|
record_reader::{RecordReader, READ_BUFFER_SIZE},
|
5
|
-
ruby_reader::build_ruby_reader,
|
5
|
+
ruby_reader::{build_ruby_reader, SeekableRead},
|
6
|
+
ForgottenFileHandle,
|
6
7
|
};
|
7
8
|
use flate2::read::GzDecoder;
|
8
9
|
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
|
@@ -10,6 +11,7 @@ use std::{
|
|
10
11
|
fs::File,
|
11
12
|
io::{self, BufReader, Read},
|
12
13
|
marker::PhantomData,
|
14
|
+
mem::ManuallyDrop,
|
13
15
|
os::fd::FromRawFd,
|
14
16
|
};
|
15
17
|
|
@@ -64,7 +66,6 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
|
|
64
66
|
fn build_multi_threaded(
|
65
67
|
self,
|
66
68
|
readable: Box<dyn Read + Send + 'static>,
|
67
|
-
should_forget: bool,
|
68
69
|
) -> Result<RecordReader<'static, T>, ReaderError> {
|
69
70
|
let flexible = self.flexible || self.flexible_default.is_some();
|
70
71
|
let mut reader = csv::ReaderBuilder::new()
|
@@ -84,21 +85,20 @@ impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T>
|
|
84
85
|
self.buffer,
|
85
86
|
self.null_string,
|
86
87
|
self.flexible_default,
|
87
|
-
should_forget,
|
88
88
|
))
|
89
89
|
}
|
90
90
|
|
91
91
|
pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
|
92
92
|
if self.to_read.is_kind_of(self.ruby.class_io()) {
|
93
93
|
let readable = self.handle_file_descriptor()?;
|
94
|
-
self.build_multi_threaded(readable
|
94
|
+
self.build_multi_threaded(readable)
|
95
95
|
} else if self.to_read.is_kind_of(self.ruby.class_string()) {
|
96
96
|
let readable = self.handle_file_path()?;
|
97
|
-
self.build_multi_threaded(readable
|
97
|
+
self.build_multi_threaded(readable)
|
98
98
|
} else {
|
99
99
|
let readable = build_ruby_reader(self.ruby, self.to_read)?;
|
100
|
-
|
101
|
-
self.build_single_threaded(
|
100
|
+
let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
|
101
|
+
self.build_single_threaded(buffered_reader)
|
102
102
|
}
|
103
103
|
}
|
104
104
|
}
|
@@ -172,7 +172,11 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
|
172
172
|
}
|
173
173
|
|
174
174
|
let file = unsafe { File::from_raw_fd(fd) };
|
175
|
-
|
175
|
+
let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
|
176
|
+
Ok(Box::new(BufReader::with_capacity(
|
177
|
+
READ_BUFFER_SIZE,
|
178
|
+
forgotten,
|
179
|
+
)))
|
176
180
|
}
|
177
181
|
|
178
182
|
fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
@@ -191,9 +195,10 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
|
191
195
|
|
192
196
|
fn build_single_threaded(
|
193
197
|
self,
|
194
|
-
readable: Box<dyn
|
198
|
+
readable: BufReader<Box<dyn SeekableRead>>,
|
195
199
|
) -> Result<RecordReader<'a, T>, ReaderError> {
|
196
200
|
let flexible = self.flexible || self.flexible_default.is_some();
|
201
|
+
|
197
202
|
let mut reader = csv::ReaderBuilder::new()
|
198
203
|
.has_headers(self.has_headers)
|
199
204
|
.delimiter(self.delimiter)
|
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
1
2
|
/// This module exists to avoid cloning header keys in returned HashMaps.
|
2
3
|
/// Since the underlying RString creation already involves cloning,
|
3
4
|
/// this caching layer aims to reduce redundant allocations.
|
@@ -6,7 +7,7 @@
|
|
6
7
|
/// so this optimization could be removed if any issues arise.
|
7
8
|
use std::{
|
8
9
|
collections::HashMap,
|
9
|
-
sync::{atomic::AtomicU32, LazyLock, Mutex},
|
10
|
+
sync::{atomic::AtomicU32, atomic::Ordering, LazyLock, Mutex},
|
10
11
|
};
|
11
12
|
use thiserror::Error;
|
12
13
|
|
@@ -16,64 +17,116 @@ pub enum CacheError {
|
|
16
17
|
LockError(String),
|
17
18
|
}
|
18
19
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
20
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
21
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
22
|
|
22
23
|
pub struct StringCache;
|
23
24
|
|
25
|
+
#[derive(Copy, Clone)]
|
26
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
27
|
+
|
28
|
+
impl StringCacheKey {
|
29
|
+
pub fn new(string: &str) -> Self {
|
30
|
+
let rstr = RString::new(string);
|
31
|
+
let fstr = rstr.to_interned_str();
|
32
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
impl AsRef<str> for StringCacheKey {
|
37
|
+
fn as_ref(&self) -> &'static str {
|
38
|
+
self.1
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
impl IntoValue for StringCacheKey {
|
43
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
44
|
+
handle.into_value(self.0)
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
impl std::fmt::Debug for StringCacheKey {
|
49
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
50
|
+
self.1.fmt(f)
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
impl PartialEq for StringCacheKey {
|
55
|
+
fn eq(&self, other: &Self) -> bool {
|
56
|
+
self.1 == other.1
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
impl std::cmp::Eq for StringCacheKey {}
|
61
|
+
|
62
|
+
impl std::hash::Hash for StringCacheKey {
|
63
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
64
|
+
self.1.hash(state);
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
24
68
|
impl StringCache {
|
25
69
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
70
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
71
|
let mut cache = STRING_CACHE
|
28
72
|
.lock()
|
29
73
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
74
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
75
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
76
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
77
|
+
Ok(*interned_string)
|
34
78
|
} else {
|
79
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
80
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
81
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
82
|
+
Ok(interned)
|
38
83
|
}
|
39
84
|
}
|
40
85
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
86
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
87
|
let mut cache = STRING_CACHE
|
43
88
|
.lock()
|
44
89
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
90
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
91
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
92
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
93
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
94
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
95
|
+
result.push(*interned_string);
|
51
96
|
} else {
|
97
|
+
let interned = StringCacheKey::new(&string);
|
52
98
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
99
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
100
|
+
result.push(interned);
|
55
101
|
}
|
56
102
|
}
|
57
103
|
Ok(result)
|
58
104
|
}
|
59
105
|
|
60
|
-
pub fn clear(headers: &[
|
106
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
107
|
let mut cache = STRING_CACHE
|
62
108
|
.lock()
|
63
109
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
110
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
111
|
+
let to_remove: Vec<_> = headers
|
112
|
+
.iter()
|
113
|
+
.filter_map(|header| {
|
114
|
+
let key = header.as_ref();
|
115
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
116
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
117
|
+
if prev_count == 1 {
|
118
|
+
Some(key)
|
119
|
+
} else {
|
120
|
+
None
|
74
121
|
}
|
122
|
+
} else {
|
123
|
+
None
|
75
124
|
}
|
76
|
-
}
|
125
|
+
})
|
126
|
+
.collect();
|
127
|
+
|
128
|
+
for key in to_remove {
|
129
|
+
cache.remove(key);
|
77
130
|
}
|
78
131
|
|
79
132
|
Ok(())
|
data/ext/osv/src/csv/mod.rs
CHANGED
@@ -3,10 +3,12 @@ mod header_cache;
|
|
3
3
|
mod parser;
|
4
4
|
mod record;
|
5
5
|
mod record_reader;
|
6
|
+
mod ruby_integration;
|
6
7
|
mod ruby_reader;
|
7
8
|
|
8
9
|
pub use builder::RecordReaderBuilder;
|
9
10
|
pub(crate) use builder::BUFFER_CHANNEL_SIZE;
|
11
|
+
pub use header_cache::StringCacheKey;
|
10
12
|
pub use record::CowValue;
|
11
13
|
pub use record::CsvRecord;
|
12
|
-
pub
|
14
|
+
pub use ruby_integration::*;
|
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -2,13 +2,14 @@ use std::borrow::Cow;
|
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::hash::BuildHasher;
|
4
4
|
|
5
|
+
use super::header_cache::StringCacheKey;
|
5
6
|
use super::CowValue;
|
6
7
|
|
7
8
|
pub trait RecordParser<'a> {
|
8
9
|
type Output: 'a;
|
9
10
|
|
10
11
|
fn parse(
|
11
|
-
headers: &[
|
12
|
+
headers: &[StringCacheKey],
|
12
13
|
record: &csv::StringRecord,
|
13
14
|
null_string: Option<&str>,
|
14
15
|
flexible_default: Option<Cow<'a, str>>,
|
@@ -16,13 +17,13 @@ pub trait RecordParser<'a> {
|
|
16
17
|
}
|
17
18
|
|
18
19
|
impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
|
19
|
-
for HashMap
|
20
|
+
for HashMap<StringCacheKey, Option<CowValue<'a>>, S>
|
20
21
|
{
|
21
22
|
type Output = Self;
|
22
23
|
|
23
24
|
#[inline]
|
24
25
|
fn parse(
|
25
|
-
headers: &[
|
26
|
+
headers: &[StringCacheKey],
|
26
27
|
record: &csv::StringRecord,
|
27
28
|
null_string: Option<&str>,
|
28
29
|
flexible_default: Option<Cow<'a, str>>,
|
@@ -30,8 +31,8 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
|
|
30
31
|
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
31
32
|
|
32
33
|
let shared_empty = Cow::Borrowed("");
|
33
|
-
let shared_default = flexible_default.map(
|
34
|
-
headers.iter().enumerate().for_each(|(i,
|
34
|
+
let shared_default = flexible_default.map(CowValue);
|
35
|
+
headers.iter().enumerate().for_each(|(i, ref header)| {
|
35
36
|
let value = record.get(i).map_or_else(
|
36
37
|
|| shared_default.clone(),
|
37
38
|
|field| {
|
@@ -44,7 +45,7 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
|
|
44
45
|
}
|
45
46
|
},
|
46
47
|
);
|
47
|
-
map.insert(header, value);
|
48
|
+
map.insert((*header).clone(), value);
|
48
49
|
});
|
49
50
|
map
|
50
51
|
}
|
@@ -55,7 +56,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
|
|
55
56
|
|
56
57
|
#[inline]
|
57
58
|
fn parse(
|
58
|
-
headers: &[
|
59
|
+
headers: &[StringCacheKey],
|
59
60
|
record: &csv::StringRecord,
|
60
61
|
null_string: Option<&str>,
|
61
62
|
flexible_default: Option<Cow<'a, str>>,
|
@@ -64,7 +65,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
|
|
64
65
|
let mut vec = Vec::with_capacity(target_len);
|
65
66
|
|
66
67
|
let shared_empty = Cow::Borrowed("");
|
67
|
-
let shared_default = flexible_default.map(
|
68
|
+
let shared_default = flexible_default.map(CowValue);
|
68
69
|
|
69
70
|
for field in record.iter() {
|
70
71
|
let value = if Some(field) == null_string {
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,16 @@
|
|
1
|
-
use
|
1
|
+
use itertools::Itertools;
|
2
|
+
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
2
3
|
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
3
4
|
|
5
|
+
use super::StringCacheKey;
|
6
|
+
|
4
7
|
#[derive(Debug)]
|
5
8
|
pub enum CsvRecord<'a, S: BuildHasher + Default> {
|
6
9
|
Vec(Vec<Option<CowValue<'a>>>),
|
7
|
-
Map(HashMap
|
10
|
+
Map(HashMap<StringCacheKey, Option<CowValue<'a>>, S>),
|
8
11
|
}
|
9
12
|
|
10
|
-
impl<
|
13
|
+
impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
11
14
|
#[inline]
|
12
15
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
13
16
|
match self {
|
@@ -19,9 +22,23 @@ impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
|
|
19
22
|
CsvRecord::Map(map) => {
|
20
23
|
// Pre-allocate the hash with the known size
|
21
24
|
let hash = handle.hash_new_capa(map.len());
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
|
26
|
+
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
27
|
+
let mut i = 0;
|
28
|
+
|
29
|
+
for chunk in &map.into_iter().chunks(128) {
|
30
|
+
for (k, v) in chunk {
|
31
|
+
values[i] = handle.into_value(k);
|
32
|
+
values[i + 1] = handle.into_value(v);
|
33
|
+
i += 2;
|
34
|
+
}
|
35
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
36
|
+
|
37
|
+
// Zero out used values
|
38
|
+
values[..i].fill(handle.qnil().as_value());
|
39
|
+
i = 0;
|
40
|
+
}
|
41
|
+
|
25
42
|
hash.into_value_with(handle)
|
26
43
|
}
|
27
44
|
}
|
@@ -31,7 +48,7 @@ impl<'a, S: BuildHasher + Default> IntoValue for CsvRecord<'a, S> {
|
|
31
48
|
#[derive(Debug, Clone)]
|
32
49
|
pub struct CowValue<'a>(pub Cow<'a, str>);
|
33
50
|
|
34
|
-
impl
|
51
|
+
impl IntoValue for CowValue<'_> {
|
35
52
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
36
53
|
self.0.into_value_with(handle)
|
37
54
|
}
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
use super::header_cache::
|
1
|
+
use super::header_cache::StringCacheKey;
|
2
2
|
use super::parser::RecordParser;
|
3
|
+
use super::{header_cache::StringCache, ruby_reader::SeekableRead};
|
3
4
|
use magnus::{Error, Ruby};
|
5
|
+
use std::io::BufReader;
|
4
6
|
use std::{borrow::Cow, io::Read, thread};
|
5
7
|
|
6
8
|
pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
@@ -9,16 +11,17 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
|
|
9
11
|
inner: ReaderImpl<'a, T>,
|
10
12
|
}
|
11
13
|
|
14
|
+
#[allow(clippy::large_enum_variant)]
|
12
15
|
enum ReaderImpl<'a, T: RecordParser<'a>> {
|
13
16
|
SingleThreaded {
|
14
|
-
reader: csv::Reader<Box<dyn
|
15
|
-
headers: Vec
|
17
|
+
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
18
|
+
headers: Vec<StringCacheKey>,
|
16
19
|
null_string: Option<String>,
|
17
20
|
flexible_default: Option<Cow<'a, str>>,
|
18
21
|
string_record: csv::StringRecord,
|
19
22
|
},
|
20
23
|
MultiThreaded {
|
21
|
-
headers: Vec
|
24
|
+
headers: Vec<StringCacheKey>,
|
22
25
|
receiver: kanal::Receiver<T::Output>,
|
23
26
|
handle: Option<thread::JoinHandle<()>>,
|
24
27
|
},
|
@@ -48,8 +51,8 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
48
51
|
}
|
49
52
|
|
50
53
|
pub(crate) fn new_single_threaded(
|
51
|
-
reader: csv::Reader<Box<dyn
|
52
|
-
headers: Vec
|
54
|
+
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
55
|
+
headers: Vec<StringCacheKey>,
|
53
56
|
null_string: Option<String>,
|
54
57
|
flexible_default: Option<&'a str>,
|
55
58
|
) -> Self {
|
@@ -59,7 +62,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
59
62
|
reader,
|
60
63
|
headers,
|
61
64
|
null_string,
|
62
|
-
flexible_default: flexible_default.map(
|
65
|
+
flexible_default: flexible_default.map(Cow::Borrowed),
|
63
66
|
string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
|
64
67
|
},
|
65
68
|
}
|
@@ -69,11 +72,10 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
69
72
|
impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
|
70
73
|
pub(crate) fn new_multi_threaded(
|
71
74
|
mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
|
72
|
-
headers: Vec
|
75
|
+
headers: Vec<StringCacheKey>,
|
73
76
|
buffer_size: usize,
|
74
77
|
null_string: Option<String>,
|
75
78
|
flexible_default: Option<&'static str>,
|
76
|
-
should_forget: bool,
|
77
79
|
) -> Self {
|
78
80
|
let (sender, receiver) = kanal::bounded(buffer_size);
|
79
81
|
let headers_for_thread = headers.clone();
|
@@ -86,16 +88,12 @@ impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
|
|
86
88
|
&headers_for_thread,
|
87
89
|
&record,
|
88
90
|
null_string.as_deref(),
|
89
|
-
flexible_default.map(
|
91
|
+
flexible_default.map(Cow::Borrowed),
|
90
92
|
);
|
91
93
|
if sender.send(row).is_err() {
|
92
94
|
break;
|
93
95
|
}
|
94
96
|
}
|
95
|
-
if should_forget {
|
96
|
-
let file_to_forget = reader.into_inner();
|
97
|
-
std::mem::forget(file_to_forget);
|
98
|
-
}
|
99
97
|
});
|
100
98
|
|
101
99
|
Self {
|
@@ -134,7 +132,7 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
|
|
134
132
|
} => match reader.read_record(string_record) {
|
135
133
|
Ok(true) => Some(T::parse(
|
136
134
|
headers,
|
137
|
-
|
135
|
+
string_record,
|
138
136
|
null_string.as_deref(),
|
139
137
|
flexible_default.clone(),
|
140
138
|
)),
|
@@ -165,10 +163,10 @@ impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
|
|
165
163
|
if let Some(handle) = handle.take() {
|
166
164
|
let _ = handle.join();
|
167
165
|
}
|
168
|
-
let _ = StringCache::clear(headers);
|
166
|
+
let _ = StringCache::clear(&headers);
|
169
167
|
}
|
170
168
|
ReaderImpl::SingleThreaded { headers, .. } => {
|
171
|
-
let _ = StringCache::clear(headers);
|
169
|
+
let _ = StringCache::clear(&headers);
|
172
170
|
}
|
173
171
|
}
|
174
172
|
}
|
data/ext/osv/src/csv/ruby_integration.rs
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
use std::{fs::File, io, mem::ManuallyDrop};
|
2
|
+
|
3
|
+
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
4
|
+
|
5
|
+
impl std::io::Read for ForgottenFileHandle {
|
6
|
+
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
7
|
+
self.0.read(buf)
|
8
|
+
}
|
9
|
+
|
10
|
+
fn read_vectored(&mut self, bufs: &mut [std::io::IoSliceMut<'_>]) -> io::Result<usize> {
|
11
|
+
self.0.read_vectored(bufs)
|
12
|
+
}
|
13
|
+
|
14
|
+
// fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
|
15
|
+
// self.0.read_buf(cursor)
|
16
|
+
// }
|
17
|
+
|
18
|
+
// #[inline]
|
19
|
+
// fn is_read_vectored(&self) -> bool {
|
20
|
+
// self.0.is_read_vectored()
|
21
|
+
// }
|
22
|
+
|
23
|
+
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
|
24
|
+
self.0.read_to_end(buf)
|
25
|
+
}
|
26
|
+
|
27
|
+
fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
|
28
|
+
self.0.read_to_string(buf)
|
29
|
+
}
|
30
|
+
}
|
data/ext/osv/src/csv/ruby_reader.rs
CHANGED
@@ -1,43 +1,89 @@
|
|
1
|
-
use super::READ_BUFFER_SIZE;
|
2
1
|
use magnus::{
|
3
2
|
value::{Opaque, ReprValue},
|
4
3
|
RClass, RString, Ruby, Value,
|
5
4
|
};
|
6
|
-
use std::io::{self, Read};
|
5
|
+
use std::io::{self, Read, Seek, SeekFrom, Write};
|
7
6
|
use std::sync::OnceLock;
|
8
7
|
|
9
8
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
10
9
|
|
11
10
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
12
11
|
/// and provide a standard Read implementation for them.
|
13
|
-
pub struct RubyReader<
|
14
|
-
#[allow(unused)]
|
15
|
-
ruby: &'a Ruby,
|
12
|
+
pub struct RubyReader<T> {
|
16
13
|
inner: T,
|
17
|
-
buffer: Option<Vec<u8>>,
|
18
14
|
offset: usize,
|
19
|
-
// Number of bytes that have been read into the buffer
|
20
|
-
// Used as an upper bound for offset
|
21
|
-
buffered_bytes: usize,
|
22
15
|
}
|
23
16
|
|
24
|
-
pub
|
25
|
-
|
17
|
+
pub trait SeekableRead: std::io::Read + Seek {}
|
18
|
+
impl SeekableRead for RubyReader<Value> {}
|
19
|
+
impl SeekableRead for RubyReader<RString> {}
|
20
|
+
|
21
|
+
pub fn build_ruby_reader(
|
22
|
+
ruby: &Ruby,
|
26
23
|
input: Value,
|
27
|
-
) -> Result<Box<dyn
|
24
|
+
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
28
25
|
if RubyReader::is_string_io(ruby, &input) {
|
29
26
|
RubyReader::from_string_io(ruby, input)
|
30
27
|
} else if RubyReader::is_io_like(&input) {
|
31
|
-
RubyReader::from_io(
|
28
|
+
RubyReader::from_io(input)
|
32
29
|
} else {
|
33
|
-
RubyReader::from_string_like(
|
30
|
+
RubyReader::from_string_like(input)
|
31
|
+
}
|
32
|
+
}
|
33
|
+
|
34
|
+
impl Seek for RubyReader<Value> {
|
35
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
36
|
+
let (whence, offset) = match pos {
|
37
|
+
SeekFrom::Start(i) => (0, i as i64),
|
38
|
+
SeekFrom::Current(i) => (1, i),
|
39
|
+
SeekFrom::End(i) => (2, i),
|
40
|
+
};
|
41
|
+
|
42
|
+
let new_position = self
|
43
|
+
.inner
|
44
|
+
.funcall("seek", (offset, whence))
|
45
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
46
|
+
|
47
|
+
Ok(new_position)
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
impl Write for RubyReader<Value> {
|
52
|
+
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
53
|
+
let ruby_bytes = RString::from_slice(buf);
|
54
|
+
|
55
|
+
let bytes_written = self
|
56
|
+
.inner
|
57
|
+
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
58
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
59
|
+
|
60
|
+
Ok(bytes_written)
|
61
|
+
}
|
62
|
+
|
63
|
+
fn flush(&mut self) -> Result<(), io::Error> {
|
64
|
+
self.inner
|
65
|
+
.funcall::<_, _, Value>("flush", ())
|
66
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
67
|
+
|
68
|
+
Ok(())
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
impl Seek for RubyReader<RString> {
|
73
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
74
|
+
match pos {
|
75
|
+
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
76
|
+
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
77
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
78
|
+
}
|
79
|
+
Ok(self.offset as u64)
|
34
80
|
}
|
35
81
|
}
|
36
82
|
|
37
|
-
impl
|
38
|
-
fn from_io(
|
83
|
+
impl RubyReader<Value> {
|
84
|
+
fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
39
85
|
if Self::is_io_like(&input) {
|
40
|
-
Ok(Box::new(Self::from_io_like(
|
86
|
+
Ok(Box::new(Self::from_io_like(input)))
|
41
87
|
} else {
|
42
88
|
Err(magnus::Error::new(
|
43
89
|
magnus::exception::type_error(),
|
@@ -50,70 +96,19 @@ impl<'a> RubyReader<'a, Value> {
|
|
50
96
|
input.respond_to("read", false).unwrap_or(false)
|
51
97
|
}
|
52
98
|
|
53
|
-
fn from_io_like(
|
99
|
+
fn from_io_like(input: Value) -> Self {
|
54
100
|
Self {
|
55
|
-
ruby,
|
56
101
|
inner: input,
|
57
|
-
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
58
102
|
offset: 0,
|
59
|
-
buffered_bytes: 0,
|
60
103
|
}
|
61
104
|
}
|
62
|
-
|
63
|
-
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
64
|
-
if let Some(from_buf) = &self.buffer {
|
65
|
-
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
66
|
-
if self.offset < self.buffered_bytes {
|
67
|
-
let remaining = self.buffered_bytes - self.offset;
|
68
|
-
let copy_size = remaining.min(to_buf.len());
|
69
|
-
to_buf[..copy_size]
|
70
|
-
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
71
|
-
self.offset += copy_size;
|
72
|
-
Some(Ok(copy_size))
|
73
|
-
} else {
|
74
|
-
None
|
75
|
-
}
|
76
|
-
} else {
|
77
|
-
None
|
78
|
-
}
|
79
|
-
}
|
80
|
-
|
81
|
-
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
82
|
-
let buffer = self.buffer.as_mut().unwrap();
|
83
|
-
let result = self
|
84
|
-
.inner
|
85
|
-
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
86
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
87
|
-
|
88
|
-
if result.is_nil() {
|
89
|
-
return Ok(0); // EOF
|
90
|
-
}
|
91
|
-
|
92
|
-
let bytes = unsafe { result.as_slice() };
|
93
|
-
|
94
|
-
// Update internal buffer
|
95
|
-
let bytes_len = bytes.len();
|
96
|
-
if bytes_len == 0 {
|
97
|
-
return Ok(0);
|
98
|
-
}
|
99
|
-
|
100
|
-
// Only copy what we actually read
|
101
|
-
buffer[..bytes_len].copy_from_slice(bytes);
|
102
|
-
self.buffered_bytes = bytes_len;
|
103
|
-
|
104
|
-
// Copy to output buffer
|
105
|
-
let copy_size = bytes_len.min(buf.len());
|
106
|
-
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
107
|
-
self.offset = copy_size;
|
108
|
-
Ok(copy_size)
|
109
|
-
}
|
110
105
|
}
|
111
106
|
|
112
|
-
impl
|
107
|
+
impl RubyReader<RString> {
|
113
108
|
pub fn from_string_io(
|
114
|
-
ruby: &
|
109
|
+
ruby: &Ruby,
|
115
110
|
input: Value,
|
116
|
-
) -> Result<Box<dyn
|
111
|
+
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
117
112
|
if !Self::is_string_io(ruby, &input) {
|
118
113
|
return Err(magnus::Error::new(
|
119
114
|
magnus::exception::type_error(),
|
@@ -123,11 +118,8 @@ impl<'a> RubyReader<'a, RString> {
|
|
123
118
|
|
124
119
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
125
120
|
Ok(Box::new(Self {
|
126
|
-
ruby,
|
127
121
|
inner: string_content,
|
128
|
-
buffer: None,
|
129
122
|
offset: 0,
|
130
|
-
buffered_bytes: 0,
|
131
123
|
}))
|
132
124
|
}
|
133
125
|
|
@@ -139,33 +131,32 @@ impl<'a> RubyReader<'a, RString> {
|
|
139
131
|
input.is_kind_of(ruby.get_inner(*string_io_class))
|
140
132
|
}
|
141
133
|
|
142
|
-
fn from_string_like(
|
134
|
+
fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
143
135
|
// Try calling `to_str`, and if that fails, try `to_s`
|
144
136
|
let string_content = input
|
145
137
|
.funcall::<_, _, RString>("to_str", ())
|
146
138
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
147
139
|
Ok(Box::new(Self {
|
148
|
-
ruby,
|
149
140
|
inner: string_content,
|
150
|
-
buffer: None,
|
151
141
|
offset: 0,
|
152
|
-
buffered_bytes: 0,
|
153
142
|
}))
|
154
143
|
}
|
155
144
|
}
|
156
145
|
|
157
|
-
impl
|
158
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
}
|
146
|
+
impl Read for RubyReader<Value> {
|
147
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
148
|
+
let bytes = self
|
149
|
+
.inner
|
150
|
+
.funcall::<_, _, RString>("read", (buf.len(),))
|
151
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
152
|
+
|
153
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
154
|
+
|
155
|
+
Ok(bytes.len())
|
165
156
|
}
|
166
157
|
}
|
167
158
|
|
168
|
-
impl
|
159
|
+
impl Read for RubyReader<RString> {
|
169
160
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
170
161
|
let string_buffer = unsafe { self.inner.as_slice() };
|
171
162
|
if self.offset >= string_buffer.len() {
|
data/ext/osv/src/lib.rs
CHANGED
data/ext/osv/src/reader.rs
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder};
|
1
|
+
use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder, StringCacheKey};
|
2
2
|
use crate::utils::*;
|
3
|
+
use ahash::RandomState;
|
3
4
|
use csv::Trim;
|
4
5
|
use magnus::value::ReprValue;
|
5
6
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
6
7
|
use std::collections::HashMap;
|
7
|
-
use xxhash_rust::xxh3::Xxh3Builder;
|
8
8
|
|
9
|
-
pub fn parse_csv
|
9
|
+
pub fn parse_csv(
|
10
10
|
rb_self: Value,
|
11
11
|
args: &[Value],
|
12
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static,
|
12
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
|
13
13
|
let original = unsafe { Ruby::get_unchecked() };
|
14
14
|
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
15
15
|
|
16
|
-
let
|
16
|
+
let ReadCsvArgs {
|
17
17
|
to_read,
|
18
18
|
has_headers,
|
19
19
|
delimiter,
|
@@ -24,7 +24,7 @@ pub fn parse_csv<'a>(
|
|
24
24
|
flexible,
|
25
25
|
flexible_default,
|
26
26
|
trim,
|
27
|
-
} =
|
27
|
+
} = parse_read_csv_args(ruby, args)?;
|
28
28
|
|
29
29
|
let flexible_default: &'static Option<String> = Box::leak(Box::new(flexible_default));
|
30
30
|
let leaked_flexible_default: &'static Option<&str> =
|
@@ -51,11 +51,11 @@ pub fn parse_csv<'a>(
|
|
51
51
|
});
|
52
52
|
}
|
53
53
|
|
54
|
-
let iter: Box<dyn Iterator<Item = CsvRecord<
|
54
|
+
let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type.as_str() {
|
55
55
|
"hash" => {
|
56
56
|
let builder = RecordReaderBuilder::<
|
57
|
-
HashMap
|
58
|
-
>::new(
|
57
|
+
HashMap<StringCacheKey, Option<CowValue<'static>>, RandomState>,
|
58
|
+
>::new(ruby, to_read)
|
59
59
|
.has_headers(has_headers)
|
60
60
|
.flexible(flexible)
|
61
61
|
.flexible_default(flexible_default.as_deref())
|
@@ -68,7 +68,7 @@ pub fn parse_csv<'a>(
|
|
68
68
|
Box::new(builder.build_threaded()?.map(CsvRecord::Map))
|
69
69
|
}
|
70
70
|
"array" => Box::new(
|
71
|
-
RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(
|
71
|
+
RecordReaderBuilder::<Vec<Option<CowValue<'static>>>>::new(ruby, to_read)
|
72
72
|
.has_headers(has_headers)
|
73
73
|
.flexible(flexible)
|
74
74
|
.flexible_default(flexible_default.as_deref())
|
@@ -107,7 +107,7 @@ struct EnumeratorArgs {
|
|
107
107
|
|
108
108
|
fn create_enumerator(
|
109
109
|
args: EnumeratorArgs,
|
110
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static,
|
110
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
|
111
111
|
let kwargs = RHash::new();
|
112
112
|
kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
|
113
113
|
kwargs.aset(
|
data/ext/osv/src/utils.rs
CHANGED
@@ -13,12 +13,12 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
|
|
13
13
|
RString::from_value(value)
|
14
14
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
15
15
|
.to_string()
|
16
|
-
.map(
|
16
|
+
.map(Some)
|
17
17
|
} else if value.is_kind_of(ruby.class_symbol()) {
|
18
18
|
Symbol::from_value(value)
|
19
19
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
20
20
|
.funcall("to_s", ())
|
21
|
-
.map(
|
21
|
+
.map(Some)
|
22
22
|
} else {
|
23
23
|
Err(Error::new(
|
24
24
|
magnus::exception::type_error(),
|
@@ -28,7 +28,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
|
|
28
28
|
}
|
29
29
|
|
30
30
|
#[derive(Debug)]
|
31
|
-
pub struct
|
31
|
+
pub struct ReadCsvArgs {
|
32
32
|
pub to_read: Value,
|
33
33
|
pub has_headers: bool,
|
34
34
|
pub delimiter: u8,
|
@@ -42,7 +42,7 @@ pub struct CsvArgs {
|
|
42
42
|
}
|
43
43
|
|
44
44
|
/// Parse common arguments for CSV parsing
|
45
|
-
pub fn
|
45
|
+
pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, Error> {
|
46
46
|
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
47
47
|
let (to_read,) = parsed_args.required;
|
48
48
|
|
@@ -166,7 +166,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
166
166
|
None => csv::Trim::None,
|
167
167
|
};
|
168
168
|
|
169
|
-
Ok(
|
169
|
+
Ok(ReadCsvArgs {
|
170
170
|
to_read,
|
171
171
|
has_headers,
|
172
172
|
delimiter,
|
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -59,12 +59,14 @@ files:
|
|
59
59
|
- Rakefile
|
60
60
|
- ext/osv/Cargo.toml
|
61
61
|
- ext/osv/extconf.rb
|
62
|
+
- ext/osv/src/allocator.rs
|
62
63
|
- ext/osv/src/csv/builder.rs
|
63
64
|
- ext/osv/src/csv/header_cache.rs
|
64
65
|
- ext/osv/src/csv/mod.rs
|
65
66
|
- ext/osv/src/csv/parser.rs
|
66
67
|
- ext/osv/src/csv/record.rs
|
67
68
|
- ext/osv/src/csv/record_reader.rs
|
69
|
+
- ext/osv/src/csv/ruby_integration.rs
|
68
70
|
- ext/osv/src/csv/ruby_reader.rs
|
69
71
|
- ext/osv/src/lib.rs
|
70
72
|
- ext/osv/src/reader.rs
|