osv 0.3.1 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +39 -4
- data/ext/osv/Cargo.toml +4 -3
- data/ext/osv/src/csv/builder.rs +18 -2
- data/ext/osv/src/csv/header_cache.rs +71 -0
- data/ext/osv/src/csv/mod.rs +1 -1
- data/ext/osv/src/csv/parser.rs +8 -8
- data/ext/osv/src/csv/reader.rs +27 -3
- data/ext/osv/src/csv/record.rs +9 -3
- data/ext/osv/src/reader.rs +6 -6
- data/lib/osv/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33c644fac6e61f8bf3b9f11e646d6706017ece2a386df03a568b5ed06fd91e2a
|
4
|
+
data.tar.gz: 0d977fb3a7eaf867663feb76161eb2d3fbe42e758523ef8cafbff51a9acfef0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6fe382c005837fbfc705bd02b1859fdfa9fa9f955c15f7b13ffacb97f1d5dc0714288c23a7e2431a21aada6aacb1952b33ae8d14acd019c96ed719f5580c02d
|
7
|
+
data.tar.gz: 4e7c3d783f23af709505a4c182d1ee6744c9f28226aecdf68c532ada9ffa653858bc730bfd879cfa94d8e8f7a1280c49380bb5b6f74ed16b7ac776318aa53d1b
|
data/Cargo.lock
CHANGED
@@ -2,6 +2,12 @@
|
|
2
2
|
# It is not intended for manual editing.
|
3
3
|
version = 3
|
4
4
|
|
5
|
+
[[package]]
|
6
|
+
name = "adler2"
|
7
|
+
version = "2.0.0"
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
+
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
10
|
+
|
5
11
|
[[package]]
|
6
12
|
name = "aho-corasick"
|
7
13
|
version = "1.1.3"
|
@@ -69,6 +75,15 @@ dependencies = [
|
|
69
75
|
"libloading",
|
70
76
|
]
|
71
77
|
|
78
|
+
[[package]]
|
79
|
+
name = "crc32fast"
|
80
|
+
version = "1.4.2"
|
81
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
82
|
+
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
|
83
|
+
dependencies = [
|
84
|
+
"cfg-if",
|
85
|
+
]
|
86
|
+
|
72
87
|
[[package]]
|
73
88
|
name = "csv"
|
74
89
|
version = "1.3.1"
|
@@ -96,6 +111,16 @@ version = "1.13.0"
|
|
96
111
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
97
112
|
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
98
113
|
|
114
|
+
[[package]]
|
115
|
+
name = "flate2"
|
116
|
+
version = "1.0.35"
|
117
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
118
|
+
checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c"
|
119
|
+
dependencies = [
|
120
|
+
"crc32fast",
|
121
|
+
"miniz_oxide",
|
122
|
+
]
|
123
|
+
|
99
124
|
[[package]]
|
100
125
|
name = "futures-core"
|
101
126
|
version = "0.3.31"
|
@@ -218,6 +243,15 @@ version = "0.2.1"
|
|
218
243
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
219
244
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
220
245
|
|
246
|
+
[[package]]
|
247
|
+
name = "miniz_oxide"
|
248
|
+
version = "0.8.2"
|
249
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
250
|
+
checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394"
|
251
|
+
dependencies = [
|
252
|
+
"adler2",
|
253
|
+
]
|
254
|
+
|
221
255
|
[[package]]
|
222
256
|
name = "nom"
|
223
257
|
version = "7.1.3"
|
@@ -233,6 +267,7 @@ name = "osv"
|
|
233
267
|
version = "0.1.0"
|
234
268
|
dependencies = [
|
235
269
|
"csv",
|
270
|
+
"flate2",
|
236
271
|
"kanal",
|
237
272
|
"magnus 0.7.1",
|
238
273
|
"rb-sys",
|
@@ -260,18 +295,18 @@ dependencies = [
|
|
260
295
|
|
261
296
|
[[package]]
|
262
297
|
name = "rb-sys"
|
263
|
-
version = "0.9.
|
298
|
+
version = "0.9.104"
|
264
299
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
265
|
-
checksum = "
|
300
|
+
checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2"
|
266
301
|
dependencies = [
|
267
302
|
"rb-sys-build",
|
268
303
|
]
|
269
304
|
|
270
305
|
[[package]]
|
271
306
|
name = "rb-sys-build"
|
272
|
-
version = "0.9.
|
307
|
+
version = "0.9.104"
|
273
308
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
274
|
-
checksum = "
|
309
|
+
checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc"
|
275
310
|
dependencies = [
|
276
311
|
"bindgen",
|
277
312
|
"lazy_static",
|
data/ext/osv/Cargo.toml
CHANGED
@@ -7,9 +7,10 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
-
csv = "1.3
|
10
|
+
csv = "^1.3"
|
11
|
+
flate2 = "1.0.35"
|
12
|
+
kanal = "0.1.0-pre8"
|
11
13
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
12
|
-
rb-sys = "0.9"
|
14
|
+
rb-sys = "^0.9"
|
13
15
|
serde = { version = "1.0", features = ["derive"] }
|
14
16
|
serde_magnus = "0.8.1"
|
15
|
-
kanal = "0.1.0-pre8"
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
use super::{
|
2
|
+
header_cache::StringCache,
|
2
3
|
parser::RecordParser,
|
3
4
|
reader::{ReadImpl, RecordReader},
|
4
5
|
};
|
6
|
+
use flate2::read::GzDecoder;
|
5
7
|
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
|
6
8
|
use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
|
7
9
|
|
@@ -74,7 +76,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
74
76
|
format!("Failed to open file: {e}"),
|
75
77
|
)
|
76
78
|
})?;
|
77
|
-
|
79
|
+
if path.ends_with(".gz") {
|
80
|
+
let file = GzDecoder::new(file);
|
81
|
+
Ok(Box::new(file))
|
82
|
+
} else {
|
83
|
+
Ok(Box::new(file))
|
84
|
+
}
|
78
85
|
}
|
79
86
|
}
|
80
87
|
|
@@ -89,11 +96,19 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
89
96
|
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
90
97
|
let null_string = self.null_string;
|
91
98
|
|
99
|
+
let static_headers = StringCache::intern_many(&headers).map_err(|e| {
|
100
|
+
Error::new(
|
101
|
+
self.ruby.exception_runtime_error(),
|
102
|
+
format!("Failed to intern headers: {e}"),
|
103
|
+
)
|
104
|
+
})?;
|
105
|
+
let headers_for_cleanup = static_headers.clone();
|
106
|
+
|
92
107
|
let (sender, receiver) = kanal::bounded(self.buffer);
|
93
108
|
let handle = thread::spawn(move || {
|
94
109
|
let mut record = csv::StringRecord::new();
|
95
110
|
while let Ok(true) = reader.read_record(&mut record) {
|
96
|
-
let row = T::parse(&
|
111
|
+
let row = T::parse(&static_headers, &record, &null_string);
|
97
112
|
if sender.send(row).is_err() {
|
98
113
|
break;
|
99
114
|
}
|
@@ -104,6 +119,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
104
119
|
|
105
120
|
Ok(RecordReader {
|
106
121
|
reader: ReadImpl::MultiThreaded {
|
122
|
+
headers: headers_for_cleanup,
|
107
123
|
receiver,
|
108
124
|
handle: Some(handle),
|
109
125
|
},
|
@@ -0,0 +1,71 @@
|
|
1
|
+
/// This module exists to avoid cloning header keys in returned HashMaps.
|
2
|
+
/// Since the underlying RString creation already involves cloning,
|
3
|
+
/// this caching layer aims to reduce redundant allocations.
|
4
|
+
///
|
5
|
+
/// Note: Performance testing on macOS showed minimal speed improvements,
|
6
|
+
/// so this optimization could be removed if any issues arise.
|
7
|
+
|
8
|
+
|
9
|
+
use std::{
|
10
|
+
collections::HashMap,
|
11
|
+
sync::{atomic::AtomicU32, LazyLock, Mutex},
|
12
|
+
};
|
13
|
+
|
14
|
+
/// Process-wide table of interned header strings.
///
/// Every key is a `Box<str>` that was leaked to obtain a `&'static str`.
/// The value counts how many handles to that string have been handed out by
/// `StringCache::intern` / `StringCache::intern_many` and not yet returned
/// through `StringCache::clear`.
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

/// Namespace for the header-interning functions; it carries no state itself.
pub struct StringCache {}

impl StringCache {
    /// Looks `key` up in the cache and, when present, takes one more
    /// reference on it and returns the cached `&'static str`.
    fn acquire_existing(
        cache: &HashMap<&'static str, AtomicU32>,
        key: &str,
    ) -> Option<&'static str> {
        cache.get_key_value(key).map(|(&existing, count)| {
            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            existing
        })
    }

    /// Leaks `owned` to obtain a `'static` string and registers it in the
    /// cache with a reference count of one.
    fn insert_new(cache: &mut HashMap<&'static str, AtomicU32>, owned: String) -> &'static str {
        let leaked: &'static str = Box::leak(owned.into_boxed_str());
        cache.insert(leaked, AtomicU32::new(1));
        leaked
    }

    /// Interns a single string and returns its cached `&'static str`.
    ///
    /// Each successful call takes one reference that should eventually be
    /// given back through [`StringCache::clear`].
    ///
    /// # Errors
    ///
    /// Returns the lock error message if the cache mutex is poisoned.
    // Kept for API completeness; only `intern_many` is visibly used by the crate.
    #[allow(dead_code)]
    pub fn intern(string: String) -> Result<&'static str, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        Ok(match Self::acquire_existing(&cache, &string) {
            Some(existing) => existing,
            // Cache miss: move the owned string into the cache without copying it.
            None => Self::insert_new(&mut cache, string),
        })
    }

    /// Interns every string in `strings`, preserving order, under a single
    /// acquisition of the cache lock.
    ///
    /// Each returned element holds one reference; duplicates in `strings`
    /// yield the same pointer and hold one reference per occurrence.
    ///
    /// # Errors
    ///
    /// Returns the lock error message if the cache mutex is poisoned.
    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
        let mut result = Vec::with_capacity(strings.len());

        for string in strings {
            let static_str = match Self::acquire_existing(&cache, string) {
                Some(existing) => existing,
                None => Self::insert_new(&mut cache, string.clone()),
            };
            result.push(static_str);
        }

        Ok(result)
    }

    /// Gives back one reference for every entry of `headers`; a string whose
    /// last reference is released is removed from the cache and deallocated.
    ///
    /// Strings that are not in the cache are ignored.
    ///
    /// Callers must pass each handle obtained from `intern` / `intern_many`
    /// at most once and must not use a handle after releasing it: once the
    /// last reference is gone the `&'static str` is dangling.
    ///
    /// # Errors
    ///
    /// Returns the lock error message if the cache mutex is poisoned.
    pub fn clear(headers: &[&'static str]) -> Result<(), String> {
        let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

        for header in headers {
            // Use the key stored in the map, not the caller's pointer: it is
            // the one that is known to come from `Box::leak`.
            let released = match cache.get_key_value(*header) {
                Some((&key, count)) => {
                    // `fetch_sub` returns the value *before* the decrement,
                    // so 1 means this call released the last reference.
                    let previous = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
                    if previous == 1 {
                        Some(key)
                    } else {
                        None
                    }
                }
                None => None,
            };

            if let Some(key) = released {
                // Drop the map entry first so the table never holds a key
                // that points at freed memory.
                cache.remove(key);
                // SAFETY: `key` was produced by `Box::leak` in `insert_new`,
                // its reference count just reached zero, and it is no longer
                // reachable through the cache, so reclaiming the box is the
                // unique release of that allocation.
                unsafe {
                    drop(Box::from_raw(key as *const str as *mut str));
                }
            }
        }

        Ok(())
    }
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -2,24 +2,24 @@ use std::collections::HashMap;
|
|
2
2
|
|
3
3
|
pub trait RecordParser {
|
4
4
|
type Output;
|
5
|
-
fn parse
|
6
|
-
headers: &'
|
5
|
+
fn parse(
|
6
|
+
headers: &[&'static str],
|
7
7
|
record: &csv::StringRecord,
|
8
8
|
null_string: &str,
|
9
9
|
) -> Self::Output;
|
10
10
|
}
|
11
11
|
|
12
|
-
impl RecordParser for HashMap
|
12
|
+
impl RecordParser for HashMap<&'static str, Option<String>> {
|
13
13
|
type Output = Self;
|
14
|
-
fn parse
|
15
|
-
headers: &'
|
14
|
+
fn parse(
|
15
|
+
headers: &[&'static str],
|
16
16
|
record: &csv::StringRecord,
|
17
17
|
null_string: &str,
|
18
18
|
) -> Self::Output {
|
19
19
|
let mut map = HashMap::with_capacity(headers.len());
|
20
20
|
for (header, field) in headers.iter().zip(record.iter()) {
|
21
21
|
map.insert(
|
22
|
-
header
|
22
|
+
*header,
|
23
23
|
if field == null_string {
|
24
24
|
None
|
25
25
|
} else {
|
@@ -33,8 +33,8 @@ impl RecordParser for HashMap<String, Option<String>> {
|
|
33
33
|
|
34
34
|
impl RecordParser for Vec<Option<String>> {
|
35
35
|
type Output = Self;
|
36
|
-
fn parse
|
37
|
-
_headers: &'
|
36
|
+
fn parse(
|
37
|
+
_headers: &[&'static str],
|
38
38
|
record: &csv::StringRecord,
|
39
39
|
null_string: &str,
|
40
40
|
) -> Self::Output {
|
data/ext/osv/src/csv/reader.rs
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
use super::parser::RecordParser;
|
1
|
+
use super::{header_cache::StringCache, parser::RecordParser};
|
2
2
|
use magnus::{Error, Ruby};
|
3
3
|
use std::{io::Read, thread};
|
4
4
|
|
@@ -6,14 +6,36 @@ pub struct RecordReader<T: RecordParser> {
|
|
6
6
|
pub(crate) reader: ReadImpl<T>,
|
7
7
|
}
|
8
8
|
|
9
|
+
impl<T: RecordParser> Drop for RecordReader<T> {
|
10
|
+
fn drop(&mut self) {
|
11
|
+
match &mut self.reader {
|
12
|
+
ReadImpl::MultiThreaded {
|
13
|
+
receiver,
|
14
|
+
handle,
|
15
|
+
headers,
|
16
|
+
} => {
|
17
|
+
receiver.close();
|
18
|
+
if let Some(handle) = handle.take() {
|
19
|
+
let _ = handle.join();
|
20
|
+
}
|
21
|
+
StringCache::clear(headers).unwrap();
|
22
|
+
}
|
23
|
+
ReadImpl::SingleThreaded { headers, .. } => {
|
24
|
+
StringCache::clear(headers).unwrap();
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
9
30
|
#[allow(dead_code)]
|
10
31
|
pub enum ReadImpl<T: RecordParser> {
|
11
32
|
SingleThreaded {
|
12
33
|
reader: csv::Reader<Box<dyn Read + Send + 'static>>,
|
13
|
-
headers: Vec
|
34
|
+
headers: Vec<&'static str>,
|
14
35
|
null_string: String,
|
15
36
|
},
|
16
37
|
MultiThreaded {
|
38
|
+
headers: Vec<&'static str>,
|
17
39
|
receiver: kanal::Receiver<T::Output>,
|
18
40
|
handle: Option<thread::JoinHandle<()>>,
|
19
41
|
},
|
@@ -48,7 +70,9 @@ impl<T: RecordParser> Iterator for RecordReader<T> {
|
|
48
70
|
|
49
71
|
fn next(&mut self) -> Option<Self::Item> {
|
50
72
|
match &mut self.reader {
|
51
|
-
ReadImpl::MultiThreaded {
|
73
|
+
ReadImpl::MultiThreaded {
|
74
|
+
receiver, handle, ..
|
75
|
+
} => match receiver.recv() {
|
52
76
|
Ok(record) => Some(record),
|
53
77
|
Err(_) => {
|
54
78
|
if let Some(handle) = handle.take() {
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,17 +1,23 @@
|
|
1
|
-
use magnus::{IntoValue, Ruby, Value};
|
1
|
+
use magnus::{IntoValue, RHash, Ruby, Value};
|
2
2
|
use std::collections::HashMap;
|
3
3
|
|
4
4
|
#[derive(Debug)]
|
5
5
|
pub enum CsvRecord {
|
6
6
|
Vec(Vec<Option<String>>),
|
7
|
-
Map(HashMap
|
7
|
+
Map(HashMap<&'static str, Option<String>>),
|
8
8
|
}
|
9
9
|
|
10
10
|
impl IntoValue for CsvRecord {
|
11
11
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
12
12
|
match self {
|
13
13
|
CsvRecord::Vec(vec) => vec.into_value_with(handle),
|
14
|
-
CsvRecord::Map(map) =>
|
14
|
+
CsvRecord::Map(map) => {
|
15
|
+
let hash = RHash::new();
|
16
|
+
for (k, v) in map {
|
17
|
+
hash.aset(k, v).unwrap();
|
18
|
+
}
|
19
|
+
hash.into_value_with(handle)
|
20
|
+
}
|
15
21
|
}
|
16
22
|
}
|
17
23
|
}
|
data/ext/osv/src/reader.rs
CHANGED
@@ -1,15 +1,15 @@
|
|
1
|
-
use std::collections::HashMap;
|
2
|
-
|
3
1
|
use crate::csv::{CsvRecord, RecordReaderBuilder};
|
4
2
|
use crate::utils::*;
|
5
3
|
use magnus::value::ReprValue;
|
6
4
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
5
|
+
use std::collections::HashMap;
|
7
6
|
|
8
7
|
pub fn parse_csv(
|
9
|
-
ruby: &Ruby,
|
10
8
|
rb_self: Value,
|
11
9
|
args: &[Value],
|
12
10
|
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
|
11
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
12
|
+
|
13
13
|
let CsvArgs {
|
14
14
|
to_read,
|
15
15
|
has_headers,
|
@@ -18,7 +18,7 @@ pub fn parse_csv(
|
|
18
18
|
null_string,
|
19
19
|
buffer_size,
|
20
20
|
result_type,
|
21
|
-
} = parse_csv_args(ruby, args)?;
|
21
|
+
} = parse_csv_args(&ruby, args)?;
|
22
22
|
|
23
23
|
if !ruby.block_given() {
|
24
24
|
return create_enumerator(EnumeratorArgs {
|
@@ -35,7 +35,7 @@ pub fn parse_csv(
|
|
35
35
|
|
36
36
|
let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
|
37
37
|
"hash" => Box::new(
|
38
|
-
RecordReaderBuilder::<HashMap
|
38
|
+
RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
|
39
39
|
.has_headers(has_headers)
|
40
40
|
.delimiter(delimiter)
|
41
41
|
.quote_char(quote_char)
|
@@ -45,7 +45,7 @@ pub fn parse_csv(
|
|
45
45
|
.map(CsvRecord::Map),
|
46
46
|
),
|
47
47
|
"array" => Box::new(
|
48
|
-
RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
|
48
|
+
RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
|
49
49
|
.has_headers(has_headers)
|
50
50
|
.delimiter(delimiter)
|
51
51
|
.quote_char(quote_char)
|
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- ext/osv/Cargo.toml
|
57
57
|
- ext/osv/extconf.rb
|
58
58
|
- ext/osv/src/csv/builder.rs
|
59
|
+
- ext/osv/src/csv/header_cache.rs
|
59
60
|
- ext/osv/src/csv/mod.rs
|
60
61
|
- ext/osv/src/csv/parser.rs
|
61
62
|
- ext/osv/src/csv/reader.rs
|