osv 0.3.0 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +11 -4
- data/ext/osv/Cargo.toml +3 -2
- data/ext/osv/src/csv/builder.rs +11 -3
- data/ext/osv/src/csv/header_cache.rs +63 -0
- data/ext/osv/src/csv/mod.rs +1 -1
- data/ext/osv/src/csv/parser.rs +34 -26
- data/ext/osv/src/csv/reader.rs +27 -3
- data/ext/osv/src/csv/record.rs +9 -3
- data/ext/osv/src/reader.rs +6 -6
- data/lib/osv/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 687f5a3426558ea4d4d342f153dae1f11da93807daddcc6944888d92cda7502b
|
4
|
+
data.tar.gz: e9f766b94e4c7f806e81eec859a0146b6a4758c5a69c759b084f64e01fd7a888
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb14e59bca79613064a07cbcfa6c2531f08c68eb131e512b9822c7295bcacfd051de8dcd49100d12c8377d90a438d528dc5b20bcf6a1aa7c4e44f8e2aaae7111
|
7
|
+
data.tar.gz: a3a83f9653ef5bc41c908de14c96b756c719fd2750c8739b4a2165ff91ed82d34fdfde184c853185a8ff516e324d01816c65764f05adf58db6b2159ae66b0c74
|
data/Cargo.lock
CHANGED
@@ -43,6 +43,12 @@ version = "2.6.0"
|
|
43
43
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
44
44
|
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
45
45
|
|
46
|
+
[[package]]
|
47
|
+
name = "bumpalo"
|
48
|
+
version = "3.16.0"
|
49
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
|
+
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
51
|
+
|
46
52
|
[[package]]
|
47
53
|
name = "cexpr"
|
48
54
|
version = "0.6.0"
|
@@ -232,6 +238,7 @@ dependencies = [
|
|
232
238
|
name = "osv"
|
233
239
|
version = "0.1.0"
|
234
240
|
dependencies = [
|
241
|
+
"bumpalo",
|
235
242
|
"csv",
|
236
243
|
"kanal",
|
237
244
|
"magnus 0.7.1",
|
@@ -260,18 +267,18 @@ dependencies = [
|
|
260
267
|
|
261
268
|
[[package]]
|
262
269
|
name = "rb-sys"
|
263
|
-
version = "0.9.
|
270
|
+
version = "0.9.104"
|
264
271
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
265
|
-
checksum = "
|
272
|
+
checksum = "e2e26425f064a90404ed5e33fee2137b02a9c6d1c83e19394f4d8a476b9d76a2"
|
266
273
|
dependencies = [
|
267
274
|
"rb-sys-build",
|
268
275
|
]
|
269
276
|
|
270
277
|
[[package]]
|
271
278
|
name = "rb-sys-build"
|
272
|
-
version = "0.9.
|
279
|
+
version = "0.9.104"
|
273
280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
274
|
-
checksum = "
|
281
|
+
checksum = "c9802c9003c5648ee0a067e9aa8960d402d5f764f682f93c1ed49eec72f6d7fc"
|
275
282
|
dependencies = [
|
276
283
|
"bindgen",
|
277
284
|
"lazy_static",
|
data/ext/osv/Cargo.toml
CHANGED
@@ -7,9 +7,10 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
-
csv = "1.3
|
10
|
+
csv = "^1.3"
|
11
11
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
12
|
-
rb-sys = "0.9"
|
12
|
+
rb-sys = "^0.9"
|
13
13
|
serde = { version = "1.0", features = ["derive"] }
|
14
14
|
serde_magnus = "0.8.1"
|
15
15
|
kanal = "0.1.0-pre8"
|
16
|
+
bumpalo = { version = "3.16.0", features = ["collections"] }
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use super::{
|
2
|
+
header_cache::StringCache,
|
2
3
|
parser::RecordParser,
|
3
4
|
reader::{ReadImpl, RecordReader},
|
4
5
|
};
|
@@ -80,7 +81,6 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
80
81
|
|
81
82
|
pub fn build(self) -> Result<RecordReader<T>, Error> {
|
82
83
|
let readable = self.get_reader()?;
|
83
|
-
|
84
84
|
let mut reader = csv::ReaderBuilder::new()
|
85
85
|
.has_headers(self.has_headers)
|
86
86
|
.delimiter(self.delimiter)
|
@@ -88,14 +88,21 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
88
88
|
.from_reader(readable);
|
89
89
|
|
90
90
|
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
91
|
-
let headers_clone = headers.clone();
|
92
91
|
let null_string = self.null_string;
|
93
92
|
|
93
|
+
let static_headers = StringCache::intern_many(&headers).map_err(|e| {
|
94
|
+
Error::new(
|
95
|
+
self.ruby.exception_runtime_error(),
|
96
|
+
format!("Failed to intern headers: {e}"),
|
97
|
+
)
|
98
|
+
})?;
|
99
|
+
let headers_for_cleanup = static_headers.clone();
|
100
|
+
|
94
101
|
let (sender, receiver) = kanal::bounded(self.buffer);
|
95
102
|
let handle = thread::spawn(move || {
|
96
103
|
let mut record = csv::StringRecord::new();
|
97
104
|
while let Ok(true) = reader.read_record(&mut record) {
|
98
|
-
let row = T::parse(&headers_clone, &record, &null_string);
|
105
|
+
let row = T::parse(&static_headers, &record, &null_string);
|
99
106
|
if sender.send(row).is_err() {
|
100
107
|
break;
|
101
108
|
}
|
@@ -106,6 +113,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
106
113
|
|
107
114
|
Ok(RecordReader {
|
108
115
|
reader: ReadImpl::MultiThreaded {
|
116
|
+
headers: headers_for_cleanup,
|
109
117
|
receiver,
|
110
118
|
handle: Some(handle),
|
111
119
|
},
|
data/ext/osv/src/csv/header_cache.rs
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
use std::{
|
2
|
+
collections::HashMap,
|
3
|
+
sync::{atomic::AtomicU32, LazyLock, Mutex},
|
4
|
+
};
|
5
|
+
|
6
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
7
|
+
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
8
|
+
|
9
|
+
pub struct StringCache {}
|
10
|
+
|
11
|
+
impl StringCache {
|
12
|
+
#[allow(dead_code)]
|
13
|
+
pub fn intern(string: String) -> Result<&'static str, String> {
|
14
|
+
let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
|
15
|
+
|
16
|
+
if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
|
17
|
+
count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
18
|
+
Ok(existing)
|
19
|
+
} else {
|
20
|
+
let leaked = Box::leak(string.into_boxed_str());
|
21
|
+
cache.insert(leaked, AtomicU32::new(1));
|
22
|
+
Ok(leaked)
|
23
|
+
}
|
24
|
+
}
|
25
|
+
|
26
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
|
27
|
+
let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
|
28
|
+
let mut result = Vec::with_capacity(strings.len());
|
29
|
+
|
30
|
+
for string in strings {
|
31
|
+
let static_str: &'static str =
|
32
|
+
if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
|
33
|
+
count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
34
|
+
existing
|
35
|
+
} else {
|
36
|
+
let leaked = Box::leak(string.clone().into_boxed_str());
|
37
|
+
cache.insert(leaked, AtomicU32::new(1));
|
38
|
+
leaked
|
39
|
+
};
|
40
|
+
result.push(static_str);
|
41
|
+
}
|
42
|
+
|
43
|
+
Ok(result)
|
44
|
+
}
|
45
|
+
|
46
|
+
pub fn clear(headers: &[&'static str]) -> Result<(), String> {
|
47
|
+
let cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
|
48
|
+
|
49
|
+
for header in headers {
|
50
|
+
if let Some(count) = cache.get(header) {
|
51
|
+
let remaining = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
|
52
|
+
if remaining == 0 {
|
53
|
+
let ptr = *header as *const str as *mut str;
|
54
|
+
unsafe {
|
55
|
+
let _ = Box::from_raw(ptr);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
Ok(())
|
62
|
+
}
|
63
|
+
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -2,42 +2,50 @@ use std::collections::HashMap;
|
|
2
2
|
|
3
3
|
pub trait RecordParser {
|
4
4
|
type Output;
|
5
|
-
|
6
|
-
|
5
|
+
fn parse(
|
6
|
+
headers: &[&'static str],
|
7
|
+
record: &csv::StringRecord,
|
8
|
+
null_string: &str,
|
9
|
+
) -> Self::Output;
|
7
10
|
}
|
8
11
|
|
9
|
-
impl RecordParser for HashMap<String, Option<String>> {
|
12
|
+
impl RecordParser for HashMap<&'static str, Option<String>> {
|
10
13
|
type Output = Self;
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
fn parse(
|
15
|
+
headers: &[&'static str],
|
16
|
+
record: &csv::StringRecord,
|
17
|
+
null_string: &str,
|
18
|
+
) -> Self::Output {
|
19
|
+
let mut map = HashMap::with_capacity(headers.len());
|
20
|
+
for (header, field) in headers.iter().zip(record.iter()) {
|
21
|
+
map.insert(
|
22
|
+
*header,
|
23
|
+
if field == null_string {
|
18
24
|
None
|
19
25
|
} else {
|
20
26
|
Some(field.to_string())
|
21
|
-
}
|
22
|
-
|
23
|
-
|
24
|
-
|
27
|
+
},
|
28
|
+
);
|
29
|
+
}
|
30
|
+
map
|
25
31
|
}
|
26
32
|
}
|
27
33
|
|
28
34
|
impl RecordParser for Vec<Option<String>> {
|
29
35
|
type Output = Self;
|
30
|
-
|
31
|
-
|
32
|
-
record
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
36
|
+
fn parse(
|
37
|
+
_headers: &[&'static str],
|
38
|
+
record: &csv::StringRecord,
|
39
|
+
null_string: &str,
|
40
|
+
) -> Self::Output {
|
41
|
+
let mut vec = Vec::with_capacity(record.len());
|
42
|
+
for field in record.iter() {
|
43
|
+
vec.push(if field == null_string {
|
44
|
+
None
|
45
|
+
} else {
|
46
|
+
Some(field.to_string())
|
47
|
+
});
|
48
|
+
}
|
49
|
+
vec
|
42
50
|
}
|
43
51
|
}
|
data/ext/osv/src/csv/reader.rs
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
use super::parser::RecordParser;
|
1
|
+
use super::{header_cache::StringCache, parser::RecordParser};
|
2
2
|
use magnus::{Error, Ruby};
|
3
3
|
use std::{io::Read, thread};
|
4
4
|
|
@@ -6,14 +6,36 @@ pub struct RecordReader<T: RecordParser> {
|
|
6
6
|
pub(crate) reader: ReadImpl<T>,
|
7
7
|
}
|
8
8
|
|
9
|
+
impl<T: RecordParser> Drop for RecordReader<T> {
|
10
|
+
fn drop(&mut self) {
|
11
|
+
match &mut self.reader {
|
12
|
+
ReadImpl::MultiThreaded {
|
13
|
+
receiver,
|
14
|
+
handle,
|
15
|
+
headers,
|
16
|
+
} => {
|
17
|
+
receiver.close();
|
18
|
+
if let Some(handle) = handle.take() {
|
19
|
+
let _ = handle.join();
|
20
|
+
}
|
21
|
+
StringCache::clear(headers).unwrap();
|
22
|
+
}
|
23
|
+
ReadImpl::SingleThreaded { headers, .. } => {
|
24
|
+
StringCache::clear(headers).unwrap();
|
25
|
+
}
|
26
|
+
}
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
9
30
|
#[allow(dead_code)]
|
10
31
|
pub enum ReadImpl<T: RecordParser> {
|
11
32
|
SingleThreaded {
|
12
33
|
reader: csv::Reader<Box<dyn Read + Send + 'static>>,
|
13
|
-
headers: Vec<String>,
|
34
|
+
headers: Vec<&'static str>,
|
14
35
|
null_string: String,
|
15
36
|
},
|
16
37
|
MultiThreaded {
|
38
|
+
headers: Vec<&'static str>,
|
17
39
|
receiver: kanal::Receiver<T::Output>,
|
18
40
|
handle: Option<thread::JoinHandle<()>>,
|
19
41
|
},
|
@@ -48,7 +70,9 @@ impl<T: RecordParser> Iterator for RecordReader<T> {
|
|
48
70
|
|
49
71
|
fn next(&mut self) -> Option<Self::Item> {
|
50
72
|
match &mut self.reader {
|
51
|
-
ReadImpl::MultiThreaded { receiver, handle } => match receiver.recv() {
|
73
|
+
ReadImpl::MultiThreaded {
|
74
|
+
receiver, handle, ..
|
75
|
+
} => match receiver.recv() {
|
52
76
|
Ok(record) => Some(record),
|
53
77
|
Err(_) => {
|
54
78
|
if let Some(handle) = handle.take() {
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,17 +1,23 @@
|
|
1
|
-
use magnus::{IntoValue, Ruby, Value};
|
1
|
+
use magnus::{IntoValue, RHash, Ruby, Value};
|
2
2
|
use std::collections::HashMap;
|
3
3
|
|
4
4
|
#[derive(Debug)]
|
5
5
|
pub enum CsvRecord {
|
6
6
|
Vec(Vec<Option<String>>),
|
7
|
-
Map(HashMap<String, Option<String>>),
|
7
|
+
Map(HashMap<&'static str, Option<String>>),
|
8
8
|
}
|
9
9
|
|
10
10
|
impl IntoValue for CsvRecord {
|
11
11
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
12
12
|
match self {
|
13
13
|
CsvRecord::Vec(vec) => vec.into_value_with(handle),
|
14
|
-
CsvRecord::Map(map) => map.into_value_with(handle),
|
14
|
+
CsvRecord::Map(map) => {
|
15
|
+
let hash = RHash::new();
|
16
|
+
for (k, v) in map {
|
17
|
+
hash.aset(k, v).unwrap();
|
18
|
+
}
|
19
|
+
hash.into_value_with(handle)
|
20
|
+
}
|
15
21
|
}
|
16
22
|
}
|
17
23
|
}
|
data/ext/osv/src/reader.rs
CHANGED
@@ -1,15 +1,15 @@
|
|
1
|
-
use std::collections::HashMap;
|
2
|
-
|
3
1
|
use crate::csv::{CsvRecord, RecordReaderBuilder};
|
4
2
|
use crate::utils::*;
|
5
3
|
use magnus::value::ReprValue;
|
6
4
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
5
|
+
use std::collections::HashMap;
|
7
6
|
|
8
7
|
pub fn parse_csv(
|
9
|
-
ruby: &Ruby,
|
10
8
|
rb_self: Value,
|
11
9
|
args: &[Value],
|
12
10
|
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
|
11
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
12
|
+
|
13
13
|
let CsvArgs {
|
14
14
|
to_read,
|
15
15
|
has_headers,
|
@@ -18,7 +18,7 @@ pub fn parse_csv(
|
|
18
18
|
null_string,
|
19
19
|
buffer_size,
|
20
20
|
result_type,
|
21
|
-
} = parse_csv_args(ruby, args)?;
|
21
|
+
} = parse_csv_args(&ruby, args)?;
|
22
22
|
|
23
23
|
if !ruby.block_given() {
|
24
24
|
return create_enumerator(EnumeratorArgs {
|
@@ -35,7 +35,7 @@ pub fn parse_csv(
|
|
35
35
|
|
36
36
|
let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
|
37
37
|
"hash" => Box::new(
|
38
|
-
RecordReaderBuilder::<HashMap<String, Option<String>>>::new(ruby, to_read)
|
38
|
+
RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
|
39
39
|
.has_headers(has_headers)
|
40
40
|
.delimiter(delimiter)
|
41
41
|
.quote_char(quote_char)
|
@@ -45,7 +45,7 @@ pub fn parse_csv(
|
|
45
45
|
.map(CsvRecord::Map),
|
46
46
|
),
|
47
47
|
"array" => Box::new(
|
48
|
-
RecordReaderBuilder::<Vec<Option<String>>>::new(ruby, to_read)
|
48
|
+
RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
|
49
49
|
.has_headers(has_headers)
|
50
50
|
.delimiter(delimiter)
|
51
51
|
.quote_char(quote_char)
|
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- ext/osv/Cargo.toml
|
57
57
|
- ext/osv/extconf.rb
|
58
58
|
- ext/osv/src/csv/builder.rs
|
59
|
+
- ext/osv/src/csv/header_cache.rs
|
59
60
|
- ext/osv/src/csv/mod.rs
|
60
61
|
- ext/osv/src/csv/parser.rs
|
61
62
|
- ext/osv/src/csv/reader.rs
|