osv 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/osv/src/csv/builder.rs +2 -1
- data/ext/osv/src/csv/header_cache.rs +18 -83
- data/ext/osv/src/csv/parser.rs +46 -44
- data/ext/osv/src/csv/record.rs +2 -2
- data/ext/osv/src/csv/record_reader.rs +3 -12
- data/ext/osv/src/reader.rs +3 -4
- data/lib/osv/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cae389fff24c3109f17a1c450022771da964e1b7dced4ed2f34f93753c213dc8
|
4
|
+
data.tar.gz: 1e3736fb0b84003f62a5038a3d7e71e7d6e31581f943452c2acb08b04a21ff64
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddae565f1b208de7fc18fa1cdaff7b3d1ed02ec22bf85d0477a9b152d9238049893a5ef40bad95ac4b2d8f8cb0cd59f14fd4e365aa69f3ef109fa6f6701d2499
|
7
|
+
data.tar.gz: aa81197d39f7e3dcc5732bfb7d71545cd9303d888ee7da1a34b0244ade287a8b25db16f7428d4f09689dfe1d874d3663bebb6d546e6b363f7496137339b15150
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -169,7 +169,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
169
169
|
return Err(ReaderError::InvalidFileDescriptor(fd));
|
170
170
|
}
|
171
171
|
|
172
|
-
let file = unsafe { File::from_raw_fd(fd) }
|
172
|
+
let file = std::panic::catch_unwind(|| unsafe { File::from_raw_fd(fd) })
|
173
|
+
.map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
|
173
174
|
let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
|
174
175
|
Ok(Box::new(forgotten))
|
175
176
|
}
|
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -8,11 +8,11 @@ use std::{
|
|
8
8
|
collections::HashMap,
|
9
9
|
sync::{
|
10
10
|
atomic::{AtomicU32, Ordering},
|
11
|
-
|
11
|
+
LazyLock, Mutex,
|
12
12
|
},
|
13
13
|
};
|
14
14
|
|
15
|
-
use magnus::{
|
15
|
+
use magnus::{IntoValue, RString, Ruby, Value};
|
16
16
|
|
17
17
|
use thiserror::Error;
|
18
18
|
|
@@ -22,24 +22,25 @@ pub enum CacheError {
|
|
22
22
|
LockError(String),
|
23
23
|
}
|
24
24
|
|
25
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
26
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
27
27
|
|
28
28
|
pub struct StringCache;
|
29
29
|
|
30
|
-
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(&'static str);
|
31
32
|
|
32
33
|
impl StringCacheKey {
|
33
34
|
pub fn new(string: &str) -> Self {
|
34
35
|
let rstr = RString::new(string);
|
35
36
|
let fstr = rstr.to_interned_str();
|
36
|
-
Self(
|
37
|
+
Self(fstr.as_str().unwrap())
|
37
38
|
}
|
38
39
|
}
|
39
40
|
|
40
41
|
impl AsRef<str> for StringCacheKey {
|
41
42
|
fn as_ref(&self) -> &'static str {
|
42
|
-
self.
|
43
|
+
self.0
|
43
44
|
}
|
44
45
|
}
|
45
46
|
|
@@ -57,13 +58,13 @@ impl IntoValue for &StringCacheKey {
|
|
57
58
|
|
58
59
|
impl std::fmt::Debug for StringCacheKey {
|
59
60
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
60
|
-
self.
|
61
|
+
self.0.fmt(f)
|
61
62
|
}
|
62
63
|
}
|
63
64
|
|
64
65
|
impl PartialEq for StringCacheKey {
|
65
66
|
fn eq(&self, other: &Self) -> bool {
|
66
|
-
self.
|
67
|
+
self.0 == other.0
|
67
68
|
}
|
68
69
|
}
|
69
70
|
|
@@ -71,95 +72,29 @@ impl std::cmp::Eq for StringCacheKey {}
|
|
71
72
|
|
72
73
|
impl std::hash::Hash for StringCacheKey {
|
73
74
|
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
74
|
-
self.
|
75
|
+
self.0.hash(state);
|
75
76
|
}
|
76
77
|
}
|
77
78
|
|
78
79
|
impl StringCache {
|
79
|
-
|
80
|
-
|
80
|
+
pub fn intern_many<AsStr: AsRef<str>>(
|
81
|
+
strings: &[AsStr],
|
82
|
+
) -> Result<Vec<StringCacheKey>, CacheError> {
|
81
83
|
let mut cache = STRING_CACHE
|
82
84
|
.lock()
|
83
85
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
84
86
|
|
85
|
-
|
86
|
-
counter.fetch_add(1, Ordering::Relaxed);
|
87
|
-
Ok(interned_string.clone())
|
88
|
-
} else {
|
89
|
-
let interned = Arc::new(StringCacheKey::new(string.as_str()));
|
90
|
-
let leaked = Box::leak(string.into_boxed_str());
|
91
|
-
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
92
|
-
Ok(interned)
|
93
|
-
}
|
94
|
-
}
|
95
|
-
|
96
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
|
97
|
-
let mut cache = STRING_CACHE
|
98
|
-
.lock()
|
99
|
-
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
100
|
-
|
101
|
-
let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
|
87
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
102
88
|
for string in strings {
|
103
|
-
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.
|
89
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
|
104
90
|
counter.fetch_add(1, Ordering::Relaxed);
|
105
|
-
result.push(interned_string
|
91
|
+
result.push(*interned_string);
|
106
92
|
} else {
|
107
|
-
let interned =
|
108
|
-
|
109
|
-
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
93
|
+
let interned = StringCacheKey::new(string.as_ref());
|
94
|
+
cache.insert(interned.0, (interned, AtomicU32::new(1)));
|
110
95
|
result.push(interned);
|
111
96
|
}
|
112
97
|
}
|
113
98
|
Ok(result)
|
114
99
|
}
|
115
|
-
|
116
|
-
pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
|
117
|
-
let mut cache = STRING_CACHE
|
118
|
-
.lock()
|
119
|
-
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
120
|
-
|
121
|
-
let to_remove: Vec<_> = headers
|
122
|
-
.iter()
|
123
|
-
.filter_map(|header| {
|
124
|
-
let key = header.as_ref().as_ref();
|
125
|
-
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
126
|
-
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
127
|
-
if prev_count == 1 {
|
128
|
-
Some(key)
|
129
|
-
} else {
|
130
|
-
None
|
131
|
-
}
|
132
|
-
} else {
|
133
|
-
None
|
134
|
-
}
|
135
|
-
})
|
136
|
-
.collect();
|
137
|
-
|
138
|
-
for key in to_remove {
|
139
|
-
cache.remove(key);
|
140
|
-
}
|
141
|
-
|
142
|
-
Ok(())
|
143
|
-
}
|
144
|
-
}
|
145
|
-
|
146
|
-
pub struct HeaderCacheCleanupIter<I> {
|
147
|
-
pub inner: I,
|
148
|
-
pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
|
149
|
-
}
|
150
|
-
|
151
|
-
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
152
|
-
type Item = I::Item;
|
153
|
-
|
154
|
-
fn next(&mut self) -> Option<Self::Item> {
|
155
|
-
self.inner.next()
|
156
|
-
}
|
157
|
-
}
|
158
|
-
|
159
|
-
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
160
|
-
fn drop(&mut self) {
|
161
|
-
if let Some(headers) = self.headers.get() {
|
162
|
-
StringCache::clear(headers).unwrap();
|
163
|
-
}
|
164
|
-
}
|
165
100
|
}
|
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
use std::borrow::Cow;
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::hash::BuildHasher;
|
4
|
-
use std::sync::Arc;
|
5
4
|
|
6
5
|
use super::header_cache::StringCacheKey;
|
7
6
|
use super::CowStr;
|
@@ -15,7 +14,7 @@ pub trait RecordParser<'a> {
|
|
15
14
|
type Output;
|
16
15
|
|
17
16
|
fn parse(
|
18
|
-
headers: &[
|
17
|
+
headers: &[StringCacheKey],
|
19
18
|
record: &CsvRecordType,
|
20
19
|
null_string: Option<Cow<'a, str>>,
|
21
20
|
ignore_null_bytes: bool,
|
@@ -23,46 +22,38 @@ pub trait RecordParser<'a> {
|
|
23
22
|
}
|
24
23
|
|
25
24
|
impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
26
|
-
for HashMap<
|
25
|
+
for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
|
27
26
|
{
|
28
27
|
type Output = Self;
|
29
28
|
|
30
29
|
#[inline]
|
31
30
|
fn parse(
|
32
|
-
headers: &[
|
31
|
+
headers: &[StringCacheKey],
|
33
32
|
record: &CsvRecordType,
|
34
33
|
null_string: Option<Cow<'a, str>>,
|
35
34
|
ignore_null_bytes: bool,
|
36
35
|
) -> Self::Output {
|
37
36
|
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
38
|
-
|
39
37
|
let shared_empty = Cow::Borrowed("");
|
40
38
|
|
41
39
|
headers.iter().enumerate().for_each(|(i, header)| {
|
42
40
|
let value = match record {
|
43
41
|
CsvRecordType::String(s) => s.get(i).and_then(|field| {
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
} else {
|
51
|
-
Some(CowStr(Cow::Owned(field.to_string())))
|
52
|
-
}
|
42
|
+
convert_field_to_cow_str(
|
43
|
+
field,
|
44
|
+
null_string.as_deref(),
|
45
|
+
ignore_null_bytes,
|
46
|
+
&shared_empty,
|
47
|
+
)
|
53
48
|
}),
|
54
|
-
|
55
49
|
CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
|
56
50
|
let field = String::from_utf8_lossy(field);
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
} else {
|
64
|
-
Some(CowStr(Cow::Owned(field.to_string())))
|
65
|
-
}
|
51
|
+
convert_field_to_cow_str(
|
52
|
+
&field,
|
53
|
+
null_string.as_deref(),
|
54
|
+
ignore_null_bytes,
|
55
|
+
&shared_empty,
|
56
|
+
)
|
66
57
|
}),
|
67
58
|
};
|
68
59
|
|
@@ -77,43 +68,36 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
|
77
68
|
|
78
69
|
#[inline]
|
79
70
|
fn parse(
|
80
|
-
headers: &[
|
71
|
+
headers: &[StringCacheKey],
|
81
72
|
record: &CsvRecordType,
|
82
73
|
null_string: Option<Cow<'a, str>>,
|
83
74
|
ignore_null_bytes: bool,
|
84
75
|
) -> Self::Output {
|
85
76
|
let target_len = headers.len();
|
86
77
|
let mut vec = Vec::with_capacity(target_len);
|
87
|
-
|
88
78
|
let shared_empty = Cow::Borrowed("");
|
89
79
|
|
90
80
|
match record {
|
91
81
|
CsvRecordType::String(record) => {
|
92
82
|
for field in record.iter() {
|
93
|
-
let value =
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
} else {
|
100
|
-
Some(CowStr(Cow::Owned(field.to_string())))
|
101
|
-
};
|
83
|
+
let value = convert_field_to_cow_str(
|
84
|
+
field,
|
85
|
+
null_string.as_deref(),
|
86
|
+
ignore_null_bytes,
|
87
|
+
&shared_empty,
|
88
|
+
);
|
102
89
|
vec.push(value);
|
103
90
|
}
|
104
91
|
}
|
105
92
|
CsvRecordType::Byte(record) => {
|
106
93
|
for field in record.iter() {
|
107
94
|
let field = String::from_utf8_lossy(field);
|
108
|
-
let value =
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
} else {
|
115
|
-
Some(CowStr(Cow::Owned(field.to_string())))
|
116
|
-
};
|
95
|
+
let value = convert_field_to_cow_str(
|
96
|
+
&field,
|
97
|
+
null_string.as_deref(),
|
98
|
+
ignore_null_bytes,
|
99
|
+
&shared_empty,
|
100
|
+
);
|
117
101
|
vec.push(value);
|
118
102
|
}
|
119
103
|
}
|
@@ -122,3 +106,21 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
|
122
106
|
vec
|
123
107
|
}
|
124
108
|
}
|
109
|
+
|
110
|
+
#[inline]
|
111
|
+
fn convert_field_to_cow_str<'a>(
|
112
|
+
field: &str,
|
113
|
+
null_string: Option<&str>,
|
114
|
+
ignore_null_bytes: bool,
|
115
|
+
shared_empty: &Cow<'a, str>,
|
116
|
+
) -> Option<CowStr<'a>> {
|
117
|
+
if Some(field) == null_string {
|
118
|
+
None
|
119
|
+
} else if field.is_empty() {
|
120
|
+
Some(CowStr(shared_empty.clone()))
|
121
|
+
} else if ignore_null_bytes {
|
122
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
123
|
+
} else {
|
124
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
125
|
+
}
|
126
|
+
}
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
use itertools::Itertools;
|
2
2
|
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
3
|
-
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher
|
3
|
+
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
4
4
|
|
5
5
|
use super::StringCacheKey;
|
6
6
|
|
7
7
|
#[derive(Debug)]
|
8
8
|
pub enum CsvRecord<'a, S: BuildHasher + Default> {
|
9
9
|
Vec(Vec<Option<CowStr<'a>>>),
|
10
|
-
Map(HashMap<
|
10
|
+
Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
|
11
11
|
}
|
12
12
|
|
13
13
|
impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
use super::builder::ReaderError;
|
2
2
|
use super::header_cache::StringCacheKey;
|
3
3
|
use super::parser::{CsvRecordType, RecordParser};
|
4
|
-
use super::
|
4
|
+
use super::ruby_reader::SeekableRead;
|
5
5
|
use magnus::{Error, Ruby};
|
6
6
|
use std::borrow::Cow;
|
7
7
|
use std::io::{BufReader, Read};
|
8
|
-
use std::sync::Arc;
|
9
8
|
|
10
9
|
/// Size of the internal buffer used for reading CSV records
|
11
10
|
pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
@@ -15,7 +14,7 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
|
15
14
|
/// This struct implements Iterator to provide a streaming interface for CSV records.
|
16
15
|
pub struct RecordReader<'a, T: RecordParser<'a>> {
|
17
16
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
18
|
-
headers: Vec<
|
17
|
+
headers: Vec<StringCacheKey>,
|
19
18
|
null_string: Option<Cow<'a, str>>,
|
20
19
|
string_record: CsvRecordType,
|
21
20
|
parser: std::marker::PhantomData<T>,
|
@@ -75,7 +74,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
75
74
|
/// Creates a new RecordReader instance.
|
76
75
|
pub(crate) fn new(
|
77
76
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
78
|
-
headers: Vec<
|
77
|
+
headers: Vec<StringCacheKey>,
|
79
78
|
null_string: Option<Cow<'a, str>>,
|
80
79
|
ignore_null_bytes: bool,
|
81
80
|
lossy: bool,
|
@@ -137,11 +136,3 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
|
|
137
136
|
(0, None) // Cannot determine size without reading entire file
|
138
137
|
}
|
139
138
|
}
|
140
|
-
|
141
|
-
impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
|
142
|
-
#[inline]
|
143
|
-
fn drop(&mut self) {
|
144
|
-
// Intentionally ignore errors during cleanup as there's no meaningful way to handle them
|
145
|
-
let _ = StringCache::clear(&self.headers);
|
146
|
-
}
|
147
|
-
}
|
data/ext/osv/src/reader.rs
CHANGED
@@ -5,7 +5,6 @@ use csv::Trim;
|
|
5
5
|
use magnus::value::ReprValue;
|
6
6
|
use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
|
7
7
|
use std::collections::HashMap;
|
8
|
-
use std::sync::Arc;
|
9
8
|
|
10
9
|
/// Valid result types for CSV parsing
|
11
10
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
@@ -94,7 +93,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
94
93
|
match result_type {
|
95
94
|
ResultType::Hash => {
|
96
95
|
let builder = RecordReaderBuilder::<
|
97
|
-
HashMap<
|
96
|
+
HashMap<StringCacheKey, Option<CowStr<'_>>, RandomState>,
|
98
97
|
>::new(ruby, to_read)
|
99
98
|
.has_headers(has_headers)
|
100
99
|
.flexible(flexible)
|
@@ -109,11 +108,11 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
109
108
|
let ruby = unsafe { Ruby::get_unchecked() };
|
110
109
|
for result in builder {
|
111
110
|
let record = result?;
|
112
|
-
let _: Value = ruby.yield_value(CsvRecord::Map(record))?;
|
111
|
+
let _: Value = ruby.yield_value(CsvRecord::<ahash::RandomState>::Map(record))?;
|
113
112
|
}
|
114
113
|
}
|
115
114
|
ResultType::Array => {
|
116
|
-
let builder = RecordReaderBuilder::<Vec<Option<CowStr<'
|
115
|
+
let builder = RecordReaderBuilder::<Vec<Option<CowStr<'_>>>>::new(ruby, to_read)
|
117
116
|
.has_headers(has_headers)
|
118
117
|
.flexible(flexible)
|
119
118
|
.trim(trim)
|
data/lib/osv/version.rb
CHANGED