osv 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/osv/src/csv/builder.rs +2 -1
- data/ext/osv/src/csv/header_cache.rs +20 -15
- data/ext/osv/src/csv/parser.rs +6 -5
- data/ext/osv/src/csv/record.rs +3 -3
- data/ext/osv/src/csv/record_reader.rs +34 -13
- data/ext/osv/src/reader.rs +2 -1
- data/lib/osv/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 137ae556685639f7d13234e3061d9b310757ce02f75a713753d175f1bc71b628
|
4
|
+
data.tar.gz: 5892494ad08d783955d2b932150d65433a4d3593376fadbaf54e54780e7a350f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6efbc2ee65a8e79379722ae977ee7dbec6131b78968d080f9feb86a3310368c387da54dd8c073e9b4008cb80d906293ea9115982d00d5ff637cf5ab51179b53c
|
7
|
+
data.tar.gz: 7b4ab3199f90654cd831dfbb52a9d22b70237e7120bd5308a1b7698268fa981abefd7ee47d53424d0c7bff46956256db8f1e139d17e381fd5570a16ca183e376
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -213,7 +213,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
213
213
|
.from_reader(reader);
|
214
214
|
|
215
215
|
let mut headers =
|
216
|
-
RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
|
216
|
+
RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers, self.lossy)?;
|
217
|
+
|
217
218
|
if self.ignore_null_bytes {
|
218
219
|
headers = headers.iter().map(|h| h.replace("\0", "")).collect();
|
219
220
|
}
|
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -8,7 +8,7 @@ use std::{
|
|
8
8
|
collections::HashMap,
|
9
9
|
sync::{
|
10
10
|
atomic::{AtomicU32, Ordering},
|
11
|
-
LazyLock, Mutex, OnceLock,
|
11
|
+
Arc, LazyLock, Mutex, OnceLock,
|
12
12
|
},
|
13
13
|
};
|
14
14
|
|
@@ -22,12 +22,11 @@ pub enum CacheError {
|
|
22
22
|
LockError(String),
|
23
23
|
}
|
24
24
|
|
25
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (Arc<StringCacheKey>, AtomicU32)>>> =
|
26
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
27
27
|
|
28
28
|
pub struct StringCache;
|
29
29
|
|
30
|
-
#[derive(Copy, Clone)]
|
31
30
|
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
31
|
|
33
32
|
impl StringCacheKey {
|
@@ -50,6 +49,12 @@ impl IntoValue for StringCacheKey {
|
|
50
49
|
}
|
51
50
|
}
|
52
51
|
|
52
|
+
impl IntoValue for &StringCacheKey {
|
53
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
54
|
+
handle.into_value(self.0)
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
53
58
|
impl std::fmt::Debug for StringCacheKey {
|
54
59
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
60
|
self.1.fmt(f)
|
@@ -72,43 +77,43 @@ impl std::hash::Hash for StringCacheKey {
|
|
72
77
|
|
73
78
|
impl StringCache {
|
74
79
|
#[allow(dead_code)]
|
75
|
-
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
80
|
+
pub fn intern(string: String) -> Result<Arc<StringCacheKey>, CacheError> {
|
76
81
|
let mut cache = STRING_CACHE
|
77
82
|
.lock()
|
78
83
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
79
84
|
|
80
85
|
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
86
|
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
-
Ok(
|
87
|
+
Ok(interned_string.clone())
|
83
88
|
} else {
|
84
|
-
let interned = StringCacheKey::new(string.as_str());
|
89
|
+
let interned = Arc::new(StringCacheKey::new(string.as_str()));
|
85
90
|
let leaked = Box::leak(string.into_boxed_str());
|
86
|
-
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
91
|
+
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
87
92
|
Ok(interned)
|
88
93
|
}
|
89
94
|
}
|
90
95
|
|
91
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
96
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
|
92
97
|
let mut cache = STRING_CACHE
|
93
98
|
.lock()
|
94
99
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
95
100
|
|
96
|
-
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
101
|
+
let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
|
97
102
|
for string in strings {
|
98
103
|
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
104
|
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
-
result.push(
|
105
|
+
result.push(interned_string.clone());
|
101
106
|
} else {
|
102
|
-
let interned = StringCacheKey::new(string);
|
107
|
+
let interned = Arc::new(StringCacheKey::new(string));
|
103
108
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
104
|
-
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
109
|
+
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
105
110
|
result.push(interned);
|
106
111
|
}
|
107
112
|
}
|
108
113
|
Ok(result)
|
109
114
|
}
|
110
115
|
|
111
|
-
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
116
|
+
pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
|
112
117
|
let mut cache = STRING_CACHE
|
113
118
|
.lock()
|
114
119
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
@@ -116,7 +121,7 @@ impl StringCache {
|
|
116
121
|
let to_remove: Vec<_> = headers
|
117
122
|
.iter()
|
118
123
|
.filter_map(|header| {
|
119
|
-
let key = header.as_ref();
|
124
|
+
let key = header.as_ref().as_ref();
|
120
125
|
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
126
|
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
127
|
if prev_count == 1 {
|
@@ -140,7 +145,7 @@ impl StringCache {
|
|
140
145
|
|
141
146
|
pub struct HeaderCacheCleanupIter<I> {
|
142
147
|
pub inner: I,
|
143
|
-
pub headers: OnceLock<Vec<StringCacheKey>>,
|
148
|
+
pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
|
144
149
|
}
|
145
150
|
|
146
151
|
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
use std::borrow::Cow;
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::hash::BuildHasher;
|
4
|
+
use std::sync::Arc;
|
4
5
|
|
5
6
|
use super::header_cache::StringCacheKey;
|
6
7
|
use super::CowStr;
|
@@ -14,7 +15,7 @@ pub trait RecordParser<'a> {
|
|
14
15
|
type Output;
|
15
16
|
|
16
17
|
fn parse(
|
17
|
-
headers: &[StringCacheKey],
|
18
|
+
headers: &[Arc<StringCacheKey>],
|
18
19
|
record: &CsvRecordType,
|
19
20
|
null_string: Option<Cow<'a, str>>,
|
20
21
|
ignore_null_bytes: bool,
|
@@ -22,13 +23,13 @@ pub trait RecordParser<'a> {
|
|
22
23
|
}
|
23
24
|
|
24
25
|
impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
25
|
-
for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
|
26
|
+
for HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>
|
26
27
|
{
|
27
28
|
type Output = Self;
|
28
29
|
|
29
30
|
#[inline]
|
30
31
|
fn parse(
|
31
|
-
headers: &[StringCacheKey],
|
32
|
+
headers: &[Arc<StringCacheKey>],
|
32
33
|
record: &CsvRecordType,
|
33
34
|
null_string: Option<Cow<'a, str>>,
|
34
35
|
ignore_null_bytes: bool,
|
@@ -65,7 +66,7 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
|
65
66
|
}),
|
66
67
|
};
|
67
68
|
|
68
|
-
map.insert(
|
69
|
+
map.insert(header.clone(), value);
|
69
70
|
});
|
70
71
|
map
|
71
72
|
}
|
@@ -76,7 +77,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
|
76
77
|
|
77
78
|
#[inline]
|
78
79
|
fn parse(
|
79
|
-
headers: &[StringCacheKey],
|
80
|
+
headers: &[Arc<StringCacheKey>],
|
80
81
|
record: &CsvRecordType,
|
81
82
|
null_string: Option<Cow<'a, str>>,
|
82
83
|
ignore_null_bytes: bool,
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
use itertools::Itertools;
|
2
2
|
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
3
|
-
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
3
|
+
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
|
4
4
|
|
5
5
|
use super::StringCacheKey;
|
6
6
|
|
7
7
|
#[derive(Debug)]
|
8
8
|
pub enum CsvRecord<'a, S: BuildHasher + Default> {
|
9
9
|
Vec(Vec<Option<CowStr<'a>>>),
|
10
|
-
Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
|
10
|
+
Map(HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>),
|
11
11
|
}
|
12
12
|
|
13
13
|
impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
@@ -28,7 +28,7 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
|
28
28
|
|
29
29
|
for chunk in &map.into_iter().chunks(128) {
|
30
30
|
for (k, v) in chunk {
|
31
|
-
values[i] = handle.into_value(k);
|
31
|
+
values[i] = handle.into_value(k.as_ref());
|
32
32
|
values[i + 1] = handle.into_value(v);
|
33
33
|
i += 2;
|
34
34
|
}
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -5,6 +5,7 @@ use super::{header_cache::StringCache, ruby_reader::SeekableRead};
|
|
5
5
|
use magnus::{Error, Ruby};
|
6
6
|
use std::borrow::Cow;
|
7
7
|
use std::io::{BufReader, Read};
|
8
|
+
use std::sync::Arc;
|
8
9
|
|
9
10
|
/// Size of the internal buffer used for reading CSV records
|
10
11
|
pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
@@ -14,7 +15,7 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
|
14
15
|
/// This struct implements Iterator to provide a streaming interface for CSV records.
|
15
16
|
pub struct RecordReader<'a, T: RecordParser<'a>> {
|
16
17
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
17
|
-
headers: Vec<StringCacheKey>,
|
18
|
+
headers: Vec<Arc<StringCacheKey>>,
|
18
19
|
null_string: Option<Cow<'a, str>>,
|
19
20
|
string_record: CsvRecordType,
|
20
21
|
parser: std::marker::PhantomData<T>,
|
@@ -36,25 +37,45 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
36
37
|
ruby: &Ruby,
|
37
38
|
reader: &mut csv::Reader<impl Read>,
|
38
39
|
has_headers: bool,
|
40
|
+
lossy: bool,
|
39
41
|
) -> Result<Vec<String>, Error> {
|
40
|
-
let
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
42
|
+
let headers = if lossy {
|
43
|
+
let first_row = reader.byte_headers().map_err(|e| {
|
44
|
+
Error::new(
|
45
|
+
ruby.exception_runtime_error(),
|
46
|
+
format!("Failed to read headers: {e}"),
|
47
|
+
)
|
48
|
+
})?;
|
49
|
+
if has_headers {
|
50
|
+
first_row
|
51
|
+
.iter()
|
52
|
+
.map(String::from_utf8_lossy)
|
53
|
+
.map(|x| x.to_string())
|
54
|
+
.collect()
|
55
|
+
} else {
|
56
|
+
(0..first_row.len()).map(|i| format!("c{i}")).collect()
|
57
|
+
}
|
49
58
|
} else {
|
50
|
-
|
51
|
-
|
59
|
+
let first_row = reader.headers().map_err(|e| {
|
60
|
+
Error::new(
|
61
|
+
ruby.exception_runtime_error(),
|
62
|
+
format!("Failed to read headers: {e}"),
|
63
|
+
)
|
64
|
+
})?;
|
65
|
+
if has_headers {
|
66
|
+
first_row.iter().map(String::from).collect()
|
67
|
+
} else {
|
68
|
+
(0..first_row.len()).map(|i| format!("c{i}")).collect()
|
69
|
+
}
|
70
|
+
};
|
71
|
+
|
72
|
+
Ok(headers)
|
52
73
|
}
|
53
74
|
|
54
75
|
/// Creates a new RecordReader instance.
|
55
76
|
pub(crate) fn new(
|
56
77
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
57
|
-
headers: Vec<StringCacheKey>,
|
78
|
+
headers: Vec<Arc<StringCacheKey>>,
|
58
79
|
null_string: Option<Cow<'a, str>>,
|
59
80
|
ignore_null_bytes: bool,
|
60
81
|
lossy: bool,
|
data/ext/osv/src/reader.rs
CHANGED
@@ -5,6 +5,7 @@ use csv::Trim;
|
|
5
5
|
use magnus::value::ReprValue;
|
6
6
|
use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
|
7
7
|
use std::collections::HashMap;
|
8
|
+
use std::sync::Arc;
|
8
9
|
|
9
10
|
/// Valid result types for CSV parsing
|
10
11
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
@@ -93,7 +94,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
93
94
|
match result_type {
|
94
95
|
ResultType::Hash => {
|
95
96
|
let builder = RecordReaderBuilder::<
|
96
|
-
HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
|
97
|
+
HashMap<Arc<StringCacheKey>, Option<CowStr<'static>>, RandomState>,
|
97
98
|
>::new(ruby, to_read)
|
98
99
|
.has_headers(has_headers)
|
99
100
|
.flexible(flexible)
|
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|