osv 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +11 -1
- data/README.md +27 -27
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +92 -85
- data/ext/osv/src/csv/header_cache.rs +105 -26
- data/ext/osv/src/csv/mod.rs +2 -2
- data/ext/osv/src/csv/parser.rs +22 -85
- data/ext/osv/src/csv/record.rs +25 -8
- data/ext/osv/src/csv/record_reader.rs +53 -118
- data/ext/osv/src/csv/ruby_integration.rs +10 -21
- data/ext/osv/src/csv/ruby_reader.rs +9 -4
- data/ext/osv/src/reader.rs +64 -46
- data/ext/osv/src/utils.rs +4 -12
- data/lib/osv/version.rb +1 -1
- metadata +2 -2
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -2,77 +2,78 @@ use std::borrow::Cow;
|
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::hash::BuildHasher;
|
4
4
|
|
5
|
-
use super::
|
5
|
+
use super::header_cache::StringCacheKey;
|
6
|
+
use super::CowStr;
|
6
7
|
|
7
8
|
pub trait RecordParser<'a> {
|
8
|
-
type Output
|
9
|
+
type Output;
|
9
10
|
|
10
11
|
fn parse(
|
11
|
-
headers: &[
|
12
|
+
headers: &[StringCacheKey],
|
12
13
|
record: &csv::StringRecord,
|
13
|
-
null_string: Option
|
14
|
+
null_string: Option<Cow<'a, str>>,
|
14
15
|
flexible_default: Option<Cow<'a, str>>,
|
15
16
|
) -> Self::Output;
|
16
17
|
}
|
17
18
|
|
18
|
-
impl<'a, S: BuildHasher + Default
|
19
|
-
for HashMap
|
19
|
+
impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
20
|
+
for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
|
20
21
|
{
|
21
22
|
type Output = Self;
|
22
23
|
|
23
24
|
#[inline]
|
24
25
|
fn parse(
|
25
|
-
headers: &[
|
26
|
+
headers: &[StringCacheKey],
|
26
27
|
record: &csv::StringRecord,
|
27
|
-
null_string: Option
|
28
|
+
null_string: Option<Cow<'a, str>>,
|
28
29
|
flexible_default: Option<Cow<'a, str>>,
|
29
30
|
) -> Self::Output {
|
30
31
|
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
31
32
|
|
32
33
|
let shared_empty = Cow::Borrowed("");
|
33
|
-
let shared_default = flexible_default.map(
|
34
|
-
headers.iter().enumerate().for_each(|(i,
|
34
|
+
let shared_default = flexible_default.map(CowStr);
|
35
|
+
headers.iter().enumerate().for_each(|(i, ref header)| {
|
35
36
|
let value = record.get(i).map_or_else(
|
36
37
|
|| shared_default.clone(),
|
37
38
|
|field| {
|
38
|
-
if null_string == Some(field) {
|
39
|
+
if null_string.as_deref() == Some(field) {
|
39
40
|
None
|
40
41
|
} else if field.is_empty() {
|
41
|
-
Some(
|
42
|
+
Some(CowStr(shared_empty.clone()))
|
42
43
|
} else {
|
43
|
-
Some(
|
44
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
44
45
|
}
|
45
46
|
},
|
46
47
|
);
|
47
|
-
map.insert(header, value);
|
48
|
+
map.insert((*header).clone(), value);
|
48
49
|
});
|
49
50
|
map
|
50
51
|
}
|
51
52
|
}
|
52
53
|
|
53
|
-
impl<'a> RecordParser<'a> for Vec<Option<
|
54
|
+
impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
54
55
|
type Output = Self;
|
55
56
|
|
56
57
|
#[inline]
|
57
58
|
fn parse(
|
58
|
-
headers: &[
|
59
|
+
headers: &[StringCacheKey],
|
59
60
|
record: &csv::StringRecord,
|
60
|
-
null_string: Option
|
61
|
+
null_string: Option<Cow<'a, str>>,
|
61
62
|
flexible_default: Option<Cow<'a, str>>,
|
62
63
|
) -> Self::Output {
|
63
64
|
let target_len = headers.len();
|
64
65
|
let mut vec = Vec::with_capacity(target_len);
|
65
66
|
|
66
67
|
let shared_empty = Cow::Borrowed("");
|
67
|
-
let shared_default = flexible_default.map(
|
68
|
+
let shared_default = flexible_default.map(CowStr);
|
68
69
|
|
69
70
|
for field in record.iter() {
|
70
|
-
let value = if Some(field) == null_string {
|
71
|
+
let value = if Some(field) == null_string.as_deref() {
|
71
72
|
None
|
72
73
|
} else if field.is_empty() {
|
73
|
-
Some(
|
74
|
+
Some(CowStr(shared_empty.clone()))
|
74
75
|
} else {
|
75
|
-
Some(
|
76
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
76
77
|
};
|
77
78
|
vec.push(value);
|
78
79
|
}
|
@@ -85,67 +86,3 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
|
|
85
86
|
vec
|
86
87
|
}
|
87
88
|
}
|
88
|
-
|
89
|
-
// impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
|
90
|
-
// for HashMap<&'static str, Option<String>, S>
|
91
|
-
// {
|
92
|
-
// type Output = Self;
|
93
|
-
|
94
|
-
// #[inline]
|
95
|
-
// fn parse(
|
96
|
-
// headers: &[&'static str],
|
97
|
-
// record: &csv::StringRecord,
|
98
|
-
// null_string: Option<&str>,
|
99
|
-
// flexible_default: Option<Cow<'a, str>>,
|
100
|
-
// ) -> Self::Output {
|
101
|
-
// let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
102
|
-
// headers.iter().enumerate().for_each(|(i, &header)| {
|
103
|
-
// let value = record.get(i).map_or_else(
|
104
|
-
// || flexible_default.clone(),
|
105
|
-
// |field| {
|
106
|
-
// if null_string == Some(field) {
|
107
|
-
// None
|
108
|
-
// } else if field.is_empty() {
|
109
|
-
// Some(String::new())
|
110
|
-
// } else {
|
111
|
-
// Some(field.into())
|
112
|
-
// }
|
113
|
-
// },
|
114
|
-
// );
|
115
|
-
// map.insert(header, value);
|
116
|
-
// });
|
117
|
-
// map
|
118
|
-
// }
|
119
|
-
// }
|
120
|
-
|
121
|
-
// impl<'a> RecordParser<'a> for Vec<Option<String>> {
|
122
|
-
// type Output = Self;
|
123
|
-
|
124
|
-
// #[inline]
|
125
|
-
// fn parse(
|
126
|
-
// headers: &[&'static str],
|
127
|
-
// record: &csv::StringRecord,
|
128
|
-
// null_string: Option<&str>,
|
129
|
-
// flexible_default: Option<Cow<'a, str>>,
|
130
|
-
// ) -> Self::Output {
|
131
|
-
// let target_len = headers.len();
|
132
|
-
// let mut vec = Vec::with_capacity(target_len);
|
133
|
-
// for field in record.iter() {
|
134
|
-
// let value = if Some(field) == null_string {
|
135
|
-
// None
|
136
|
-
// } else if field.is_empty() {
|
137
|
-
// Some(String::new())
|
138
|
-
// } else {
|
139
|
-
// Some(field.into())
|
140
|
-
// };
|
141
|
-
// vec.push(value);
|
142
|
-
// }
|
143
|
-
|
144
|
-
// if vec.len() < target_len {
|
145
|
-
// if let Some(default) = flexible_default {
|
146
|
-
// vec.resize_with(target_len, || Some(default.to_string()));
|
147
|
-
// }
|
148
|
-
// }
|
149
|
-
// vec
|
150
|
-
// }
|
151
|
-
// }
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,10 +1,13 @@
|
|
1
|
-
use
|
1
|
+
use itertools::Itertools;
|
2
|
+
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
2
3
|
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
3
4
|
|
5
|
+
use super::StringCacheKey;
|
6
|
+
|
4
7
|
#[derive(Debug)]
|
5
8
|
pub enum CsvRecord<'a, S: BuildHasher + Default> {
|
6
|
-
Vec(Vec<Option<
|
7
|
-
Map(HashMap
|
9
|
+
Vec(Vec<Option<CowStr<'a>>>),
|
10
|
+
Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
|
8
11
|
}
|
9
12
|
|
10
13
|
impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
@@ -19,9 +22,23 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
|
19
22
|
CsvRecord::Map(map) => {
|
20
23
|
// Pre-allocate the hash with the known size
|
21
24
|
let hash = handle.hash_new_capa(map.len());
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
|
26
|
+
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
27
|
+
let mut i = 0;
|
28
|
+
|
29
|
+
for chunk in &map.into_iter().chunks(128) {
|
30
|
+
for (k, v) in chunk {
|
31
|
+
values[i] = handle.into_value(k);
|
32
|
+
values[i + 1] = handle.into_value(v);
|
33
|
+
i += 2;
|
34
|
+
}
|
35
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
36
|
+
|
37
|
+
// Zero out used values
|
38
|
+
values[..i].fill(handle.qnil().as_value());
|
39
|
+
i = 0;
|
40
|
+
}
|
41
|
+
|
25
42
|
hash.into_value_with(handle)
|
26
43
|
}
|
27
44
|
}
|
@@ -29,9 +46,9 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
|
29
46
|
}
|
30
47
|
|
31
48
|
#[derive(Debug, Clone)]
|
32
|
-
pub struct
|
49
|
+
pub struct CowStr<'a>(pub Cow<'a, str>);
|
33
50
|
|
34
|
-
impl IntoValue for
|
51
|
+
impl IntoValue for CowStr<'_> {
|
35
52
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
36
53
|
self.0.into_value_with(handle)
|
37
54
|
}
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -1,32 +1,35 @@
|
|
1
|
+
use super::header_cache::StringCacheKey;
|
1
2
|
use super::parser::RecordParser;
|
2
3
|
use super::{header_cache::StringCache, ruby_reader::SeekableRead};
|
3
4
|
use magnus::{Error, Ruby};
|
4
|
-
use std::
|
5
|
-
use std::{
|
5
|
+
use std::borrow::Cow;
|
6
|
+
use std::io::{BufReader, Read};
|
6
7
|
|
8
|
+
/// Size of the internal buffer used for reading CSV records
|
7
9
|
pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
8
10
|
|
11
|
+
/// A reader that processes CSV records using a specified parser.
|
12
|
+
///
|
13
|
+
/// This struct implements Iterator to provide a streaming interface for CSV records.
|
9
14
|
pub struct RecordReader<'a, T: RecordParser<'a>> {
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
17
|
-
headers: Vec<&'static str>,
|
18
|
-
null_string: Option<String>,
|
19
|
-
flexible_default: Option<Cow<'a, str>>,
|
20
|
-
string_record: csv::StringRecord,
|
21
|
-
},
|
22
|
-
MultiThreaded {
|
23
|
-
headers: Vec<&'static str>,
|
24
|
-
receiver: kanal::Receiver<T::Output>,
|
25
|
-
handle: Option<thread::JoinHandle<()>>,
|
26
|
-
},
|
15
|
+
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
16
|
+
headers: Vec<StringCacheKey>,
|
17
|
+
null_string: Option<Cow<'a, str>>,
|
18
|
+
flexible_default: Option<Cow<'a, str>>,
|
19
|
+
string_record: csv::StringRecord,
|
20
|
+
parser: std::marker::PhantomData<T>,
|
27
21
|
}
|
28
22
|
|
29
23
|
impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
24
|
+
/// Reads and processes headers from a CSV reader.
|
25
|
+
///
|
26
|
+
/// # Arguments
|
27
|
+
/// * `ruby` - Ruby VM context for error handling
|
28
|
+
/// * `reader` - CSV reader instance
|
29
|
+
/// * `has_headers` - Whether the CSV file contains headers
|
30
|
+
///
|
31
|
+
/// # Returns
|
32
|
+
/// A vector of header strings or generated column names if `has_headers` is false
|
30
33
|
#[inline]
|
31
34
|
pub(crate) fn get_headers(
|
32
35
|
ruby: &Ruby,
|
@@ -40,67 +43,41 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
40
43
|
)
|
41
44
|
})?;
|
42
45
|
|
43
|
-
|
44
|
-
|
45
|
-
headers.extend(first_row.iter().map(String::from));
|
46
|
+
Ok(if has_headers {
|
47
|
+
first_row.iter().map(String::from).collect()
|
46
48
|
} else {
|
47
|
-
|
48
|
-
}
|
49
|
-
Ok(headers)
|
49
|
+
(0..first_row.len()).map(|i| format!("c{i}")).collect()
|
50
|
+
})
|
50
51
|
}
|
51
52
|
|
52
|
-
|
53
|
+
/// Creates a new RecordReader instance.
|
54
|
+
pub(crate) fn new(
|
53
55
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
54
|
-
headers: Vec
|
55
|
-
null_string: Option<
|
56
|
-
flexible_default: Option
|
56
|
+
headers: Vec<StringCacheKey>,
|
57
|
+
null_string: Option<Cow<'a, str>>,
|
58
|
+
flexible_default: Option<Cow<'a, str>>,
|
57
59
|
) -> Self {
|
58
60
|
let headers_len = headers.len();
|
59
61
|
Self {
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
},
|
62
|
+
reader,
|
63
|
+
headers,
|
64
|
+
null_string,
|
65
|
+
flexible_default,
|
66
|
+
string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
|
67
|
+
parser: std::marker::PhantomData,
|
67
68
|
}
|
68
69
|
}
|
69
|
-
}
|
70
70
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
let handle = thread::spawn(move || {
|
83
|
-
let mut record =
|
84
|
-
csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_for_thread.len());
|
85
|
-
while let Ok(true) = reader.read_record(&mut record) {
|
86
|
-
let row = T::parse(
|
87
|
-
&headers_for_thread,
|
88
|
-
&record,
|
89
|
-
null_string.as_deref(),
|
90
|
-
flexible_default.map(Cow::Borrowed),
|
91
|
-
);
|
92
|
-
if sender.send(row).is_err() {
|
93
|
-
break;
|
94
|
-
}
|
95
|
-
}
|
96
|
-
});
|
97
|
-
|
98
|
-
Self {
|
99
|
-
inner: ReaderImpl::MultiThreaded {
|
100
|
-
headers,
|
101
|
-
receiver,
|
102
|
-
handle: Some(handle),
|
103
|
-
},
|
71
|
+
/// Attempts to read the next record, returning any errors encountered.
|
72
|
+
fn try_next(&mut self) -> csv::Result<Option<T::Output>> {
|
73
|
+
match self.reader.read_record(&mut self.string_record)? {
|
74
|
+
true => Ok(Some(T::parse(
|
75
|
+
&self.headers,
|
76
|
+
&self.string_record,
|
77
|
+
self.null_string.clone(),
|
78
|
+
self.flexible_default.clone(),
|
79
|
+
))),
|
80
|
+
false => Ok(None),
|
104
81
|
}
|
105
82
|
}
|
106
83
|
}
|
@@ -110,63 +87,21 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
|
|
110
87
|
|
111
88
|
#[inline]
|
112
89
|
fn next(&mut self) -> Option<Self::Item> {
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
} => match receiver.recv() {
|
117
|
-
Ok(record) => Some(record),
|
118
|
-
Err(_) => {
|
119
|
-
if let Some(handle) = handle.take() {
|
120
|
-
let _ = handle.join();
|
121
|
-
}
|
122
|
-
None
|
123
|
-
}
|
124
|
-
},
|
125
|
-
ReaderImpl::SingleThreaded {
|
126
|
-
reader,
|
127
|
-
headers,
|
128
|
-
null_string,
|
129
|
-
flexible_default,
|
130
|
-
ref mut string_record,
|
131
|
-
} => match reader.read_record(string_record) {
|
132
|
-
Ok(true) => Some(T::parse(
|
133
|
-
headers,
|
134
|
-
string_record,
|
135
|
-
null_string.as_deref(),
|
136
|
-
flexible_default.clone(),
|
137
|
-
)),
|
138
|
-
Ok(false) => None,
|
139
|
-
Err(_e) => None,
|
140
|
-
},
|
141
|
-
}
|
90
|
+
// Note: We intentionally swallow errors here to maintain Iterator contract.
|
91
|
+
// Errors can be handled by using try_next() directly if needed.
|
92
|
+
self.try_next().ok().flatten()
|
142
93
|
}
|
143
94
|
|
144
95
|
#[inline]
|
145
96
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
146
|
-
|
147
|
-
(0, None)
|
97
|
+
(0, None) // Cannot determine size without reading entire file
|
148
98
|
}
|
149
99
|
}
|
150
100
|
|
151
101
|
impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
|
152
102
|
#[inline]
|
153
103
|
fn drop(&mut self) {
|
154
|
-
|
155
|
-
|
156
|
-
receiver,
|
157
|
-
handle,
|
158
|
-
headers,
|
159
|
-
..
|
160
|
-
} => {
|
161
|
-
receiver.close();
|
162
|
-
if let Some(handle) = handle.take() {
|
163
|
-
let _ = handle.join();
|
164
|
-
}
|
165
|
-
let _ = StringCache::clear(headers);
|
166
|
-
}
|
167
|
-
ReaderImpl::SingleThreaded { headers, .. } => {
|
168
|
-
let _ = StringCache::clear(headers);
|
169
|
-
}
|
170
|
-
}
|
104
|
+
// Intentionally ignore errors during cleanup as there's no meaningful way to handle them
|
105
|
+
let _ = StringCache::clear(&self.headers);
|
171
106
|
}
|
172
107
|
}
|
data/ext/osv/src/csv/ruby_integration.rs
CHANGED
@@ -1,30 +1,19 @@
|
|
1
|
-
use std::{
|
1
|
+
use std::{
|
2
|
+
fs::File,
|
3
|
+
io::{self, Read, Seek, SeekFrom},
|
4
|
+
mem::ManuallyDrop,
|
5
|
+
};
|
2
6
|
|
3
7
|
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
4
8
|
|
5
|
-
impl
|
9
|
+
impl Read for ForgottenFileHandle {
|
6
10
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
7
11
|
self.0.read(buf)
|
8
12
|
}
|
13
|
+
}
|
9
14
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
// fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
|
15
|
-
// self.0.read_buf(cursor)
|
16
|
-
// }
|
17
|
-
|
18
|
-
// #[inline]
|
19
|
-
// fn is_read_vectored(&self) -> bool {
|
20
|
-
// self.0.is_read_vectored()
|
21
|
-
// }
|
22
|
-
|
23
|
-
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
|
24
|
-
self.0.read_to_end(buf)
|
25
|
-
}
|
26
|
-
|
27
|
-
fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
|
28
|
-
self.0.read_to_string(buf)
|
15
|
+
impl Seek for ForgottenFileHandle {
|
16
|
+
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
|
17
|
+
self.0.seek(pos)
|
29
18
|
}
|
30
19
|
}
|
data/ext/osv/src/csv/ruby_reader.rs
CHANGED
@@ -2,9 +2,12 @@ use magnus::{
|
|
2
2
|
value::{Opaque, ReprValue},
|
3
3
|
RClass, RString, Ruby, Value,
|
4
4
|
};
|
5
|
-
use std::
|
5
|
+
use std::fs::File;
|
6
|
+
use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
|
6
7
|
use std::sync::OnceLock;
|
7
8
|
|
9
|
+
use super::ForgottenFileHandle;
|
10
|
+
|
8
11
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
12
|
|
10
13
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
@@ -17,6 +20,10 @@ pub struct RubyReader<T> {
|
|
17
20
|
pub trait SeekableRead: std::io::Read + Seek {}
|
18
21
|
impl SeekableRead for RubyReader<Value> {}
|
19
22
|
impl SeekableRead for RubyReader<RString> {}
|
23
|
+
impl SeekableRead for File {}
|
24
|
+
impl<T: Read + Seek> SeekableRead for BufReader<T> {}
|
25
|
+
impl SeekableRead for std::io::Cursor<Vec<u8>> {}
|
26
|
+
impl SeekableRead for ForgottenFileHandle {}
|
20
27
|
|
21
28
|
pub fn build_ruby_reader(
|
22
29
|
ruby: &Ruby,
|
@@ -74,9 +81,7 @@ impl Seek for RubyReader<RString> {
|
|
74
81
|
match pos {
|
75
82
|
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
76
83
|
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
77
|
-
io::SeekFrom::End(offset) =>
|
78
|
-
self.offset = self.inner.len() - offset as usize
|
79
|
-
}
|
84
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
80
85
|
}
|
81
86
|
Ok(self.offset as u64)
|
82
87
|
}
|