osv 0.3.22 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/ext/osv/src/csv/builder.rs +12 -24
- data/ext/osv/src/csv/header_cache.rs +20 -15
- data/ext/osv/src/csv/parser.rs +64 -35
- data/ext/osv/src/csv/record.rs +3 -3
- data/ext/osv/src/csv/record_reader.rs +53 -21
- data/ext/osv/src/reader.rs +8 -8
- data/ext/osv/src/utils.rs +8 -8
- data/lib/osv/version.rb +1 -1
- data/lib/osv.rbi +2 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 137ae556685639f7d13234e3061d9b310757ce02f75a713753d175f1bc71b628
|
4
|
+
data.tar.gz: 5892494ad08d783955d2b932150d65433a4d3593376fadbaf54e54780e7a350f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6efbc2ee65a8e79379722ae977ee7dbec6131b78968d080f9feb86a3310368c387da54dd8c073e9b4008cb80d906293ea9115982d00d5ff637cf5ab51179b53c
|
7
|
+
data.tar.gz: 7b4ab3199f90654cd831dfbb52a9d22b70237e7120bd5308a1b7698268fa981abefd7ee47d53424d0c7bff46956256db8f1e139d17e381fd5570a16ca183e376
|
data/README.md
CHANGED
@@ -84,11 +84,10 @@ OSV.for_each("data.csv",
|
|
84
84
|
|
85
85
|
# Parsing behavior
|
86
86
|
flexible: false, # Allow varying number of fields (default: false)
|
87
|
-
flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
|
88
|
-
# Implicitly enables flexible mode if set.
|
89
87
|
trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
|
90
88
|
buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
|
91
89
|
ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
|
90
|
+
lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
|
92
91
|
)
|
93
92
|
```
|
94
93
|
|
@@ -103,9 +102,9 @@ OSV.for_each("data.csv",
|
|
103
102
|
- `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
|
104
103
|
- `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
|
105
104
|
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
|
106
|
-
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
|
107
105
|
- `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
|
108
106
|
- `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
|
107
|
+
- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
|
109
108
|
|
110
109
|
When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
|
111
110
|
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -79,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
|
|
79
79
|
quote_char: u8,
|
80
80
|
null_string: Option<String>,
|
81
81
|
flexible: bool,
|
82
|
-
flexible_default: Option<String>,
|
83
82
|
trim: csv::Trim,
|
84
83
|
ignore_null_bytes: bool,
|
84
|
+
lossy: bool,
|
85
85
|
_phantom: PhantomData<T>,
|
86
86
|
_phantom_a: PhantomData<&'a ()>,
|
87
87
|
}
|
@@ -97,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
97
97
|
quote_char: b'"',
|
98
98
|
null_string: None,
|
99
99
|
flexible: false,
|
100
|
-
flexible_default: None,
|
101
100
|
trim: csv::Trim::None,
|
102
101
|
ignore_null_bytes: false,
|
102
|
+
lossy: false,
|
103
103
|
_phantom: PhantomData,
|
104
104
|
_phantom_a: PhantomData,
|
105
105
|
}
|
@@ -140,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
140
140
|
self
|
141
141
|
}
|
142
142
|
|
143
|
-
/// Sets the default value for missing fields when in flexible mode.
|
144
|
-
#[must_use]
|
145
|
-
pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
|
146
|
-
self.flexible_default = flexible_default;
|
147
|
-
self
|
148
|
-
}
|
149
|
-
|
150
143
|
/// Sets the trimming mode for fields.
|
151
144
|
#[must_use]
|
152
145
|
pub fn trim(mut self, trim: csv::Trim) -> Self {
|
@@ -160,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
160
153
|
self
|
161
154
|
}
|
162
155
|
|
156
|
+
#[must_use]
|
157
|
+
pub fn lossy(mut self, lossy: bool) -> Self {
|
158
|
+
self.lossy = lossy;
|
159
|
+
self
|
160
|
+
}
|
161
|
+
|
163
162
|
/// Handles reading from a file descriptor.
|
164
163
|
fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
165
164
|
let raw_value = self.to_read.as_raw();
|
@@ -202,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
202
201
|
build_ruby_reader(&self.ruby, self.to_read)?
|
203
202
|
};
|
204
203
|
|
205
|
-
let flexible = self.flexible
|
204
|
+
let flexible = self.flexible;
|
206
205
|
let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
|
207
206
|
|
208
207
|
let mut reader = csv::ReaderBuilder::new()
|
@@ -214,24 +213,13 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
214
213
|
.from_reader(reader);
|
215
214
|
|
216
215
|
let mut headers =
|
217
|
-
RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
|
216
|
+
RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers, self.lossy)?;
|
217
|
+
|
218
218
|
if self.ignore_null_bytes {
|
219
219
|
headers = headers.iter().map(|h| h.replace("\0", "")).collect();
|
220
220
|
}
|
221
221
|
let static_headers = StringCache::intern_many(&headers)?;
|
222
222
|
|
223
|
-
// We intern both of these to get static string references we can reuse throughout the parser.
|
224
|
-
let flexible_default = self
|
225
|
-
.flexible_default
|
226
|
-
.map(|s| {
|
227
|
-
RString::new(&s)
|
228
|
-
.to_interned_str()
|
229
|
-
.as_str()
|
230
|
-
.map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
|
231
|
-
})
|
232
|
-
.transpose()?
|
233
|
-
.map(Cow::Borrowed);
|
234
|
-
|
235
223
|
let null_string = self
|
236
224
|
.null_string
|
237
225
|
.map(|s| {
|
@@ -247,8 +235,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
247
235
|
reader,
|
248
236
|
static_headers,
|
249
237
|
null_string,
|
250
|
-
flexible_default,
|
251
238
|
self.ignore_null_bytes,
|
239
|
+
self.lossy,
|
252
240
|
))
|
253
241
|
}
|
254
242
|
}
|
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -8,7 +8,7 @@ use std::{
|
|
8
8
|
collections::HashMap,
|
9
9
|
sync::{
|
10
10
|
atomic::{AtomicU32, Ordering},
|
11
|
-
LazyLock, Mutex, OnceLock,
|
11
|
+
Arc, LazyLock, Mutex, OnceLock,
|
12
12
|
},
|
13
13
|
};
|
14
14
|
|
@@ -22,12 +22,11 @@ pub enum CacheError {
|
|
22
22
|
LockError(String),
|
23
23
|
}
|
24
24
|
|
25
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (Arc<StringCacheKey>, AtomicU32)>>> =
|
26
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
27
27
|
|
28
28
|
pub struct StringCache;
|
29
29
|
|
30
|
-
#[derive(Copy, Clone)]
|
31
30
|
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
31
|
|
33
32
|
impl StringCacheKey {
|
@@ -50,6 +49,12 @@ impl IntoValue for StringCacheKey {
|
|
50
49
|
}
|
51
50
|
}
|
52
51
|
|
52
|
+
impl IntoValue for &StringCacheKey {
|
53
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
54
|
+
handle.into_value(self.0)
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
53
58
|
impl std::fmt::Debug for StringCacheKey {
|
54
59
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
60
|
self.1.fmt(f)
|
@@ -72,43 +77,43 @@ impl std::hash::Hash for StringCacheKey {
|
|
72
77
|
|
73
78
|
impl StringCache {
|
74
79
|
#[allow(dead_code)]
|
75
|
-
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
80
|
+
pub fn intern(string: String) -> Result<Arc<StringCacheKey>, CacheError> {
|
76
81
|
let mut cache = STRING_CACHE
|
77
82
|
.lock()
|
78
83
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
79
84
|
|
80
85
|
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
86
|
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
-
Ok(
|
87
|
+
Ok(interned_string.clone())
|
83
88
|
} else {
|
84
|
-
let interned = StringCacheKey::new(string.as_str());
|
89
|
+
let interned = Arc::new(StringCacheKey::new(string.as_str()));
|
85
90
|
let leaked = Box::leak(string.into_boxed_str());
|
86
|
-
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
91
|
+
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
87
92
|
Ok(interned)
|
88
93
|
}
|
89
94
|
}
|
90
95
|
|
91
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
96
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
|
92
97
|
let mut cache = STRING_CACHE
|
93
98
|
.lock()
|
94
99
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
95
100
|
|
96
|
-
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
101
|
+
let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
|
97
102
|
for string in strings {
|
98
103
|
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
104
|
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
-
result.push(
|
105
|
+
result.push(interned_string.clone());
|
101
106
|
} else {
|
102
|
-
let interned = StringCacheKey::new(string);
|
107
|
+
let interned = Arc::new(StringCacheKey::new(string));
|
103
108
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
104
|
-
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
109
|
+
cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
|
105
110
|
result.push(interned);
|
106
111
|
}
|
107
112
|
}
|
108
113
|
Ok(result)
|
109
114
|
}
|
110
115
|
|
111
|
-
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
116
|
+
pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
|
112
117
|
let mut cache = STRING_CACHE
|
113
118
|
.lock()
|
114
119
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
@@ -116,7 +121,7 @@ impl StringCache {
|
|
116
121
|
let to_remove: Vec<_> = headers
|
117
122
|
.iter()
|
118
123
|
.filter_map(|header| {
|
119
|
-
let key = header.as_ref();
|
124
|
+
let key = header.as_ref().as_ref();
|
120
125
|
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
126
|
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
127
|
if prev_count == 1 {
|
@@ -140,7 +145,7 @@ impl StringCache {
|
|
140
145
|
|
141
146
|
pub struct HeaderCacheCleanupIter<I> {
|
142
147
|
pub inner: I,
|
143
|
-
pub headers: OnceLock<Vec<StringCacheKey>>,
|
148
|
+
pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
|
144
149
|
}
|
145
150
|
|
146
151
|
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -1,44 +1,47 @@
|
|
1
1
|
use std::borrow::Cow;
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::hash::BuildHasher;
|
4
|
+
use std::sync::Arc;
|
4
5
|
|
5
6
|
use super::header_cache::StringCacheKey;
|
6
7
|
use super::CowStr;
|
7
8
|
|
9
|
+
pub enum CsvRecordType {
|
10
|
+
String(csv::StringRecord),
|
11
|
+
Byte(csv::ByteRecord),
|
12
|
+
}
|
13
|
+
|
8
14
|
pub trait RecordParser<'a> {
|
9
15
|
type Output;
|
10
16
|
|
11
17
|
fn parse(
|
12
|
-
headers: &[StringCacheKey],
|
13
|
-
record: &csv::StringRecord,
|
18
|
+
headers: &[Arc<StringCacheKey>],
|
19
|
+
record: &CsvRecordType,
|
14
20
|
null_string: Option<Cow<'a, str>>,
|
15
|
-
flexible_default: Option<Cow<'a, str>>,
|
16
21
|
ignore_null_bytes: bool,
|
17
22
|
) -> Self::Output;
|
18
23
|
}
|
19
24
|
|
20
25
|
impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
21
|
-
for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
|
26
|
+
for HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>
|
22
27
|
{
|
23
28
|
type Output = Self;
|
24
29
|
|
25
30
|
#[inline]
|
26
31
|
fn parse(
|
27
|
-
headers: &[StringCacheKey],
|
28
|
-
record: &csv::StringRecord,
|
32
|
+
headers: &[Arc<StringCacheKey>],
|
33
|
+
record: &CsvRecordType,
|
29
34
|
null_string: Option<Cow<'a, str>>,
|
30
|
-
flexible_default: Option<Cow<'a, str>>,
|
31
35
|
ignore_null_bytes: bool,
|
32
36
|
) -> Self::Output {
|
33
37
|
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
34
38
|
|
35
39
|
let shared_empty = Cow::Borrowed("");
|
36
|
-
|
40
|
+
|
37
41
|
headers.iter().enumerate().for_each(|(i, header)| {
|
38
|
-
let value = record
|
39
|
-
|
40
|
-
|
41
|
-
if null_string.as_deref() == Some(field) {
|
42
|
+
let value = match record {
|
43
|
+
CsvRecordType::String(s) => s.get(i).and_then(|field| {
|
44
|
+
if null_string.as_deref() == Some(field.as_ref()) {
|
42
45
|
None
|
43
46
|
} else if field.is_empty() {
|
44
47
|
Some(CowStr(shared_empty.clone()))
|
@@ -47,9 +50,23 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
|
47
50
|
} else {
|
48
51
|
Some(CowStr(Cow::Owned(field.to_string())))
|
49
52
|
}
|
50
|
-
},
|
51
|
-
|
52
|
-
|
53
|
+
}),
|
54
|
+
|
55
|
+
CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
|
56
|
+
let field = String::from_utf8_lossy(field);
|
57
|
+
if null_string.as_deref() == Some(field.as_ref()) {
|
58
|
+
None
|
59
|
+
} else if field.is_empty() {
|
60
|
+
Some(CowStr(shared_empty.clone()))
|
61
|
+
} else if ignore_null_bytes {
|
62
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
63
|
+
} else {
|
64
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
65
|
+
}
|
66
|
+
}),
|
67
|
+
};
|
68
|
+
|
69
|
+
map.insert(header.clone(), value);
|
53
70
|
});
|
54
71
|
map
|
55
72
|
}
|
@@ -60,36 +77,48 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
|
60
77
|
|
61
78
|
#[inline]
|
62
79
|
fn parse(
|
63
|
-
headers: &[StringCacheKey],
|
64
|
-
record: &csv::StringRecord,
|
80
|
+
headers: &[Arc<StringCacheKey>],
|
81
|
+
record: &CsvRecordType,
|
65
82
|
null_string: Option<Cow<'a, str>>,
|
66
|
-
flexible_default: Option<Cow<'a, str>>,
|
67
83
|
ignore_null_bytes: bool,
|
68
84
|
) -> Self::Output {
|
69
85
|
let target_len = headers.len();
|
70
86
|
let mut vec = Vec::with_capacity(target_len);
|
71
87
|
|
72
88
|
let shared_empty = Cow::Borrowed("");
|
73
|
-
let shared_default = flexible_default.map(CowStr);
|
74
|
-
|
75
|
-
for field in record.iter() {
|
76
|
-
let value = if Some(field) == null_string.as_deref() {
|
77
|
-
None
|
78
|
-
} else if field.is_empty() {
|
79
|
-
Some(CowStr(shared_empty.clone()))
|
80
|
-
} else if ignore_null_bytes {
|
81
|
-
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
82
|
-
} else {
|
83
|
-
Some(CowStr(Cow::Owned(field.to_string())))
|
84
|
-
};
|
85
|
-
vec.push(value);
|
86
|
-
}
|
87
89
|
|
88
|
-
|
89
|
-
|
90
|
-
|
90
|
+
match record {
|
91
|
+
CsvRecordType::String(record) => {
|
92
|
+
for field in record.iter() {
|
93
|
+
let value = if Some(field.as_ref()) == null_string.as_deref() {
|
94
|
+
None
|
95
|
+
} else if field.is_empty() {
|
96
|
+
Some(CowStr(shared_empty.clone()))
|
97
|
+
} else if ignore_null_bytes {
|
98
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
99
|
+
} else {
|
100
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
101
|
+
};
|
102
|
+
vec.push(value);
|
103
|
+
}
|
104
|
+
}
|
105
|
+
CsvRecordType::Byte(record) => {
|
106
|
+
for field in record.iter() {
|
107
|
+
let field = String::from_utf8_lossy(field);
|
108
|
+
let value = if Some(field.as_ref()) == null_string.as_deref() {
|
109
|
+
None
|
110
|
+
} else if field.is_empty() {
|
111
|
+
Some(CowStr(shared_empty.clone()))
|
112
|
+
} else if ignore_null_bytes {
|
113
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
114
|
+
} else {
|
115
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
116
|
+
};
|
117
|
+
vec.push(value);
|
118
|
+
}
|
91
119
|
}
|
92
120
|
}
|
121
|
+
|
93
122
|
vec
|
94
123
|
}
|
95
124
|
}
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
use itertools::Itertools;
|
2
2
|
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
3
|
-
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
3
|
+
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
|
4
4
|
|
5
5
|
use super::StringCacheKey;
|
6
6
|
|
7
7
|
#[derive(Debug)]
|
8
8
|
pub enum CsvRecord<'a, S: BuildHasher + Default> {
|
9
9
|
Vec(Vec<Option<CowStr<'a>>>),
|
10
|
-
Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
|
10
|
+
Map(HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>),
|
11
11
|
}
|
12
12
|
|
13
13
|
impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
@@ -28,7 +28,7 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
|
|
28
28
|
|
29
29
|
for chunk in &map.into_iter().chunks(128) {
|
30
30
|
for (k, v) in chunk {
|
31
|
-
values[i] = handle.into_value(k);
|
31
|
+
values[i] = handle.into_value(k.as_ref());
|
32
32
|
values[i + 1] = handle.into_value(v);
|
33
33
|
i += 2;
|
34
34
|
}
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
use super::builder::ReaderError;
|
2
2
|
use super::header_cache::StringCacheKey;
|
3
|
-
use super::parser::RecordParser;
|
3
|
+
use super::parser::{CsvRecordType, RecordParser};
|
4
4
|
use super::{header_cache::StringCache, ruby_reader::SeekableRead};
|
5
5
|
use magnus::{Error, Ruby};
|
6
6
|
use std::borrow::Cow;
|
7
7
|
use std::io::{BufReader, Read};
|
8
|
+
use std::sync::Arc;
|
8
9
|
|
9
10
|
/// Size of the internal buffer used for reading CSV records
|
10
11
|
pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
@@ -14,10 +15,9 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
|
|
14
15
|
/// This struct implements Iterator to provide a streaming interface for CSV records.
|
15
16
|
pub struct RecordReader<'a, T: RecordParser<'a>> {
|
16
17
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
17
|
-
headers: Vec<StringCacheKey>,
|
18
|
+
headers: Vec<Arc<StringCacheKey>>,
|
18
19
|
null_string: Option<Cow<'a, str>>,
|
19
|
-
|
20
|
-
string_record: csv::StringRecord,
|
20
|
+
string_record: CsvRecordType,
|
21
21
|
parser: std::marker::PhantomData<T>,
|
22
22
|
ignore_null_bytes: bool,
|
23
23
|
}
|
@@ -37,36 +37,65 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
37
37
|
ruby: &Ruby,
|
38
38
|
reader: &mut csv::Reader<impl Read>,
|
39
39
|
has_headers: bool,
|
40
|
+
lossy: bool,
|
40
41
|
) -> Result<Vec<String>, Error> {
|
41
|
-
let
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
42
|
+
let headers = if lossy {
|
43
|
+
let first_row = reader.byte_headers().map_err(|e| {
|
44
|
+
Error::new(
|
45
|
+
ruby.exception_runtime_error(),
|
46
|
+
format!("Failed to read headers: {e}"),
|
47
|
+
)
|
48
|
+
})?;
|
49
|
+
if has_headers {
|
50
|
+
first_row
|
51
|
+
.iter()
|
52
|
+
.map(String::from_utf8_lossy)
|
53
|
+
.map(|x| x.to_string())
|
54
|
+
.collect()
|
55
|
+
} else {
|
56
|
+
(0..first_row.len()).map(|i| format!("c{i}")).collect()
|
57
|
+
}
|
50
58
|
} else {
|
51
|
-
|
52
|
-
|
59
|
+
let first_row = reader.headers().map_err(|e| {
|
60
|
+
Error::new(
|
61
|
+
ruby.exception_runtime_error(),
|
62
|
+
format!("Failed to read headers: {e}"),
|
63
|
+
)
|
64
|
+
})?;
|
65
|
+
if has_headers {
|
66
|
+
first_row.iter().map(String::from).collect()
|
67
|
+
} else {
|
68
|
+
(0..first_row.len()).map(|i| format!("c{i}")).collect()
|
69
|
+
}
|
70
|
+
};
|
71
|
+
|
72
|
+
Ok(headers)
|
53
73
|
}
|
54
74
|
|
55
75
|
/// Creates a new RecordReader instance.
|
56
76
|
pub(crate) fn new(
|
57
77
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
58
|
-
headers: Vec<StringCacheKey>,
|
78
|
+
headers: Vec<Arc<StringCacheKey>>,
|
59
79
|
null_string: Option<Cow<'a, str>>,
|
60
|
-
flexible_default: Option<Cow<'a, str>>,
|
61
80
|
ignore_null_bytes: bool,
|
81
|
+
lossy: bool,
|
62
82
|
) -> Self {
|
63
83
|
let headers_len = headers.len();
|
64
84
|
Self {
|
65
85
|
reader,
|
66
86
|
headers,
|
67
87
|
null_string,
|
68
|
-
|
69
|
-
|
88
|
+
string_record: if lossy {
|
89
|
+
CsvRecordType::Byte(csv::ByteRecord::with_capacity(
|
90
|
+
READ_BUFFER_SIZE,
|
91
|
+
headers_len,
|
92
|
+
))
|
93
|
+
} else {
|
94
|
+
CsvRecordType::String(csv::StringRecord::with_capacity(
|
95
|
+
READ_BUFFER_SIZE,
|
96
|
+
headers_len,
|
97
|
+
))
|
98
|
+
},
|
70
99
|
parser: std::marker::PhantomData,
|
71
100
|
ignore_null_bytes,
|
72
101
|
}
|
@@ -74,12 +103,15 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
74
103
|
|
75
104
|
/// Attempts to read the next record, returning any errors encountered.
|
76
105
|
fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
|
77
|
-
|
106
|
+
let record = match self.string_record {
|
107
|
+
CsvRecordType::String(ref mut record) => self.reader.read_record(record),
|
108
|
+
CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
|
109
|
+
}?;
|
110
|
+
if record {
|
78
111
|
Ok(Some(T::parse(
|
79
112
|
&self.headers,
|
80
113
|
&self.string_record,
|
81
114
|
self.null_string.clone(),
|
82
|
-
self.flexible_default.clone(),
|
83
115
|
self.ignore_null_bytes,
|
84
116
|
)))
|
85
117
|
} else {
|
data/ext/osv/src/reader.rs
CHANGED
@@ -5,6 +5,7 @@ use csv::Trim;
|
|
5
5
|
use magnus::value::ReprValue;
|
6
6
|
use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
|
7
7
|
use std::collections::HashMap;
|
8
|
+
use std::sync::Arc;
|
8
9
|
|
9
10
|
/// Valid result types for CSV parsing
|
10
11
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
@@ -34,9 +35,9 @@ struct EnumeratorArgs {
|
|
34
35
|
null_string: Option<String>,
|
35
36
|
result_type: String,
|
36
37
|
flexible: bool,
|
37
|
-
flexible_default: Option<String>,
|
38
38
|
trim: Option<String>,
|
39
39
|
ignore_null_bytes: bool,
|
40
|
+
lossy: bool,
|
40
41
|
}
|
41
42
|
|
42
43
|
/// Parses a CSV file with the given configuration.
|
@@ -56,9 +57,9 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
56
57
|
null_string,
|
57
58
|
result_type,
|
58
59
|
flexible,
|
59
|
-
flexible_default,
|
60
60
|
trim,
|
61
61
|
ignore_null_bytes,
|
62
|
+
lossy,
|
62
63
|
} = parse_read_csv_args(&ruby, args)?;
|
63
64
|
|
64
65
|
if !ruby.block_given() {
|
@@ -71,7 +72,6 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
71
72
|
null_string,
|
72
73
|
result_type,
|
73
74
|
flexible,
|
74
|
-
flexible_default,
|
75
75
|
trim: match trim {
|
76
76
|
Trim::All => Some("all".to_string()),
|
77
77
|
Trim::Headers => Some("headers".to_string()),
|
@@ -79,6 +79,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
79
79
|
_ => None,
|
80
80
|
},
|
81
81
|
ignore_null_bytes,
|
82
|
+
lossy,
|
82
83
|
})
|
83
84
|
.map(|yield_enum| yield_enum.into_value_with(&ruby));
|
84
85
|
}
|
@@ -93,16 +94,16 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
93
94
|
match result_type {
|
94
95
|
ResultType::Hash => {
|
95
96
|
let builder = RecordReaderBuilder::<
|
96
|
-
HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
|
97
|
+
HashMap<Arc<StringCacheKey>, Option<CowStr<'static>>, RandomState>,
|
97
98
|
>::new(ruby, to_read)
|
98
99
|
.has_headers(has_headers)
|
99
100
|
.flexible(flexible)
|
100
|
-
.flexible_default(flexible_default)
|
101
101
|
.trim(trim)
|
102
102
|
.delimiter(delimiter)
|
103
103
|
.quote_char(quote_char)
|
104
104
|
.null_string(null_string)
|
105
105
|
.ignore_null_bytes(ignore_null_bytes)
|
106
|
+
.lossy(lossy)
|
106
107
|
.build()?;
|
107
108
|
|
108
109
|
let ruby = unsafe { Ruby::get_unchecked() };
|
@@ -115,12 +116,12 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
|
115
116
|
let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
|
116
117
|
.has_headers(has_headers)
|
117
118
|
.flexible(flexible)
|
118
|
-
.flexible_default(flexible_default)
|
119
119
|
.trim(trim)
|
120
120
|
.delimiter(delimiter)
|
121
121
|
.quote_char(quote_char)
|
122
122
|
.null_string(null_string)
|
123
123
|
.ignore_null_bytes(ignore_null_bytes)
|
124
|
+
.lossy(lossy)
|
124
125
|
.build()?;
|
125
126
|
|
126
127
|
let ruby = unsafe { Ruby::get_unchecked() };
|
@@ -150,10 +151,9 @@ fn create_enumerator(args: EnumeratorArgs) -> Result<magnus::Enumerator, Error>
|
|
150
151
|
kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
|
151
152
|
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
152
153
|
kwargs.aset(Symbol::new("flexible"), args.flexible)?;
|
153
|
-
kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
|
154
154
|
kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
|
155
155
|
kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?;
|
156
|
-
|
156
|
+
kwargs.aset(Symbol::new("lossy"), args.lossy)?;
|
157
157
|
Ok(args
|
158
158
|
.rb_self
|
159
159
|
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs))))
|
data/ext/osv/src/utils.rs
CHANGED
@@ -34,9 +34,9 @@ pub struct ReadCsvArgs {
|
|
34
34
|
pub null_string: Option<String>,
|
35
35
|
pub result_type: String,
|
36
36
|
pub flexible: bool,
|
37
|
-
pub flexible_default: Option<String>,
|
38
37
|
pub trim: csv::Trim,
|
39
38
|
pub ignore_null_bytes: bool,
|
39
|
+
pub lossy: bool,
|
40
40
|
}
|
41
41
|
|
42
42
|
/// Parse common arguments for CSV parsing
|
@@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
54
54
|
Option<Option<String>>,
|
55
55
|
Option<Option<Value>>,
|
56
56
|
Option<Option<bool>>,
|
57
|
-
Option<Option<Option<String>>>,
|
58
57
|
Option<Option<Value>>,
|
59
58
|
Option<Option<bool>>,
|
59
|
+
Option<Option<bool>>,
|
60
60
|
),
|
61
61
|
(),
|
62
62
|
>(
|
@@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
69
69
|
"nil_string",
|
70
70
|
"result_type",
|
71
71
|
"flexible",
|
72
|
-
"flexible_default",
|
73
72
|
"trim",
|
74
73
|
"ignore_null_bytes",
|
74
|
+
"lossy",
|
75
75
|
],
|
76
76
|
)?;
|
77
77
|
|
@@ -134,11 +134,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
134
134
|
|
135
135
|
let flexible = kwargs.optional.5.flatten().unwrap_or_default();
|
136
136
|
|
137
|
-
let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
|
138
|
-
|
139
137
|
let trim = match kwargs
|
140
138
|
.optional
|
141
|
-
.7
|
139
|
+
.6
|
142
140
|
.flatten()
|
143
141
|
.map(|value| parse_string_or_symbol(ruby, value))
|
144
142
|
{
|
@@ -166,7 +164,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
166
164
|
None => csv::Trim::None,
|
167
165
|
};
|
168
166
|
|
169
|
-
let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default();
|
167
|
+
let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default();
|
168
|
+
|
169
|
+
let lossy = kwargs.optional.8.flatten().unwrap_or_default();
|
170
170
|
|
171
171
|
Ok(ReadCsvArgs {
|
172
172
|
to_read,
|
@@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
176
176
|
null_string,
|
177
177
|
result_type,
|
178
178
|
flexible,
|
179
|
-
flexible_default,
|
180
179
|
trim,
|
181
180
|
ignore_null_bytes,
|
181
|
+
lossy,
|
182
182
|
})
|
183
183
|
}
|
data/lib/osv/version.rb
CHANGED
data/lib/osv.rbi
CHANGED
@@ -17,14 +17,12 @@ module OSV
|
|
17
17
|
# ("hash" or "array" or :hash or :array)
|
18
18
|
# - `flexible`: Boolean specifying if the parser should be flexible
|
19
19
|
# (default: false)
|
20
|
-
# - `flexible_default`: String specifying the default value for missing fields.
|
21
|
-
# Implicitly enables flexible mode if set.
|
22
|
-
# (default: `nil`)
|
23
20
|
# - `trim`: String specifying the trim mode
|
24
21
|
# ("all" or "headers" or "fields" or :all or :headers or :fields)
|
25
22
|
# (default: `nil`)
|
26
23
|
# - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
|
27
24
|
# (default: false)
|
25
|
+
# - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character
|
28
26
|
sig do
|
29
27
|
params(
|
30
28
|
input: T.any(String, StringIO, IO),
|
@@ -35,7 +33,6 @@ module OSV
|
|
35
33
|
buffer_size: T.nilable(Integer),
|
36
34
|
result_type: T.nilable(T.any(String, Symbol)),
|
37
35
|
flexible: T.nilable(T::Boolean),
|
38
|
-
flexible_default: T.nilable(String),
|
39
36
|
ignore_null_bytes: T.nilable(T::Boolean),
|
40
37
|
trim: T.nilable(T.any(String, Symbol)),
|
41
38
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
|
@@ -50,9 +47,9 @@ module OSV
|
|
50
47
|
buffer_size: nil,
|
51
48
|
result_type: nil,
|
52
49
|
flexible: nil,
|
53
|
-
flexible_default: nil,
|
54
50
|
ignore_null_bytes: nil,
|
55
51
|
trim: nil,
|
52
|
+
lossy: nil,
|
56
53
|
&blk
|
57
54
|
)
|
58
55
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.22
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|