osv 0.3.21 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/ext/osv/src/csv/builder.rs +30 -28
- data/ext/osv/src/csv/parser.rs +58 -32
- data/ext/osv/src/csv/record_reader.rs +32 -17
- data/ext/osv/src/csv/ruby_reader.rs +19 -20
- data/ext/osv/src/reader.rs +27 -26
- data/ext/osv/src/utils.rs +8 -8
- data/lib/osv/version.rb +1 -1
- data/lib/osv.rbi +2 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51e4a387f1ed43bddc9f1f7a118637953d04239b5324ef131b9c860577ed4d41
|
4
|
+
data.tar.gz: e42928a09656216bbadcc2458953a8c5f28401ddf27095fc05038e0960471854
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4100c50a629ba5803db883532cfbe547eb3091e421b0876595d91791d8952a7b0169477c9c6f31063eafa5b91d0a9b1a9f0a5ae016d70cdd101e284beebfaf22
|
7
|
+
data.tar.gz: 90a822c644fcb37dc1892ede85a54395bc9e62a4b0b0a1af838182d390702d0ee4253151faafcedbf734b0a381fe2acf5c1ab23b842059fbdd4d51570fe33e58
|
data/README.md
CHANGED
@@ -84,11 +84,10 @@ OSV.for_each("data.csv",
|
|
84
84
|
|
85
85
|
# Parsing behavior
|
86
86
|
flexible: false, # Allow varying number of fields (default: false)
|
87
|
-
flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
|
88
|
-
# Implicitly enables flexible mode if set.
|
89
87
|
trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
|
90
88
|
buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
|
91
89
|
ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
|
90
|
+
lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
|
92
91
|
)
|
93
92
|
```
|
94
93
|
|
@@ -103,9 +102,9 @@ OSV.for_each("data.csv",
|
|
103
102
|
- `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
|
104
103
|
- `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
|
105
104
|
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
|
106
|
-
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
|
107
105
|
- `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
|
108
106
|
- `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
|
107
|
+
- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
|
109
108
|
|
110
109
|
When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
|
111
110
|
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -34,6 +34,10 @@ pub enum ReaderError {
|
|
34
34
|
InvalidFlexibleDefault(String),
|
35
35
|
#[error("Invalid null string value: {0}")]
|
36
36
|
InvalidNullString(String),
|
37
|
+
#[error("Failed to parse CSV record: {0}")]
|
38
|
+
CsvParse(#[from] csv::Error),
|
39
|
+
#[error("Invalid UTF-8: {0}")]
|
40
|
+
InvalidUtf8(String),
|
37
41
|
#[error("Ruby error: {0}")]
|
38
42
|
Ruby(String),
|
39
43
|
}
|
@@ -46,10 +50,20 @@ impl From<MagnusError> for ReaderError {
|
|
46
50
|
|
47
51
|
impl From<ReaderError> for MagnusError {
|
48
52
|
fn from(err: ReaderError) -> Self {
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
+
let ruby = Ruby::get().unwrap();
|
54
|
+
match err {
|
55
|
+
ReaderError::CsvParse(csv_err) => {
|
56
|
+
if csv_err.to_string().contains("invalid utf-8") {
|
57
|
+
MagnusError::new(ruby.exception_encoding_error(), csv_err.to_string())
|
58
|
+
} else {
|
59
|
+
MagnusError::new(ruby.exception_runtime_error(), csv_err.to_string())
|
60
|
+
}
|
61
|
+
}
|
62
|
+
ReaderError::InvalidUtf8(utf8_err) => {
|
63
|
+
MagnusError::new(ruby.exception_encoding_error(), utf8_err.to_string())
|
64
|
+
}
|
65
|
+
_ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()),
|
66
|
+
}
|
53
67
|
}
|
54
68
|
}
|
55
69
|
|
@@ -65,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
|
|
65
79
|
quote_char: u8,
|
66
80
|
null_string: Option<String>,
|
67
81
|
flexible: bool,
|
68
|
-
flexible_default: Option<String>,
|
69
82
|
trim: csv::Trim,
|
70
83
|
ignore_null_bytes: bool,
|
84
|
+
lossy: bool,
|
71
85
|
_phantom: PhantomData<T>,
|
72
86
|
_phantom_a: PhantomData<&'a ()>,
|
73
87
|
}
|
@@ -83,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
83
97
|
quote_char: b'"',
|
84
98
|
null_string: None,
|
85
99
|
flexible: false,
|
86
|
-
flexible_default: None,
|
87
100
|
trim: csv::Trim::None,
|
88
101
|
ignore_null_bytes: false,
|
102
|
+
lossy: false,
|
89
103
|
_phantom: PhantomData,
|
90
104
|
_phantom_a: PhantomData,
|
91
105
|
}
|
@@ -126,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
126
140
|
self
|
127
141
|
}
|
128
142
|
|
129
|
-
/// Sets the default value for missing fields when in flexible mode.
|
130
|
-
#[must_use]
|
131
|
-
pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
|
132
|
-
self.flexible_default = flexible_default;
|
133
|
-
self
|
134
|
-
}
|
135
|
-
|
136
143
|
/// Sets the trimming mode for fields.
|
137
144
|
#[must_use]
|
138
145
|
pub fn trim(mut self, trim: csv::Trim) -> Self {
|
@@ -146,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
146
153
|
self
|
147
154
|
}
|
148
155
|
|
156
|
+
#[must_use]
|
157
|
+
pub fn lossy(mut self, lossy: bool) -> Self {
|
158
|
+
self.lossy = lossy;
|
159
|
+
self
|
160
|
+
}
|
161
|
+
|
149
162
|
/// Handles reading from a file descriptor.
|
150
163
|
fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
151
164
|
let raw_value = self.to_read.as_raw();
|
@@ -188,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
188
201
|
build_ruby_reader(&self.ruby, self.to_read)?
|
189
202
|
};
|
190
203
|
|
191
|
-
let flexible = self.flexible
|
204
|
+
let flexible = self.flexible;
|
192
205
|
let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
|
193
206
|
|
194
207
|
let mut reader = csv::ReaderBuilder::new()
|
@@ -199,24 +212,13 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
199
212
|
.trim(self.trim)
|
200
213
|
.from_reader(reader);
|
201
214
|
|
202
|
-
let mut headers =
|
215
|
+
let mut headers =
|
216
|
+
RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
|
203
217
|
if self.ignore_null_bytes {
|
204
218
|
headers = headers.iter().map(|h| h.replace("\0", "")).collect();
|
205
219
|
}
|
206
220
|
let static_headers = StringCache::intern_many(&headers)?;
|
207
221
|
|
208
|
-
// We intern both of these to get static string references we can reuse throughout the parser.
|
209
|
-
let flexible_default = self
|
210
|
-
.flexible_default
|
211
|
-
.map(|s| {
|
212
|
-
RString::new(&s)
|
213
|
-
.to_interned_str()
|
214
|
-
.as_str()
|
215
|
-
.map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
|
216
|
-
})
|
217
|
-
.transpose()?
|
218
|
-
.map(Cow::Borrowed);
|
219
|
-
|
220
222
|
let null_string = self
|
221
223
|
.null_string
|
222
224
|
.map(|s| {
|
@@ -232,8 +234,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
|
|
232
234
|
reader,
|
233
235
|
static_headers,
|
234
236
|
null_string,
|
235
|
-
flexible_default,
|
236
237
|
self.ignore_null_bytes,
|
238
|
+
self.lossy,
|
237
239
|
))
|
238
240
|
}
|
239
241
|
}
|
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -5,14 +5,18 @@ use std::hash::BuildHasher;
|
|
5
5
|
use super::header_cache::StringCacheKey;
|
6
6
|
use super::CowStr;
|
7
7
|
|
8
|
+
pub enum CsvRecordType {
|
9
|
+
String(csv::StringRecord),
|
10
|
+
Byte(csv::ByteRecord),
|
11
|
+
}
|
12
|
+
|
8
13
|
pub trait RecordParser<'a> {
|
9
14
|
type Output;
|
10
15
|
|
11
16
|
fn parse(
|
12
17
|
headers: &[StringCacheKey],
|
13
|
-
record: &
|
18
|
+
record: &CsvRecordType,
|
14
19
|
null_string: Option<Cow<'a, str>>,
|
15
|
-
flexible_default: Option<Cow<'a, str>>,
|
16
20
|
ignore_null_bytes: bool,
|
17
21
|
) -> Self::Output;
|
18
22
|
}
|
@@ -25,31 +29,42 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
|
|
25
29
|
#[inline]
|
26
30
|
fn parse(
|
27
31
|
headers: &[StringCacheKey],
|
28
|
-
record: &
|
32
|
+
record: &CsvRecordType,
|
29
33
|
null_string: Option<Cow<'a, str>>,
|
30
|
-
flexible_default: Option<Cow<'a, str>>,
|
31
34
|
ignore_null_bytes: bool,
|
32
35
|
) -> Self::Output {
|
33
36
|
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
34
37
|
|
35
38
|
let shared_empty = Cow::Borrowed("");
|
36
|
-
|
39
|
+
|
37
40
|
headers.iter().enumerate().for_each(|(i, header)| {
|
38
|
-
let value = record
|
39
|
-
|
40
|
-
|
41
|
-
if null_string.as_deref() == Some(field) {
|
41
|
+
let value = match record {
|
42
|
+
CsvRecordType::String(s) => s.get(i).and_then(|field| {
|
43
|
+
if null_string.as_deref() == Some(field.as_ref()) {
|
42
44
|
None
|
43
45
|
} else if field.is_empty() {
|
44
46
|
Some(CowStr(shared_empty.clone()))
|
45
|
-
} else if ignore_null_bytes
|
47
|
+
} else if ignore_null_bytes {
|
46
48
|
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
49
|
+
} else {
|
50
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
47
51
|
}
|
48
|
-
|
52
|
+
}),
|
53
|
+
|
54
|
+
CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
|
55
|
+
let field = String::from_utf8_lossy(field);
|
56
|
+
if null_string.as_deref() == Some(field.as_ref()) {
|
57
|
+
None
|
58
|
+
} else if field.is_empty() {
|
59
|
+
Some(CowStr(shared_empty.clone()))
|
60
|
+
} else if ignore_null_bytes {
|
61
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
62
|
+
} else {
|
49
63
|
Some(CowStr(Cow::Owned(field.to_string())))
|
50
64
|
}
|
51
|
-
},
|
52
|
-
|
65
|
+
}),
|
66
|
+
};
|
67
|
+
|
53
68
|
map.insert(*header, value);
|
54
69
|
});
|
55
70
|
map
|
@@ -62,36 +77,47 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
|
|
62
77
|
#[inline]
|
63
78
|
fn parse(
|
64
79
|
headers: &[StringCacheKey],
|
65
|
-
record: &
|
80
|
+
record: &CsvRecordType,
|
66
81
|
null_string: Option<Cow<'a, str>>,
|
67
|
-
flexible_default: Option<Cow<'a, str>>,
|
68
82
|
ignore_null_bytes: bool,
|
69
83
|
) -> Self::Output {
|
70
84
|
let target_len = headers.len();
|
71
85
|
let mut vec = Vec::with_capacity(target_len);
|
72
86
|
|
73
87
|
let shared_empty = Cow::Borrowed("");
|
74
|
-
let shared_default = flexible_default.map(CowStr);
|
75
88
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
89
|
+
match record {
|
90
|
+
CsvRecordType::String(record) => {
|
91
|
+
for field in record.iter() {
|
92
|
+
let value = if Some(field.as_ref()) == null_string.as_deref() {
|
93
|
+
None
|
94
|
+
} else if field.is_empty() {
|
95
|
+
Some(CowStr(shared_empty.clone()))
|
96
|
+
} else if ignore_null_bytes {
|
97
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
98
|
+
} else {
|
99
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
100
|
+
};
|
101
|
+
vec.push(value);
|
102
|
+
}
|
83
103
|
}
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
104
|
+
CsvRecordType::Byte(record) => {
|
105
|
+
for field in record.iter() {
|
106
|
+
let field = String::from_utf8_lossy(field);
|
107
|
+
let value = if Some(field.as_ref()) == null_string.as_deref() {
|
108
|
+
None
|
109
|
+
} else if field.is_empty() {
|
110
|
+
Some(CowStr(shared_empty.clone()))
|
111
|
+
} else if ignore_null_bytes {
|
112
|
+
Some(CowStr(Cow::Owned(field.replace("\0", ""))))
|
113
|
+
} else {
|
114
|
+
Some(CowStr(Cow::Owned(field.to_string())))
|
115
|
+
};
|
116
|
+
vec.push(value);
|
117
|
+
}
|
93
118
|
}
|
94
119
|
}
|
120
|
+
|
95
121
|
vec
|
96
122
|
}
|
97
123
|
}
|
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
+
use super::builder::ReaderError;
|
1
2
|
use super::header_cache::StringCacheKey;
|
2
|
-
use super::parser::RecordParser;
|
3
|
+
use super::parser::{CsvRecordType, RecordParser};
|
3
4
|
use super::{header_cache::StringCache, ruby_reader::SeekableRead};
|
4
5
|
use magnus::{Error, Ruby};
|
5
6
|
use std::borrow::Cow;
|
@@ -15,8 +16,7 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
|
|
15
16
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
16
17
|
headers: Vec<StringCacheKey>,
|
17
18
|
null_string: Option<Cow<'a, str>>,
|
18
|
-
|
19
|
-
string_record: csv::StringRecord,
|
19
|
+
string_record: CsvRecordType,
|
20
20
|
parser: std::marker::PhantomData<T>,
|
21
21
|
ignore_null_bytes: bool,
|
22
22
|
}
|
@@ -56,44 +56,59 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
|
|
56
56
|
reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
|
57
57
|
headers: Vec<StringCacheKey>,
|
58
58
|
null_string: Option<Cow<'a, str>>,
|
59
|
-
flexible_default: Option<Cow<'a, str>>,
|
60
59
|
ignore_null_bytes: bool,
|
60
|
+
lossy: bool,
|
61
61
|
) -> Self {
|
62
62
|
let headers_len = headers.len();
|
63
63
|
Self {
|
64
64
|
reader,
|
65
65
|
headers,
|
66
66
|
null_string,
|
67
|
-
|
68
|
-
|
67
|
+
string_record: if lossy {
|
68
|
+
CsvRecordType::Byte(csv::ByteRecord::with_capacity(
|
69
|
+
READ_BUFFER_SIZE,
|
70
|
+
headers_len,
|
71
|
+
))
|
72
|
+
} else {
|
73
|
+
CsvRecordType::String(csv::StringRecord::with_capacity(
|
74
|
+
READ_BUFFER_SIZE,
|
75
|
+
headers_len,
|
76
|
+
))
|
77
|
+
},
|
69
78
|
parser: std::marker::PhantomData,
|
70
79
|
ignore_null_bytes,
|
71
80
|
}
|
72
81
|
}
|
73
82
|
|
74
83
|
/// Attempts to read the next record, returning any errors encountered.
|
75
|
-
fn try_next(&mut self) ->
|
76
|
-
match self.
|
77
|
-
|
84
|
+
fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
|
85
|
+
let record = match self.string_record {
|
86
|
+
CsvRecordType::String(ref mut record) => self.reader.read_record(record),
|
87
|
+
CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
|
88
|
+
}?;
|
89
|
+
if record {
|
90
|
+
Ok(Some(T::parse(
|
78
91
|
&self.headers,
|
79
92
|
&self.string_record,
|
80
93
|
self.null_string.clone(),
|
81
|
-
self.
|
82
|
-
|
83
|
-
|
84
|
-
|
94
|
+
self.ignore_null_bytes,
|
95
|
+
)))
|
96
|
+
} else {
|
97
|
+
Ok(None)
|
85
98
|
}
|
86
99
|
}
|
87
100
|
}
|
88
101
|
|
89
102
|
impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
|
90
|
-
type Item = T::Output
|
103
|
+
type Item = Result<T::Output, ReaderError>;
|
91
104
|
|
92
105
|
#[inline]
|
93
106
|
fn next(&mut self) -> Option<Self::Item> {
|
94
|
-
|
95
|
-
|
96
|
-
|
107
|
+
match self.try_next() {
|
108
|
+
Ok(Some(record)) => Some(Ok(record)),
|
109
|
+
Ok(None) => None,
|
110
|
+
Err(e) => Some(Err(e)),
|
111
|
+
}
|
97
112
|
}
|
98
113
|
|
99
114
|
#[inline]
|
data/ext/osv/src/csv/ruby_reader.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{
|
2
|
+
error::Error as MagnusError,
|
2
3
|
value::{Opaque, ReprValue},
|
3
4
|
RClass, RString, Ruby, Value,
|
4
5
|
};
|
@@ -6,7 +7,7 @@ use std::fs::File;
|
|
6
7
|
use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
|
7
8
|
use std::sync::OnceLock;
|
8
9
|
|
9
|
-
use super::ForgottenFileHandle;
|
10
|
+
use super::{builder::ReaderError, ForgottenFileHandle};
|
10
11
|
|
11
12
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
12
13
|
|
@@ -25,10 +26,7 @@ impl<T: Read + Seek> SeekableRead for BufReader<T> {}
|
|
25
26
|
impl SeekableRead for std::io::Cursor<Vec<u8>> {}
|
26
27
|
impl SeekableRead for ForgottenFileHandle {}
|
27
28
|
|
28
|
-
pub fn build_ruby_reader(
|
29
|
-
ruby: &Ruby,
|
30
|
-
input: Value,
|
31
|
-
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
29
|
+
pub fn build_ruby_reader(ruby: &Ruby, input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
32
30
|
if RubyReader::is_string_io(ruby, &input) {
|
33
31
|
RubyReader::from_string_io(ruby, input)
|
34
32
|
} else if RubyReader::is_io_like(&input) {
|
@@ -88,14 +86,14 @@ impl Seek for RubyReader<RString> {
|
|
88
86
|
}
|
89
87
|
|
90
88
|
impl RubyReader<Value> {
|
91
|
-
fn from_io(input: Value) -> Result<Box<dyn SeekableRead>,
|
89
|
+
fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
92
90
|
if Self::is_io_like(&input) {
|
93
91
|
Ok(Box::new(Self::from_io_like(input)))
|
94
92
|
} else {
|
95
|
-
Err(
|
93
|
+
Err(MagnusError::new(
|
96
94
|
magnus::exception::type_error(),
|
97
95
|
"Input is not an IO-like object",
|
98
|
-
))
|
96
|
+
))?
|
99
97
|
}
|
100
98
|
}
|
101
99
|
|
@@ -112,15 +110,12 @@ impl RubyReader<Value> {
|
|
112
110
|
}
|
113
111
|
|
114
112
|
impl RubyReader<RString> {
|
115
|
-
pub fn from_string_io(
|
116
|
-
ruby: &Ruby,
|
117
|
-
input: Value,
|
118
|
-
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
113
|
+
pub fn from_string_io(ruby: &Ruby, input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
119
114
|
if !Self::is_string_io(ruby, &input) {
|
120
|
-
return Err(
|
115
|
+
return Err(MagnusError::new(
|
121
116
|
magnus::exception::type_error(),
|
122
117
|
"Input is not a StringIO",
|
123
|
-
))
|
118
|
+
))?;
|
124
119
|
}
|
125
120
|
|
126
121
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
@@ -138,11 +133,11 @@ impl RubyReader<RString> {
|
|
138
133
|
input.is_kind_of(ruby.get_inner(*string_io_class))
|
139
134
|
}
|
140
135
|
|
141
|
-
fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>,
|
142
|
-
// Try calling `to_str`, and if that fails, try `to_s`
|
136
|
+
fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
|
143
137
|
let string_content = input
|
144
138
|
.funcall::<_, _, RString>("to_str", ())
|
145
139
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
140
|
+
|
146
141
|
Ok(Box::new(Self {
|
147
142
|
inner: string_content,
|
148
143
|
offset: 0,
|
@@ -154,12 +149,16 @@ impl Read for RubyReader<Value> {
|
|
154
149
|
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
155
150
|
let bytes = self
|
156
151
|
.inner
|
157
|
-
.funcall::<_, _, RString
|
152
|
+
.funcall::<_, _, Option<RString>>("read", (buf.len(),))
|
158
153
|
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
159
154
|
|
160
|
-
|
161
|
-
|
162
|
-
|
155
|
+
match bytes {
|
156
|
+
Some(bytes) => {
|
157
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
158
|
+
Ok(bytes.len())
|
159
|
+
}
|
160
|
+
None => Ok(0), // EOF
|
161
|
+
}
|
163
162
|
}
|
164
163
|
}
|
165
164
|
|
data/ext/osv/src/reader.rs
CHANGED
@@ -3,7 +3,7 @@ use crate::utils::*;
|
|
3
3
|
use ahash::RandomState;
|
4
4
|
use csv::Trim;
|
5
5
|
use magnus::value::ReprValue;
|
6
|
-
use magnus::{
|
6
|
+
use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
|
7
7
|
use std::collections::HashMap;
|
8
8
|
|
9
9
|
/// Valid result types for CSV parsing
|
@@ -34,9 +34,9 @@ struct EnumeratorArgs {
|
|
34
34
|
null_string: Option<String>,
|
35
35
|
result_type: String,
|
36
36
|
flexible: bool,
|
37
|
-
flexible_default: Option<String>,
|
38
37
|
trim: Option<String>,
|
39
38
|
ignore_null_bytes: bool,
|
39
|
+
lossy: bool,
|
40
40
|
}
|
41
41
|
|
42
42
|
/// Parses a CSV file with the given configuration.
|
@@ -44,10 +44,7 @@ struct EnumeratorArgs {
|
|
44
44
|
/// # Safety
|
45
45
|
/// This function uses unsafe code to get the Ruby runtime and leak memory for static references.
|
46
46
|
/// This is necessary for Ruby integration but should be used with caution.
|
47
|
-
pub fn parse_csv(
|
48
|
-
rb_self: Value,
|
49
|
-
args: &[Value],
|
50
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
|
47
|
+
pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
|
51
48
|
// SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized
|
52
49
|
let ruby = unsafe { Ruby::get_unchecked() };
|
53
50
|
|
@@ -59,9 +56,9 @@ pub fn parse_csv(
|
|
59
56
|
null_string,
|
60
57
|
result_type,
|
61
58
|
flexible,
|
62
|
-
flexible_default,
|
63
59
|
trim,
|
64
60
|
ignore_null_bytes,
|
61
|
+
lossy,
|
65
62
|
} = parse_read_csv_args(&ruby, args)?;
|
66
63
|
|
67
64
|
if !ruby.block_given() {
|
@@ -74,7 +71,6 @@ pub fn parse_csv(
|
|
74
71
|
null_string,
|
75
72
|
result_type,
|
76
73
|
flexible,
|
77
|
-
flexible_default,
|
78
74
|
trim: match trim {
|
79
75
|
Trim::All => Some("all".to_string()),
|
80
76
|
Trim::Headers => Some("headers".to_string()),
|
@@ -82,7 +78,9 @@ pub fn parse_csv(
|
|
82
78
|
_ => None,
|
83
79
|
},
|
84
80
|
ignore_null_bytes,
|
85
|
-
|
81
|
+
lossy,
|
82
|
+
})
|
83
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby));
|
86
84
|
}
|
87
85
|
|
88
86
|
let result_type = ResultType::from_str(&result_type).ok_or_else(|| {
|
@@ -92,46 +90,53 @@ pub fn parse_csv(
|
|
92
90
|
)
|
93
91
|
})?;
|
94
92
|
|
95
|
-
|
93
|
+
match result_type {
|
96
94
|
ResultType::Hash => {
|
97
95
|
let builder = RecordReaderBuilder::<
|
98
96
|
HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
|
99
97
|
>::new(ruby, to_read)
|
100
98
|
.has_headers(has_headers)
|
101
99
|
.flexible(flexible)
|
102
|
-
.flexible_default(flexible_default)
|
103
100
|
.trim(trim)
|
104
101
|
.delimiter(delimiter)
|
105
102
|
.quote_char(quote_char)
|
106
103
|
.null_string(null_string)
|
107
104
|
.ignore_null_bytes(ignore_null_bytes)
|
105
|
+
.lossy(lossy)
|
108
106
|
.build()?;
|
109
107
|
|
110
|
-
|
108
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
109
|
+
for result in builder {
|
110
|
+
let record = result?;
|
111
|
+
let _: Value = ruby.yield_value(CsvRecord::Map(record))?;
|
112
|
+
}
|
111
113
|
}
|
112
114
|
ResultType::Array => {
|
113
115
|
let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
|
114
116
|
.has_headers(has_headers)
|
115
117
|
.flexible(flexible)
|
116
|
-
.flexible_default(flexible_default)
|
117
118
|
.trim(trim)
|
118
119
|
.delimiter(delimiter)
|
119
120
|
.quote_char(quote_char)
|
120
121
|
.null_string(null_string)
|
121
122
|
.ignore_null_bytes(ignore_null_bytes)
|
123
|
+
.lossy(lossy)
|
122
124
|
.build()?;
|
123
125
|
|
124
|
-
|
126
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
127
|
+
for result in builder {
|
128
|
+
let record = result?;
|
129
|
+
let _: Value = ruby.yield_value(CsvRecord::<ahash::RandomState>::Vec(record))?;
|
130
|
+
}
|
125
131
|
}
|
126
|
-
}
|
132
|
+
}
|
127
133
|
|
128
|
-
|
134
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
135
|
+
Ok(ruby.qnil().into_value_with(&ruby))
|
129
136
|
}
|
130
137
|
|
131
138
|
/// Creates an enumerator for lazy CSV parsing
|
132
|
-
fn create_enumerator(
|
133
|
-
args: EnumeratorArgs,
|
134
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
|
139
|
+
fn create_enumerator(args: EnumeratorArgs) -> Result<magnus::Enumerator, Error> {
|
135
140
|
let kwargs = RHash::new();
|
136
141
|
kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
|
137
142
|
kwargs.aset(
|
@@ -145,14 +150,10 @@ fn create_enumerator(
|
|
145
150
|
kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
|
146
151
|
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
147
152
|
kwargs.aset(Symbol::new("flexible"), args.flexible)?;
|
148
|
-
kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
|
149
153
|
kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
|
150
|
-
|
151
154
|
kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?;
|
152
|
-
|
153
|
-
|
154
|
-
let enumerator = args
|
155
|
+
kwargs.aset(Symbol::new("lossy"), args.lossy)?;
|
156
|
+
Ok(args
|
155
157
|
.rb_self
|
156
|
-
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)))
|
157
|
-
Ok(Yield::Enumerator(enumerator))
|
158
|
+
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs))))
|
158
159
|
}
|
data/ext/osv/src/utils.rs
CHANGED
@@ -34,9 +34,9 @@ pub struct ReadCsvArgs {
|
|
34
34
|
pub null_string: Option<String>,
|
35
35
|
pub result_type: String,
|
36
36
|
pub flexible: bool,
|
37
|
-
pub flexible_default: Option<String>,
|
38
37
|
pub trim: csv::Trim,
|
39
38
|
pub ignore_null_bytes: bool,
|
39
|
+
pub lossy: bool,
|
40
40
|
}
|
41
41
|
|
42
42
|
/// Parse common arguments for CSV parsing
|
@@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
54
54
|
Option<Option<String>>,
|
55
55
|
Option<Option<Value>>,
|
56
56
|
Option<Option<bool>>,
|
57
|
-
Option<Option<Option<String>>>,
|
58
57
|
Option<Option<Value>>,
|
59
58
|
Option<Option<bool>>,
|
59
|
+
Option<Option<bool>>,
|
60
60
|
),
|
61
61
|
(),
|
62
62
|
>(
|
@@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
69
69
|
"nil_string",
|
70
70
|
"result_type",
|
71
71
|
"flexible",
|
72
|
-
"flexible_default",
|
73
72
|
"trim",
|
74
73
|
"ignore_null_bytes",
|
74
|
+
"lossy",
|
75
75
|
],
|
76
76
|
)?;
|
77
77
|
|
@@ -134,11 +134,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
134
134
|
|
135
135
|
let flexible = kwargs.optional.5.flatten().unwrap_or_default();
|
136
136
|
|
137
|
-
let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
|
138
|
-
|
139
137
|
let trim = match kwargs
|
140
138
|
.optional
|
141
|
-
.
|
139
|
+
.6
|
142
140
|
.flatten()
|
143
141
|
.map(|value| parse_string_or_symbol(ruby, value))
|
144
142
|
{
|
@@ -166,7 +164,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
166
164
|
None => csv::Trim::None,
|
167
165
|
};
|
168
166
|
|
169
|
-
let ignore_null_bytes = kwargs.optional.
|
167
|
+
let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default();
|
168
|
+
|
169
|
+
let lossy = kwargs.optional.8.flatten().unwrap_or_default();
|
170
170
|
|
171
171
|
Ok(ReadCsvArgs {
|
172
172
|
to_read,
|
@@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
|
|
176
176
|
null_string,
|
177
177
|
result_type,
|
178
178
|
flexible,
|
179
|
-
flexible_default,
|
180
179
|
trim,
|
181
180
|
ignore_null_bytes,
|
181
|
+
lossy,
|
182
182
|
})
|
183
183
|
}
|
data/lib/osv/version.rb
CHANGED
data/lib/osv.rbi
CHANGED
@@ -17,14 +17,12 @@ module OSV
|
|
17
17
|
# ("hash" or "array" or :hash or :array)
|
18
18
|
# - `flexible`: Boolean specifying if the parser should be flexible
|
19
19
|
# (default: false)
|
20
|
-
# - `flexible_default`: String specifying the default value for missing fields.
|
21
|
-
# Implicitly enables flexible mode if set.
|
22
|
-
# (default: `nil`)
|
23
20
|
# - `trim`: String specifying the trim mode
|
24
21
|
# ("all" or "headers" or "fields" or :all or :headers or :fields)
|
25
22
|
# (default: `nil`)
|
26
23
|
# - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
|
27
24
|
# (default: false)
|
25
|
+
# - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character
|
28
26
|
sig do
|
29
27
|
params(
|
30
28
|
input: T.any(String, StringIO, IO),
|
@@ -35,7 +33,6 @@ module OSV
|
|
35
33
|
buffer_size: T.nilable(Integer),
|
36
34
|
result_type: T.nilable(T.any(String, Symbol)),
|
37
35
|
flexible: T.nilable(T::Boolean),
|
38
|
-
flexible_default: T.nilable(String),
|
39
36
|
ignore_null_bytes: T.nilable(T::Boolean),
|
40
37
|
trim: T.nilable(T.any(String, Symbol)),
|
41
38
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
|
@@ -50,9 +47,9 @@ module OSV
|
|
50
47
|
buffer_size: nil,
|
51
48
|
result_type: nil,
|
52
49
|
flexible: nil,
|
53
|
-
flexible_default: nil,
|
54
50
|
ignore_null_bytes: nil,
|
55
51
|
trim: nil,
|
52
|
+
lossy: nil,
|
56
53
|
&blk
|
57
54
|
)
|
58
55
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: osv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.21
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|