osv 0.3.16 → 0.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -26
- data/ext/osv/src/csv/builder.rs +92 -85
- data/ext/osv/src/csv/header_cache.rs +28 -2
- data/ext/osv/src/csv/mod.rs +1 -2
- data/ext/osv/src/csv/parser.rs +16 -80
- data/ext/osv/src/csv/record.rs +4 -4
- data/ext/osv/src/csv/record_reader.rs +51 -117
- data/ext/osv/src/csv/ruby_integration.rs +10 -21
- data/ext/osv/src/csv/ruby_reader.rs +8 -1
- data/ext/osv/src/reader.rs +64 -46
- data/ext/osv/src/utils.rs +15 -19
- data/lib/osv/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: da944a5af1cc88630fe0952e6e710d2acb8ac420ae8708a107064f5ecf444dec
+  data.tar.gz: bd6de3860ff2f47eb03b9019d307d647fa8c2e8f366543fbe95604f284871b62
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a130687fb25aaae3734f2e69c2258ccf893c584cd0c2893b751282b393ee4d52b2317a338f1ef68a864222e4947614ffdca7e6b98d8d37dc934dfede61f7bc1
+  data.tar.gz: 332a5dc1c6ce6df721b22f9e66b54d48426da3a0148917f9ec13036edd63e1fb70a950a2971964d289e076536af47d090c89fd95961d8ca4b51f1f1b8a221a98
data/README.md
CHANGED
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
 FastCSV - Gzipped 1.000 i/100ms
 CSV - Gzipped 1.000 i/100ms
 Calculating -------------------------------------
-CSV - StringIO 0.
-FastCSV - StringIO 0.
-OSV - StringIO 0.
-CSV - Hash output 0.
-OSV - Hash output 0.
-CSV - Array output 0.
-OSV - Array output 0.
+CSV - StringIO 0.083 (± 0.0%) i/s (12.06 s/i) - 3.000 in 36.304469s
+FastCSV - StringIO 0.335 (± 0.0%) i/s (2.98 s/i) - 10.000 in 31.019521s
+OSV - StringIO 0.705 (± 0.0%) i/s (1.42 s/i) - 21.000 in 30.629511s
+CSV - Hash output 0.060 (± 0.0%) i/s (16.74 s/i) - 2.000 in 33.475977s
+OSV - Hash output 0.434 (± 0.0%) i/s (2.30 s/i) - 13.000 in 30.071679s
+CSV - Array output 0.063 (± 0.0%) i/s (15.88 s/i) - 2.000 in 32.229906s
+OSV - Array output 0.406 (± 0.0%) i/s (2.47 s/i) - 12.000 in 31.072600s
 FastCSV - Array output
-0.
+0.321 (± 0.0%) i/s (3.11 s/i) - 10.000 in 31.458966s
 OSV - Direct Open Array output
-0.
-OSV - Gzipped 0.
-OSV - Gzipped Direct 0.
-FastCSV - Gzipped 0.
-CSV - Gzipped 0.
+0.686 (± 0.0%) i/s (1.46 s/i) - 21.000 in 30.639715s
+OSV - Gzipped 0.524 (± 0.0%) i/s (1.91 s/i) - 16.000 in 30.695259s
+OSV - Gzipped Direct 0.519 (± 0.0%) i/s (1.93 s/i) - 16.000 in 30.830005s
+FastCSV - Gzipped 0.313 (± 0.0%) i/s (3.20 s/i) - 10.000 in 32.031002s
+CSV - Gzipped 0.057 (± 0.0%) i/s (17.55 s/i) - 2.000 in 35.107808s
 
 Comparison:
-OSV -
-
-OSV -
-
-
-
-FastCSV -
-
-FastCSV - Gzipped:
-CSV - StringIO:
-CSV - Array output:
-CSV - Hash output:
-CSV - Gzipped:
+OSV - StringIO:                 0.7 i/s
+OSV - Direct Open Array output: 0.7 i/s - 1.03x slower
+OSV - Gzipped:                  0.5 i/s - 1.34x slower
+OSV - Gzipped Direct:           0.5 i/s - 1.36x slower
+OSV - Hash output:              0.4 i/s - 1.62x slower
+OSV - Array output:             0.4 i/s - 1.74x slower
+FastCSV - StringIO:             0.3 i/s - 2.10x slower
+FastCSV - Array output:         0.3 i/s - 2.20x slower
+FastCSV - Gzipped:              0.3 i/s - 2.26x slower
+CSV - StringIO:                 0.1 i/s - 8.50x slower
+CSV - Array output:             0.1 i/s - 11.20x slower
+CSV - Hash output:              0.1 i/s - 11.80x slower
+CSV - Gzipped:                  0.1 i/s - 12.37x slower
 ```
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -6,8 +6,10 @@ use super::{
     ForgottenFileHandle,
 };
 use flate2::read::GzDecoder;
-use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
 use std::{
+    borrow::Cow,
+    fmt::Debug,
     fs::File,
     io::{self, BufReader, Read},
     marker::PhantomData,
@@ -17,18 +19,21 @@ use std::{
 
 use thiserror::Error;
 
-
-
+/// Errors that can occur when building a RecordReader
 #[derive(Error, Debug)]
 pub enum ReaderError {
     #[error("Failed to get file descriptor: {0}")]
     FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
+    #[error("Invalid file descriptor: {0}")]
+    InvalidFileDescriptor(i32),
     #[error("Failed to open file: {0}")]
     FileOpen(#[from] io::Error),
     #[error("Failed to intern headers: {0}")]
     HeaderIntern(#[from] CacheError),
+    #[error("Invalid flexible default value: {0}")]
+    InvalidFlexibleDefault(String),
+    #[error("Invalid null string value: {0}")]
+    InvalidNullString(String),
     #[error("Ruby error: {0}")]
     Ruby(String),
 }
@@ -48,63 +53,27 @@ impl From<ReaderError> for MagnusError {
     }
 }
 
-
-
+/// Builder for configuring and creating a RecordReader instance.
+///
+/// This struct provides a fluent interface for setting up CSV parsing options
+/// and creating a RecordReader with the specified configuration.
+pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
+    ruby: Ruby,
     to_read: Value,
     has_headers: bool,
     delimiter: u8,
     quote_char: u8,
     null_string: Option<String>,
-    buffer: usize,
     flexible: bool,
-    flexible_default: Option
+    flexible_default: Option<String>,
     trim: csv::Trim,
     _phantom: PhantomData<T>,
+    _phantom_a: PhantomData<&'a ()>,
 }
 
-impl<T: RecordParser<'
-
-
-        readable: Box<dyn Read + Send + 'static>,
-    ) -> Result<RecordReader<'static, T>, ReaderError> {
-        let flexible = self.flexible || self.flexible_default.is_some();
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(self.has_headers)
-            .delimiter(self.delimiter)
-            .quote(self.quote_char)
-            .flexible(flexible)
-            .trim(self.trim)
-            .from_reader(readable);
-
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
-        let static_headers = StringCache::intern_many(&headers)?;
-
-        Ok(RecordReader::new_multi_threaded(
-            reader,
-            static_headers,
-            self.buffer,
-            self.null_string,
-            self.flexible_default,
-        ))
-    }
-
-    pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
-        if self.to_read.is_kind_of(self.ruby.class_io()) {
-            let readable = self.handle_file_descriptor()?;
-            self.build_multi_threaded(readable)
-        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
-            let readable = self.handle_file_path()?;
-            self.build_multi_threaded(readable)
-        } else {
-            let readable = build_ruby_reader(self.ruby, self.to_read)?;
-            let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
-            self.build_single_threaded(buffered_reader)
-        }
-    }
-}
-
-impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
-    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
+    /// Creates a new builder instance with default settings.
+    pub fn new(ruby: Ruby, to_read: Value) -> Self {
         Self {
            ruby,
            to_read,
@@ -112,92 +81,107 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
            delimiter: b',',
            quote_char: b'"',
            null_string: None,
-           buffer: BUFFER_CHANNEL_SIZE,
            flexible: false,
            flexible_default: None,
            trim: csv::Trim::None,
            _phantom: PhantomData,
+           _phantom_a: PhantomData,
         }
     }
 
+    /// Sets whether the CSV file has headers.
+    #[must_use]
     pub fn has_headers(mut self, has_headers: bool) -> Self {
         self.has_headers = has_headers;
         self
     }
 
+    /// Sets the delimiter character for the CSV.
+    #[must_use]
     pub fn delimiter(mut self, delimiter: u8) -> Self {
         self.delimiter = delimiter;
         self
     }
 
+    /// Sets the quote character for the CSV.
+    #[must_use]
     pub fn quote_char(mut self, quote_char: u8) -> Self {
         self.quote_char = quote_char;
         self
     }
 
+    /// Sets the string that should be interpreted as null.
+    #[must_use]
     pub fn null_string(mut self, null_string: Option<String>) -> Self {
         self.null_string = null_string;
         self
     }
 
-
-
-        self
-    }
-
+    /// Sets whether the reader should be flexible with field counts.
+    #[must_use]
     pub fn flexible(mut self, flexible: bool) -> Self {
         self.flexible = flexible;
         self
     }
 
-
+    /// Sets the default value for missing fields when in flexible mode.
+    #[must_use]
+    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
         self.flexible_default = flexible_default;
         self
     }
 
+    /// Sets the trimming mode for fields.
+    #[must_use]
     pub fn trim(mut self, trim: csv::Trim) -> Self {
         self.trim = trim;
         self
     }
 
-
+    /// Handles reading from a file descriptor.
+    fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let raw_value = self.to_read.as_raw();
         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
+            .map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
 
         if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor);
+            return Err(ReaderError::InvalidFileDescriptor(fd));
         }
 
         let file = unsafe { File::from_raw_fd(fd) };
         let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
-        Ok(Box::new(
-            READ_BUFFER_SIZE,
-            forgotten,
-        )))
+        Ok(Box::new(forgotten))
     }
 
-
+    /// Handles reading from a file path.
+    fn handle_file_path(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let path = self.to_read.to_r_string()?.to_string()?;
         let file = File::open(&path)?;
 
-
-
-
-
-            )
+        if path.ends_with(".gz") {
+            // For gzipped files, we need to decompress them into memory first
+            // since GzDecoder doesn't support seeking
+            let mut decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file));
+            let mut contents = Vec::new();
+            decoder.read_to_end(&mut contents)?;
+            Ok(Box::new(std::io::Cursor::new(contents)))
         } else {
-            Box::new(
-        }
+            Ok(Box::new(file))
+        }
     }
 
-
-
-            readable
-
+    /// Builds the RecordReader with the configured options.
+    pub fn build(self) -> Result<RecordReader<'a, T>, ReaderError> {
+        let readable = if self.to_read.is_kind_of(self.ruby.class_io()) {
+            self.handle_file_descriptor()?
+        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
+            self.handle_file_path()?
+        } else {
+            build_ruby_reader(&self.ruby, self.to_read)?
+        };
+
         let flexible = self.flexible || self.flexible_default.is_some();
+        let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
 
         let mut reader = csv::ReaderBuilder::new()
             .has_headers(self.has_headers)
@@ -205,16 +189,39 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             .quote(self.quote_char)
             .flexible(flexible)
             .trim(self.trim)
-            .from_reader(
+            .from_reader(reader);
 
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
         let static_headers = StringCache::intern_many(&headers)?;
 
-
+        // We intern both of these to get static string references we can reuse throughout the parser.
+        let flexible_default = self
+            .flexible_default
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        let null_string = self
+            .null_string
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        Ok(RecordReader::new(
             reader,
             static_headers,
-
-
+            null_string,
+            flexible_default,
         ))
     }
 }
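The builder diff above collapses the old threaded/non-threaded build paths into a single `build()` and marks every fluent setter `#[must_use]`. As a rough, self-contained illustration of that consuming-setter style — using a made-up `CsvOptions` type rather than the gem's actual `RecordReaderBuilder` — the pattern looks like this:

```rust
// Minimal sketch of a consuming fluent builder; names are illustrative only.
#[derive(Debug)]
struct CsvOptions {
    has_headers: bool,
    delimiter: u8,
    quote_char: u8,
    flexible_default: Option<String>,
}

impl CsvOptions {
    fn new() -> Self {
        Self { has_headers: true, delimiter: b',', quote_char: b'"', flexible_default: None }
    }

    // Each setter takes `self` by value and returns it, so calls chain.
    // `#[must_use]` (as added in the diff) makes the compiler warn if a
    // chained result is accidentally discarded.
    #[must_use]
    fn has_headers(mut self, v: bool) -> Self { self.has_headers = v; self }
    #[must_use]
    fn delimiter(mut self, v: u8) -> Self { self.delimiter = v; self }
    #[must_use]
    fn quote_char(mut self, v: u8) -> Self { self.quote_char = v; self }
    #[must_use]
    fn flexible_default(mut self, v: Option<String>) -> Self { self.flexible_default = v; self }
}

fn main() {
    let opts = CsvOptions::new()
        .has_headers(false)
        .delimiter(b'\t')
        .flexible_default(Some(String::new()));
    println!("{opts:?}");
}
```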
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -1,4 +1,3 @@
-use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 /// This module exists to avoid cloning header keys in returned HashMaps.
 /// Since the underlying RString creation already involves cloning,
 /// this caching layer aims to reduce redundant allocations.
@@ -7,8 +6,14 @@ use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 /// so this optimization could be removed if any issues arise.
 use std::{
     collections::HashMap,
-    sync::{
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        LazyLock, Mutex, OnceLock,
+    },
 };
+
+use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
+
 use thiserror::Error;
 
 #[derive(Debug, Error)]
@@ -132,3 +137,24 @@ impl StringCache {
         Ok(())
     }
 }
+
+pub struct HeaderCacheCleanupIter<I> {
+    pub inner: I,
+    pub headers: OnceLock<Vec<StringCacheKey>>,
+}
+
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}
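The `HeaderCacheCleanupIter` added above ties cache cleanup to `Drop`, so interned header strings are released even when iteration stops early. A self-contained sketch of that pattern, with a plain `Vec<String>` and a stub `release` function standing in for the real `StringCache`:

```rust
// Illustrative only: `CleanupIter` and `release` are stand-ins, not the gem's types.
struct CleanupIter<I> {
    inner: I,
    headers: Vec<String>,
}

impl<I: Iterator> Iterator for CleanupIter<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        // Pure delegation; the wrapper adds no per-item cost.
        self.inner.next()
    }
}

impl<I> Drop for CleanupIter<I> {
    fn drop(&mut self) {
        // Runs whether the iterator was exhausted or abandoned early,
        // which is the point of tying cleanup to Drop.
        release(&self.headers);
    }
}

fn release(headers: &[String]) {
    // Placeholder for decrementing interned-string reference counts.
    let _ = headers;
}

fn main() {
    let it = CleanupIter { inner: 0..3, headers: vec!["a".into(), "b".into()] };
    // Stop early: Drop still runs and releases the headers.
    for n in it.take(2) {
        println!("{n}");
    }
}
```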
data/ext/osv/src/csv/mod.rs
CHANGED
@@ -7,8 +7,7 @@ mod ruby_integration;
 mod ruby_reader;
 
 pub use builder::RecordReaderBuilder;
-pub(crate) use builder::BUFFER_CHANNEL_SIZE;
 pub use header_cache::StringCacheKey;
-pub use record::
+pub use record::CowStr;
 pub use record::CsvRecord;
 pub use ruby_integration::*;
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -3,21 +3,21 @@ use std::collections::HashMap;
 use std::hash::BuildHasher;
 
 use super::header_cache::StringCacheKey;
-use super::
+use super::CowStr;
 
 pub trait RecordParser<'a> {
-    type Output
+    type Output;
 
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output;
 }
 
-impl<'a, S: BuildHasher + Default
-    for HashMap<StringCacheKey, Option<
+impl<'a, S: BuildHasher + Default> RecordParser<'a>
+    for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
 {
     type Output = Self;
 
@@ -25,23 +25,23 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output {
         let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
 
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(
+        let shared_default = flexible_default.map(CowStr);
         headers.iter().enumerate().for_each(|(i, ref header)| {
             let value = record.get(i).map_or_else(
                 || shared_default.clone(),
                 |field| {
-                    if null_string == Some(field) {
+                    if null_string.as_deref() == Some(field) {
                         None
                     } else if field.is_empty() {
-                        Some(
+                        Some(CowStr(shared_empty.clone()))
                     } else {
-                        Some(
+                        Some(CowStr(Cow::Owned(field.to_string())))
                     }
                 },
             );
@@ -51,29 +51,29 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
     }
 }
 
-impl<'a> RecordParser<'a> for Vec<Option<
+impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
     type Output = Self;
 
     #[inline]
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output {
         let target_len = headers.len();
         let mut vec = Vec::with_capacity(target_len);
 
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(
+        let shared_default = flexible_default.map(CowStr);
 
         for field in record.iter() {
-            let value = if Some(field) == null_string {
+            let value = if Some(field) == null_string.as_deref() {
                 None
             } else if field.is_empty() {
-                Some(
+                Some(CowStr(shared_empty.clone()))
             } else {
-                Some(
+                Some(CowStr(Cow::Owned(field.to_string())))
             };
             vec.push(value);
         }
@@ -86,67 +86,3 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
         vec
     }
 }
-
-// impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
-//     for HashMap<&'static str, Option<String>, S>
-// {
-//     type Output = Self;
-
-//     #[inline]
-//     fn parse(
-//         headers: &[&'static str],
-//         record: &csv::StringRecord,
-//         null_string: Option<&str>,
-//         flexible_default: Option<Cow<'a, str>>,
-//     ) -> Self::Output {
-//         let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
-//         headers.iter().enumerate().for_each(|(i, &header)| {
-//             let value = record.get(i).map_or_else(
-//                 || flexible_default.clone(),
-//                 |field| {
-//                     if null_string == Some(field) {
-//                         None
-//                     } else if field.is_empty() {
-//                         Some(String::new())
-//                     } else {
-//                         Some(field.into())
-//                     }
-//                 },
-//             );
-//             map.insert(header, value);
-//         });
-//         map
-//     }
-// }
-
-// impl<'a> RecordParser<'a> for Vec<Option<String>> {
-//     type Output = Self;
-
-//     #[inline]
-//     fn parse(
-//         headers: &[&'static str],
-//         record: &csv::StringRecord,
-//         null_string: Option<&str>,
-//         flexible_default: Option<Cow<'a, str>>,
-//     ) -> Self::Output {
-//         let target_len = headers.len();
-//         let mut vec = Vec::with_capacity(target_len);
-//         for field in record.iter() {
-//             let value = if Some(field) == null_string {
-//                 None
-//             } else if field.is_empty() {
-//                 Some(String::new())
-//             } else {
-//                 Some(field.into())
-//             };
-//             vec.push(value);
-//         }
-
-//         if vec.len() < target_len {
-//             if let Some(default) = flexible_default {
-//                 vec.resize_with(target_len, || Some(default.to_string()));
-//             }
-//         }
-//         vec
-//     }
-// }
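Both parser impls above follow the same field-handling scheme: the configured null marker becomes `None`, empty fields clone one borrowed `Cow<str>` (no allocation), and only non-empty fields pay for an owned `String`. A minimal self-contained sketch of that logic, with illustrative names and without the `CowStr` newtype:

```rust
// Sketch only; `parse_fields` is not part of the gem's API.
use std::borrow::Cow;

fn parse_fields<'a>(fields: &[&str], null_string: Option<&str>) -> Vec<Option<Cow<'a, str>>> {
    let shared_empty: Cow<'a, str> = Cow::Borrowed("");
    fields
        .iter()
        .map(|&field| {
            if null_string == Some(field) {
                None // the configured null marker becomes nil on the Ruby side
            } else if field.is_empty() {
                Some(shared_empty.clone()) // cloning a Borrowed Cow copies no string data
            } else {
                Some(Cow::Owned(field.to_string()))
            }
        })
        .collect()
}

fn main() {
    let row = ["alice", "", "NULL", "42"];
    let parsed = parse_fields(&row, Some("NULL"));
    assert_eq!(parsed[1].as_deref(), Some(""));
    assert_eq!(parsed[2], None);
    println!("{parsed:?}");
}
```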
data/ext/osv/src/csv/record.rs
CHANGED
@@ -6,8 +6,8 @@ use super::StringCacheKey;
 
 #[derive(Debug)]
 pub enum CsvRecord<'a, S: BuildHasher + Default> {
-    Vec(Vec<Option<
-    Map(HashMap<StringCacheKey, Option<
+    Vec(Vec<Option<CowStr<'a>>>),
+    Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
 }
 
 impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -46,9 +46,9 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
 }
 
 #[derive(Debug, Clone)]
-pub struct
+pub struct CowStr<'a>(pub Cow<'a, str>);
 
-impl IntoValue for
+impl IntoValue for CowStr<'_> {
     fn into_value_with(self, handle: &Ruby) -> Value {
         self.0.into_value_with(handle)
     }
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -2,32 +2,34 @@ use super::header_cache::StringCacheKey;
 use super::parser::RecordParser;
 use super::{header_cache::StringCache, ruby_reader::SeekableRead};
 use magnus::{Error, Ruby};
-use std::
-use std::{
+use std::borrow::Cow;
+use std::io::{BufReader, Read};
 
+/// Size of the internal buffer used for reading CSV records
 pub(crate) const READ_BUFFER_SIZE: usize = 16384;
 
+/// A reader that processes CSV records using a specified parser.
+///
+/// This struct implements Iterator to provide a streaming interface for CSV records.
 pub struct RecordReader<'a, T: RecordParser<'a>> {
-
-
-
-
-
-
-        reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<StringCacheKey>,
-        null_string: Option<String>,
-        flexible_default: Option<Cow<'a, str>>,
-        string_record: csv::StringRecord,
-    },
-    MultiThreaded {
-        headers: Vec<StringCacheKey>,
-        receiver: kanal::Receiver<T::Output>,
-        handle: Option<thread::JoinHandle<()>>,
-    },
+    reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
+    headers: Vec<StringCacheKey>,
+    null_string: Option<Cow<'a, str>>,
+    flexible_default: Option<Cow<'a, str>>,
+    string_record: csv::StringRecord,
+    parser: std::marker::PhantomData<T>,
 }
 
 impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
+    /// Reads and processes headers from a CSV reader.
+    ///
+    /// # Arguments
+    /// * `ruby` - Ruby VM context for error handling
+    /// * `reader` - CSV reader instance
+    /// * `has_headers` - Whether the CSV file contains headers
+    ///
+    /// # Returns
+    /// A vector of header strings or generated column names if `has_headers` is false
     #[inline]
     pub(crate) fn get_headers(
         ruby: &Ruby,
@@ -41,67 +43,41 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
             )
         })?;
 
-
-
-            headers.extend(first_row.iter().map(String::from));
+        Ok(if has_headers {
+            first_row.iter().map(String::from).collect()
         } else {
-
-        }
-        Ok(headers)
+            (0..first_row.len()).map(|i| format!("c{i}")).collect()
+        })
     }
 
-
+    /// Creates a new RecordReader instance.
+    pub(crate) fn new(
         reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
         headers: Vec<StringCacheKey>,
-        null_string: Option<
-        flexible_default: Option
+        null_string: Option<Cow<'a, str>>,
+        flexible_default: Option<Cow<'a, str>>,
     ) -> Self {
         let headers_len = headers.len();
         Self {
-
-
-
-
-
-
-            },
+            reader,
+            headers,
+            null_string,
+            flexible_default,
+            string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
+            parser: std::marker::PhantomData,
         }
     }
-}
-
-impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
-    pub(crate) fn new_multi_threaded(
-        mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<StringCacheKey>,
-        buffer_size: usize,
-        null_string: Option<String>,
-        flexible_default: Option<&'static str>,
-    ) -> Self {
-        let (sender, receiver) = kanal::bounded(buffer_size);
-        let headers_for_thread = headers.clone();
 
-
-
-
-
-
-
-
-
-
-
-                if sender.send(row).is_err() {
-                    break;
-                }
-            }
-        });
-
-        Self {
-            inner: ReaderImpl::MultiThreaded {
-                headers,
-                receiver,
-                handle: Some(handle),
-            },
+    /// Attempts to read the next record, returning any errors encountered.
+    fn try_next(&mut self) -> csv::Result<Option<T::Output>> {
+        match self.reader.read_record(&mut self.string_record)? {
+            true => Ok(Some(T::parse(
+                &self.headers,
+                &self.string_record,
+                self.null_string.clone(),
+                self.flexible_default.clone(),
+            ))),
+            false => Ok(None),
         }
     }
 }
@@ -111,63 +87,21 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
 
     #[inline]
     fn next(&mut self) -> Option<Self::Item> {
-
-
-
-            } => match receiver.recv() {
-                Ok(record) => Some(record),
-                Err(_) => {
-                    if let Some(handle) = handle.take() {
-                        let _ = handle.join();
-                    }
-                    None
-                }
-            },
-            ReaderImpl::SingleThreaded {
-                reader,
-                headers,
-                null_string,
-                flexible_default,
-                ref mut string_record,
-            } => match reader.read_record(string_record) {
-                Ok(true) => Some(T::parse(
-                    headers,
-                    string_record,
-                    null_string.as_deref(),
-                    flexible_default.clone(),
-                )),
-                Ok(false) => None,
-                Err(_e) => None,
-            },
-        }
+        // Note: We intentionally swallow errors here to maintain Iterator contract.
+        // Errors can be handled by using try_next() directly if needed.
+        self.try_next().ok().flatten()
     }
 
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-
-        (0, None)
+        (0, None) // Cannot determine size without reading entire file
    }
 }
 
 impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
     #[inline]
     fn drop(&mut self) {
-
-
-                receiver,
-                handle,
-                headers,
-                ..
-            } => {
-                receiver.close();
-                if let Some(handle) = handle.take() {
-                    let _ = handle.join();
-                }
-                let _ = StringCache::clear(&headers);
-            }
-            ReaderImpl::SingleThreaded { headers, .. } => {
-                let _ = StringCache::clear(&headers);
-            }
-        }
+        // Intentionally ignore errors during cleanup as there's no meaningful way to handle them
+        let _ = StringCache::clear(&self.headers);
     }
 }
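The reader above splits record fetching into a fallible `try_next()` returning `csv::Result<Option<_>>` and an `Iterator::next` that flattens errors to `None`. A simplified, self-contained sketch of that split, with a toy `LineReader` standing in for the csv reader:

```rust
// Illustrative only; `LineReader` is not the gem's type.
struct LineReader {
    lines: Vec<Result<String, String>>, // pretend source; Err models a read/parse error
    pos: usize,
}

impl LineReader {
    /// Callers who care about errors use this path directly.
    fn try_next(&mut self) -> Result<Option<String>, String> {
        match self.lines.get(self.pos) {
            None => Ok(None),
            Some(item) => {
                self.pos += 1;
                item.clone().map(Some)
            }
        }
    }
}

impl Iterator for LineReader {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        // Errors are swallowed here, mirroring the Iterator impl in the diff.
        self.try_next().ok().flatten()
    }
}

fn main() {
    let reader = LineReader {
        lines: vec![Ok("a,b".into()), Err("bad utf-8".into()), Ok("c,d".into())],
        pos: 0,
    };
    // Iteration stops at the error because next() maps Err to None.
    let rows: Vec<String> = reader.collect();
    assert_eq!(rows, vec!["a,b".to_string()]);
}
```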
data/ext/osv/src/csv/ruby_integration.rs
CHANGED
@@ -1,30 +1,19 @@
-use std::{
+use std::{
+    fs::File,
+    io::{self, Read, Seek, SeekFrom},
+    mem::ManuallyDrop,
+};
 
 pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
 
-impl
+impl Read for ForgottenFileHandle {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         self.0.read(buf)
     }
+}
 
-
-
-
-
-    // fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
-    //     self.0.read_buf(cursor)
-    // }
-
-    // #[inline]
-    // fn is_read_vectored(&self) -> bool {
-    //     self.0.is_read_vectored()
-    // }
-
-    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
-        self.0.read_to_end(buf)
-    }
-
-    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
-        self.0.read_to_string(buf)
+impl Seek for ForgottenFileHandle {
+    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
+        self.0.seek(pos)
     }
 }
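`ForgottenFileHandle` wraps the `File` built from Ruby's raw descriptor in `ManuallyDrop`, so dropping the wrapper never closes an fd that the Ruby `IO` object still owns. A runnable sketch of the idea, assuming a Unix-like `/dev/null` and duplicating an fd the example opens itself so it stays safe to run (the extension instead builds the `File` from Ruby's descriptor):

```rust
// Illustrative sketch; `BorrowedFile` is a stand-in for ForgottenFileHandle.
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom};
use std::mem::ManuallyDrop;

struct BorrowedFile(ManuallyDrop<File>);

impl Read for BorrowedFile {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.0.read(buf)
    }
}

impl Seek for BorrowedFile {
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        self.0.seek(pos)
    }
}

fn main() -> io::Result<()> {
    let owner = File::open("/dev/null")?; // stand-in for the descriptor owned by Ruby
    // The duplicated descriptor is intentionally "forgotten" here, mirroring the wrapper.
    let borrowed = BorrowedFile(ManuallyDrop::new(owner.try_clone()?));
    drop(borrowed); // the wrapper is gone, but ManuallyDrop prevented any close(2)
    // `owner` is still usable and is closed normally when it goes out of scope.
    let _ = owner.metadata()?;
    Ok(())
}
```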
data/ext/osv/src/csv/ruby_reader.rs
CHANGED
@@ -2,9 +2,12 @@ use magnus::{
     value::{Opaque, ReprValue},
     RClass, RString, Ruby, Value,
 };
-use std::
+use std::fs::File;
+use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
 use std::sync::OnceLock;
 
+use super::ForgottenFileHandle;
+
 static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
 
 /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
@@ -17,6 +20,10 @@ pub struct RubyReader<T> {
 pub trait SeekableRead: std::io::Read + Seek {}
 impl SeekableRead for RubyReader<Value> {}
 impl SeekableRead for RubyReader<RString> {}
+impl SeekableRead for File {}
+impl<T: Read + Seek> SeekableRead for BufReader<T> {}
+impl SeekableRead for std::io::Cursor<Vec<u8>> {}
+impl SeekableRead for ForgottenFileHandle {}
 
 pub fn build_ruby_reader(
     ruby: &Ruby,
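`SeekableRead` above is a marker trait (`Read + Seek` supertraits, no methods of its own) that lets files, in-memory cursors, buffered readers, and the forgotten file handle all sit behind one `Box<dyn SeekableRead>`. A self-contained sketch of the same pattern, using an illustrative `SeekableSource` name:

```rust
// Sketch only; the trait and function names here are not the gem's API.
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek};

trait SeekableSource: Read + Seek {}

// Each concrete source type opts in explicitly, as in the diff.
impl SeekableSource for File {}
impl SeekableSource for Cursor<Vec<u8>> {}
impl<T: Read + Seek> SeekableSource for BufReader<T> {}

fn open_in_memory(bytes: Vec<u8>) -> Box<dyn SeekableSource> {
    Box::new(Cursor::new(bytes))
}

fn main() -> std::io::Result<()> {
    let mut src = open_in_memory(b"a,b\n1,2\n".to_vec());
    let mut text = String::new();
    src.read_to_string(&mut text)?;
    // Rewinding works because the trait object is also Seek.
    src.rewind()?;
    assert_eq!(text, "a,b\n1,2\n");
    Ok(())
}
```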
data/ext/osv/src/reader.rs
CHANGED
@@ -1,4 +1,4 @@
-use crate::csv::{
+use crate::csv::{CowStr, CsvRecord, RecordReaderBuilder, StringCacheKey};
 use crate::utils::*;
 use ahash::RandomState;
 use csv::Trim;
@@ -6,12 +6,49 @@ use magnus::value::ReprValue;
 use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
 use std::collections::HashMap;
 
+/// Valid result types for CSV parsing
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum ResultType {
+    Hash,
+    Array,
+}
+
+impl ResultType {
+    fn from_str(s: &str) -> Option<Self> {
+        match s {
+            "hash" => Some(Self::Hash),
+            "array" => Some(Self::Array),
+            _ => None,
+        }
+    }
+}
+
+/// Arguments for creating an enumerator
+#[derive(Debug)]
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: Option<String>,
+    result_type: String,
+    flexible: bool,
+    flexible_default: Option<String>,
+    trim: Option<String>,
+}
+
+/// Parses a CSV file with the given configuration.
+///
+/// # Safety
+/// This function uses unsafe code to get the Ruby runtime and leak memory for static references.
+/// This is necessary for Ruby integration but should be used with caution.
 pub fn parse_csv(
     rb_self: Value,
     args: &[Value],
 ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
-
-    let ruby
+    // SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized
+    let ruby = unsafe { Ruby::get_unchecked() };
 
     let ReadCsvArgs {
         to_read,
@@ -19,16 +56,11 @@ pub fn parse_csv(
         delimiter,
         quote_char,
         null_string,
-        buffer_size,
         result_type,
         flexible,
         flexible_default,
         trim,
-    } = parse_read_csv_args(ruby, args)?;
-
-    let flexible_default: &'static Option<String> = Box::leak(Box::new(flexible_default));
-    let leaked_flexible_default: &'static Option<&str> =
-        Box::leak(Box::new(flexible_default.as_deref()));
+    } = parse_read_csv_args(&ruby, args)?;
 
     if !ruby.block_given() {
         return create_enumerator(EnumeratorArgs {
@@ -38,10 +70,9 @@ pub fn parse_csv(
             delimiter,
             quote_char,
             null_string,
-
-            result_type,
+            result_type: result_type,
             flexible,
-            flexible_default:
+            flexible_default: flexible_default,
             trim: match trim {
                 Trim::All => Some("all".to_string()),
                 Trim::Headers => Some("headers".to_string()),
@@ -51,60 +82,47 @@ pub fn parse_csv(
         });
     }
 
-    let
-
+    let result_type = ResultType::from_str(&result_type).ok_or_else(|| {
+        Error::new(
+            ruby.exception_runtime_error(),
+            "Invalid result type, expected 'hash' or 'array'",
+        )
+    })?;
+
+    let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type {
+        ResultType::Hash => {
             let builder = RecordReaderBuilder::<
-                HashMap<StringCacheKey, Option<
+                HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
             >::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)
-            .flexible_default(flexible_default
+            .flexible_default(flexible_default)
             .trim(trim)
             .delimiter(delimiter)
             .quote_char(quote_char)
-            .null_string(null_string)
-            .buffer(buffer_size);
+            .null_string(null_string);
 
-            Box::new(builder.
+            Box::new(builder.build()?.map(CsvRecord::Map))
         }
-
-            RecordReaderBuilder::<Vec<Option<
+        ResultType::Array => {
+            let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)
-            .flexible_default(flexible_default
+            .flexible_default(flexible_default)
             .trim(trim)
             .delimiter(delimiter)
             .quote_char(quote_char)
             .null_string(null_string)
-            .
-
-
-            ),
-        _ => {
-            return Err(Error::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
+            .build()?;
+
+            Box::new(builder.map(CsvRecord::Vec))
         }
     };
 
     Ok(Yield::Iter(iter))
 }
 
-
-    rb_self: Value,
-    to_read: Value,
-    has_headers: bool,
-    delimiter: u8,
-    quote_char: u8,
-    null_string: Option<String>,
-    buffer_size: usize,
-    result_type: String,
-    flexible: bool,
-    flexible_default: Option<&'static str>,
-    trim: Option<String>,
-}
-
+/// Creates an enumerator for lazy CSV parsing
 fn create_enumerator(
     args: EnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
@@ -119,11 +137,11 @@ fn create_enumerator(
         String::from_utf8(vec![args.quote_char]).unwrap(),
     )?;
     kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
-    kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
     kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
     kwargs.aset(Symbol::new("flexible"), args.flexible)?;
     kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
     kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
+
     let enumerator = args
         .rb_self
         .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
data/ext/osv/src/utils.rs
CHANGED
@@ -4,8 +4,6 @@ use magnus::{
     Error, RString, Ruby, Symbol, Value,
 };
 
-use crate::csv::BUFFER_CHANNEL_SIZE;
-
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
     if value.is_nil() {
         Ok(None)
@@ -34,7 +32,6 @@ pub struct ReadCsvArgs {
     pub delimiter: u8,
     pub quote_char: u8,
     pub null_string: Option<String>,
-    pub buffer_size: usize,
     pub result_type: String,
     pub flexible: bool,
     pub flexible_default: Option<String>,
@@ -50,15 +47,14 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         _,
         (),
         (
-            Option<bool
-            Option<String
-            Option<String>,
+            Option<Option<bool>>,
+            Option<Option<String>>,
             Option<Option<String>>,
-            Option<usize>,
-            Option<Value>,
-            Option<bool>,
             Option<Option<String>>,
-            Option<Value
+            Option<Option<Value>>,
+            Option<Option<bool>>,
+            Option<Option<Option<String>>>,
+            Option<Option<Value>>,
         ),
         (),
     >(
@@ -69,7 +65,6 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
             "col_sep",
             "quote_char",
             "nil_string",
-            "buffer_size",
             "result_type",
             "flexible",
             "flexible_default",
@@ -77,11 +72,12 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         ],
     )?;
 
-    let has_headers = kwargs.optional.0.unwrap_or(true);
+    let has_headers = kwargs.optional.0.flatten().unwrap_or(true);
 
     let delimiter = *kwargs
         .optional
         .1
+        .flatten()
         .unwrap_or_else(|| ",".to_string())
         .as_bytes()
         .first()
@@ -95,6 +91,7 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
     let quote_char = *kwargs
         .optional
         .2
+        .flatten()
         .unwrap_or_else(|| "\"".to_string())
         .as_bytes()
         .first()
@@ -107,11 +104,10 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
 
     let null_string = kwargs.optional.3.unwrap_or_default();
 
-    let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
-
     let result_type = match kwargs
         .optional
-        .
+        .4
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -133,13 +129,14 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         None => String::from("hash"),
     };
 
-    let flexible = kwargs.optional.
+    let flexible = kwargs.optional.5.flatten().unwrap_or_default();
 
-    let flexible_default = kwargs.optional.
+    let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
 
     let trim = match kwargs
         .optional
-        .
+        .7
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -172,7 +169,6 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         delimiter,
         quote_char,
         null_string,
-        buffer_size,
         result_type,
         flexible,
         flexible_default,
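The kwargs above switch from `Option<T>` to `Option<Option<T>>` with a `.flatten()` at each use site: the outer `Option` records whether the keyword was passed at all, the inner one whether it was passed as `nil`, and both fall back to the default. A tiny sketch of that resolution, using an illustrative helper name:

```rust
// Sketch only; `resolve_has_headers` is not part of the gem's API.
fn resolve_has_headers(kwarg: Option<Option<bool>>) -> bool {
    // None          => keyword omitted
    // Some(None)    => keyword passed explicitly as nil
    // Some(Some(v)) => keyword passed with a value
    kwarg.flatten().unwrap_or(true)
}

fn main() {
    assert!(resolve_has_headers(None));               // omitted -> default true
    assert!(resolve_has_headers(Some(None)));         // explicit nil -> default true
    assert!(!resolve_has_headers(Some(Some(false)))); // explicit false is respected
    println!("kwarg handling ok");
}
```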
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.16
+  version: 0.3.18
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys