osv 0.3.8 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/README.md +62 -51
- data/Rakefile +7 -1
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +62 -19
- data/ext/osv/src/csv/mod.rs +2 -0
- data/ext/osv/src/csv/parser.rs +16 -15
- data/ext/osv/src/csv/read_impl.rs +3 -1
- data/ext/osv/src/csv/record.rs +4 -4
- data/ext/osv/src/reader.rs +28 -13
- data/ext/osv/src/utils.rs +75 -29
- data/lib/osv/version.rb +1 -1
- data/lib/osv.rbi +7 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aed16dbfb14e6caebceb388104731091a5354394cd174804982dbce8f4b95963
|
4
|
+
data.tar.gz: 232497c8ec55ab15559f126c4bc90c6bcd0dd4296efadcbf23d07d6a24c7969d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 36e1bab13ede785e2f41f0a56b37a2ff7448deff182b614c71e15ccd16799abe94f1f7db63034cd1c04195666c85a12fa0284dde75753b16e05e8767e9c87b18
|
7
|
+
data.tar.gz: '0990fee8e41dd9eb046d6b3733770adb4ef9825165bb0a012b32dac03ddd9d8f2107860d75ef05437aa8d820d42d16c8148642fd3faab326dcab8007731bcf61'
|
data/Cargo.lock
CHANGED
@@ -274,6 +274,7 @@ dependencies = [
|
|
274
274
|
"serde",
|
275
275
|
"serde_magnus",
|
276
276
|
"thiserror",
|
277
|
+
"xxhash-rust",
|
277
278
|
]
|
278
279
|
|
279
280
|
[[package]]
|
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
|
|
526
527
|
version = "0.52.6"
|
527
528
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
528
529
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
530
|
+
|
531
|
+
[[package]]
|
532
|
+
name = "xxhash-rust"
|
533
|
+
version = "0.8.14"
|
534
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
535
|
+
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
data/README.md
CHANGED
@@ -70,9 +70,10 @@ Both methods support the following options:
|
|
70
70
|
- by default, empty strings are interpreted as empty strings
|
71
71
|
- if you want to interpret empty strings as nil, set this to an empty string
|
72
72
|
- `buffer_size`: Integer specifying the read buffer size
|
73
|
-
- `result_type`: String specifying the output format ("hash" or "array")
|
73
|
+
- `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
|
74
74
|
- `flexible`: Boolean specifying if the parser should be flexible (default: false)
|
75
75
|
- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
|
76
|
+
- `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
|
76
77
|
|
77
78
|
### Input Sources
|
78
79
|
|
@@ -112,45 +113,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
|
|
112
113
|
### 10,000 lines
|
113
114
|
|
114
115
|
```
|
115
|
-
Benchmarking with
|
116
|
+
Benchmarking with 100001 lines of data
|
116
117
|
|
117
|
-
ruby 3.3.
|
118
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
118
119
|
Warming up --------------------------------------
|
119
|
-
OSV - Hash output
|
120
|
+
OSV - Hash output 1.000 i/100ms
|
120
121
|
CSV - Hash output 1.000 i/100ms
|
121
|
-
OSV - Array output
|
122
|
-
|
122
|
+
OSV - Array output 1.000 i/100ms
|
123
|
+
OSV - Direct Open Array output
|
124
|
+
12.719M i/100ms
|
125
|
+
CSV - Array output 1.000 i/100ms
|
123
126
|
FastCSV - Array output
|
124
|
-
|
125
|
-
OSV - StringIO
|
127
|
+
1.000 i/100ms
|
128
|
+
OSV - StringIO 1.000 i/100ms
|
126
129
|
CSV - StringIO 1.000 i/100ms
|
127
|
-
FastCSV - StringIO
|
128
|
-
OSV - Gzipped
|
130
|
+
FastCSV - StringIO 1.000 i/100ms
|
131
|
+
OSV - Gzipped 1.000 i/100ms
|
129
132
|
CSV - Gzipped 1.000 i/100ms
|
130
133
|
Calculating -------------------------------------
|
131
|
-
OSV - Hash output
|
132
|
-
CSV - Hash output
|
133
|
-
OSV - Array output
|
134
|
-
|
134
|
+
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
135
|
+
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
136
|
+
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
137
|
+
OSV - Direct Open Array output
|
138
|
+
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
139
|
+
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
135
140
|
FastCSV - Array output
|
136
|
-
|
137
|
-
OSV - StringIO
|
138
|
-
CSV - StringIO
|
139
|
-
FastCSV - StringIO
|
140
|
-
OSV - Gzipped
|
141
|
-
CSV - Gzipped
|
141
|
+
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
142
|
+
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
143
|
+
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
144
|
+
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
145
|
+
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
146
|
+
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
142
147
|
|
143
148
|
Comparison:
|
144
|
-
|
145
|
-
OSV - Array output:
|
146
|
-
FastCSV -
|
147
|
-
|
148
|
-
OSV - Hash output:
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
CSV - Hash output:
|
149
|
+
OSV - Direct Open Array output: 213629268.6 i/s
|
150
|
+
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
151
|
+
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
152
|
+
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
153
|
+
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
154
|
+
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
155
|
+
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
156
|
+
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
157
|
+
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
158
|
+
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
159
|
+
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
154
160
|
```
|
155
161
|
|
156
162
|
### 1,000,000 lines
|
@@ -158,11 +164,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
|
|
158
164
|
```
|
159
165
|
Benchmarking with 1000001 lines of data
|
160
166
|
|
161
|
-
ruby 3.3.
|
167
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
162
168
|
Warming up --------------------------------------
|
163
169
|
OSV - Hash output 1.000 i/100ms
|
164
170
|
CSV - Hash output 1.000 i/100ms
|
165
171
|
OSV - Array output 1.000 i/100ms
|
172
|
+
OSV - Direct Open Array output
|
173
|
+
1.000 i/100ms
|
166
174
|
CSV - Array output 1.000 i/100ms
|
167
175
|
FastCSV - Array output
|
168
176
|
1.000 i/100ms
|
@@ -172,27 +180,30 @@ FastCSV - Array output
|
|
172
180
|
OSV - Gzipped 1.000 i/100ms
|
173
181
|
CSV - Gzipped 1.000 i/100ms
|
174
182
|
Calculating -------------------------------------
|
175
|
-
OSV - Hash output 0.
|
176
|
-
CSV - Hash output 0.
|
177
|
-
OSV - Array output 1.
|
178
|
-
|
183
|
+
OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
|
184
|
+
CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
|
185
|
+
OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
|
186
|
+
OSV - Direct Open Array output
|
187
|
+
1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
|
188
|
+
CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
|
179
189
|
FastCSV - Array output
|
180
|
-
0.
|
181
|
-
OSV - StringIO 0.
|
182
|
-
CSV - StringIO 0.
|
183
|
-
FastCSV - StringIO
|
184
|
-
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) -
|
185
|
-
CSV - Gzipped 0.
|
190
|
+
0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
|
191
|
+
OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
|
192
|
+
CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
|
193
|
+
FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
|
194
|
+
OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
|
195
|
+
CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
|
186
196
|
|
187
197
|
Comparison:
|
188
|
-
|
189
|
-
|
190
|
-
FastCSV -
|
191
|
-
|
192
|
-
|
193
|
-
OSV - Gzipped: 0.4 i/s -
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
+
OSV - Direct Open Array output: 1.6 i/s
|
199
|
+
OSV - Array output: 1.5 i/s - 1.08x slower
|
200
|
+
FastCSV - StringIO: 0.9 i/s - 1.76x slower
|
201
|
+
OSV - StringIO: 0.6 i/s - 2.87x slower
|
202
|
+
OSV - Hash output: 0.5 i/s - 3.30x slower
|
203
|
+
OSV - Gzipped: 0.4 i/s - 3.72x slower
|
204
|
+
FastCSV - Array output: 0.3 i/s - 4.99x slower
|
205
|
+
CSV - Array output: 0.2 i/s - 8.88x slower
|
206
|
+
CSV - StringIO: 0.1 i/s - 11.55x slower
|
207
|
+
CSV - Hash output: 0.1 i/s - 14.24x slower
|
208
|
+
CSV - Gzipped: 0.1 i/s - 15.68x slower
|
198
209
|
```
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Rake::ExtensionTask.new("osv") do |c|
|
|
11
11
|
end
|
12
12
|
|
13
13
|
task :dev do
|
14
|
-
ENV["RB_SYS_CARGO_PROFILE"] = "
|
14
|
+
ENV["RB_SYS_CARGO_PROFILE"] = "release"
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::TestTask.new do |t|
|
@@ -20,3 +20,9 @@ Rake::TestTask.new do |t|
|
|
20
20
|
t.libs << "lib"
|
21
21
|
t.libs << "test"
|
22
22
|
end
|
23
|
+
|
24
|
+
task :release do
|
25
|
+
sh "bundle exec rake test"
|
26
|
+
sh "gem build osv.gemspec"
|
27
|
+
sh "gem push osv-#{OSV::VERSION}.gem"
|
28
|
+
end
|
data/ext/osv/Cargo.toml
CHANGED
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -3,18 +3,21 @@ use super::{
|
|
3
3
|
parser::RecordParser,
|
4
4
|
read_impl::ReadImpl,
|
5
5
|
reader::RecordReader,
|
6
|
+
READ_BUFFER_SIZE,
|
6
7
|
};
|
7
8
|
use flate2::read::GzDecoder;
|
8
9
|
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
|
9
10
|
use std::{
|
10
11
|
fs::File,
|
11
|
-
io::{self, Read},
|
12
|
+
io::{self, BufReader, Read},
|
12
13
|
marker::PhantomData,
|
13
14
|
os::fd::FromRawFd,
|
14
15
|
thread,
|
15
16
|
};
|
16
17
|
use thiserror::Error;
|
17
18
|
|
19
|
+
pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
|
20
|
+
|
18
21
|
#[derive(Error, Debug)]
|
19
22
|
pub enum ReaderError {
|
20
23
|
#[error("Failed to get file descriptor: {0}")]
|
@@ -56,6 +59,7 @@ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
|
|
56
59
|
buffer: usize,
|
57
60
|
flexible: bool,
|
58
61
|
flexible_default: Option<String>,
|
62
|
+
trim: csv::Trim,
|
59
63
|
_phantom: PhantomData<T>,
|
60
64
|
}
|
61
65
|
|
@@ -68,9 +72,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
68
72
|
delimiter: b',',
|
69
73
|
quote_char: b'"',
|
70
74
|
null_string: None,
|
71
|
-
buffer:
|
75
|
+
buffer: BUFFER_CHANNEL_SIZE,
|
72
76
|
flexible: false,
|
73
77
|
flexible_default: None,
|
78
|
+
trim: csv::Trim::None,
|
74
79
|
_phantom: PhantomData,
|
75
80
|
}
|
76
81
|
}
|
@@ -110,6 +115,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
110
115
|
self
|
111
116
|
}
|
112
117
|
|
118
|
+
pub fn trim(mut self, trim: csv::Trim) -> Self {
|
119
|
+
self.trim = trim;
|
120
|
+
self
|
121
|
+
}
|
122
|
+
|
113
123
|
fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
114
124
|
let string: RString = self.to_read.funcall("string", ())?;
|
115
125
|
let content = string.to_string()?;
|
@@ -128,7 +138,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
128
138
|
}
|
129
139
|
|
130
140
|
let file = unsafe { File::from_raw_fd(fd) };
|
131
|
-
Ok(Box::new(file))
|
141
|
+
Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
|
132
142
|
}
|
133
143
|
|
134
144
|
fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
@@ -136,24 +146,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
136
146
|
let file = File::open(&path)?;
|
137
147
|
|
138
148
|
Ok(if path.ends_with(".gz") {
|
139
|
-
Box::new(GzDecoder::new(
|
149
|
+
Box::new(GzDecoder::new(BufReader::with_capacity(
|
150
|
+
READ_BUFFER_SIZE,
|
151
|
+
file,
|
152
|
+
)))
|
140
153
|
} else {
|
141
|
-
Box::new(file)
|
154
|
+
Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
|
142
155
|
})
|
143
156
|
}
|
144
157
|
|
145
|
-
fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
158
|
+
fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
|
146
159
|
let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
|
147
160
|
let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
|
148
161
|
|
149
162
|
if self.to_read.is_kind_of(string_io) {
|
150
|
-
self.handle_string_io()
|
163
|
+
self.handle_string_io().map(|r| (r, false))
|
151
164
|
} else if self.to_read.is_kind_of(gzip_reader_class) {
|
152
165
|
Err(ReaderError::UnsupportedGzipReader)
|
153
166
|
} else if self.to_read.is_kind_of(self.ruby.class_io()) {
|
154
|
-
self.handle_file_descriptor()
|
167
|
+
self.handle_file_descriptor().map(|r| (r, true))
|
155
168
|
} else {
|
156
|
-
self.handle_file_path()
|
169
|
+
self.handle_file_path().map(|r| (r, false))
|
157
170
|
}
|
158
171
|
}
|
159
172
|
|
@@ -175,7 +188,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
175
188
|
|
176
189
|
pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
|
177
190
|
match self.get_reader() {
|
178
|
-
Ok(readable) => self.build_multi_threaded(readable),
|
191
|
+
Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
|
179
192
|
Err(_) => {
|
180
193
|
let readable = self.get_single_threaded_reader()?;
|
181
194
|
self.build_single_threaded(readable)
|
@@ -186,6 +199,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
186
199
|
fn build_multi_threaded(
|
187
200
|
self,
|
188
201
|
readable: Box<dyn Read + Send + 'static>,
|
202
|
+
should_forget: bool,
|
189
203
|
) -> Result<RecordReader<T>, ReaderError> {
|
190
204
|
let flexible = self.flexible || self.flexible_default.is_some();
|
191
205
|
let mut reader = csv::ReaderBuilder::new()
|
@@ -193,6 +207,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
193
207
|
.delimiter(self.delimiter)
|
194
208
|
.quote(self.quote_char)
|
195
209
|
.flexible(flexible)
|
210
|
+
.trim(self.trim)
|
196
211
|
.from_reader(readable);
|
197
212
|
|
198
213
|
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
@@ -204,7 +219,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
204
219
|
|
205
220
|
let flexible_default = self.flexible_default.clone();
|
206
221
|
let handle = thread::spawn(move || {
|
207
|
-
let mut record = csv::StringRecord::
|
222
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
208
223
|
while let Ok(true) = reader.read_record(&mut record) {
|
209
224
|
let row = T::parse(
|
210
225
|
&static_headers,
|
@@ -216,8 +231,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
216
231
|
break;
|
217
232
|
}
|
218
233
|
}
|
219
|
-
|
220
|
-
|
234
|
+
if should_forget {
|
235
|
+
let file_to_forget = reader.into_inner();
|
236
|
+
std::mem::forget(file_to_forget);
|
237
|
+
}
|
221
238
|
});
|
222
239
|
|
223
240
|
Ok(RecordReader {
|
@@ -239,6 +256,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
239
256
|
.delimiter(self.delimiter)
|
240
257
|
.quote(self.quote_char)
|
241
258
|
.flexible(flexible)
|
259
|
+
.trim(self.trim)
|
242
260
|
.from_reader(readable);
|
243
261
|
|
244
262
|
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
@@ -257,30 +275,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
257
275
|
|
258
276
|
struct RubyReader {
|
259
277
|
inner: Value,
|
278
|
+
buffer: Option<Vec<u8>>,
|
279
|
+
offset: usize,
|
260
280
|
}
|
261
281
|
|
262
282
|
impl RubyReader {
|
263
283
|
fn new(inner: Value) -> Self {
|
264
|
-
Self {
|
284
|
+
Self {
|
285
|
+
inner,
|
286
|
+
buffer: None,
|
287
|
+
offset: 0,
|
288
|
+
}
|
265
289
|
}
|
266
290
|
}
|
267
291
|
|
292
|
+
// Read the entire inner into a vector and then read future reads from that vector with offset
|
268
293
|
impl Read for RubyReader {
|
269
294
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
270
|
-
|
295
|
+
// If we have an existing buffer, read from it
|
296
|
+
if let Some(buffer) = self.buffer.as_ref() {
|
297
|
+
let remaining = buffer.len() - self.offset;
|
298
|
+
let copy_size = remaining.min(buf.len());
|
299
|
+
buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
|
300
|
+
self.offset += copy_size;
|
301
|
+
return Ok(copy_size);
|
302
|
+
}
|
303
|
+
|
304
|
+
// No buffer yet - read the entire content from Ruby
|
305
|
+
let result = self.inner.funcall::<_, _, Value>("read", ());
|
271
306
|
match result {
|
272
307
|
Ok(data) => {
|
273
308
|
if data.is_nil() {
|
274
|
-
return Ok(0);
|
309
|
+
return Ok(0); // EOF
|
275
310
|
}
|
276
311
|
|
277
312
|
let string = RString::from_value(data).ok_or_else(|| {
|
278
313
|
io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
|
279
314
|
})?;
|
280
315
|
let bytes = unsafe { string.as_slice() };
|
281
|
-
|
282
|
-
|
283
|
-
|
316
|
+
|
317
|
+
// Store the entire content in the buffer
|
318
|
+
self.buffer = Some(bytes.to_vec());
|
319
|
+
self.offset = 0;
|
320
|
+
|
321
|
+
// Read initial chunk
|
322
|
+
let copy_size = bytes.len().min(buf.len());
|
323
|
+
buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
|
324
|
+
self.offset = copy_size;
|
325
|
+
|
326
|
+
Ok(copy_size)
|
284
327
|
}
|
285
328
|
Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
|
286
329
|
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use std::collections::HashMap;
|
2
|
+
use std::hash::BuildHasher;
|
2
3
|
|
3
4
|
pub trait RecordParser {
|
4
5
|
type Output;
|
@@ -11,7 +12,7 @@ pub trait RecordParser {
|
|
11
12
|
) -> Self::Output;
|
12
13
|
}
|
13
14
|
|
14
|
-
impl RecordParser for HashMap<&'static str, Option<String
|
15
|
+
impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
|
15
16
|
type Output = Self;
|
16
17
|
|
17
18
|
#[inline]
|
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
|
|
21
22
|
null_string: Option<&str>,
|
22
23
|
flexible_default: Option<&str>,
|
23
24
|
) -> Self::Output {
|
24
|
-
let mut map = HashMap::
|
25
|
-
headers.iter().enumerate().for_each(|(i, header)| {
|
25
|
+
let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
|
26
|
+
headers.iter().enumerate().for_each(|(i, &header)| {
|
26
27
|
let value = record.get(i).map_or_else(
|
27
|
-
|| flexible_default.map(
|
28
|
+
|| flexible_default.map(ToString::to_string),
|
28
29
|
|field| {
|
29
30
|
if null_string == Some(field) {
|
30
31
|
None
|
31
32
|
} else if field.is_empty() {
|
32
33
|
Some(String::new())
|
33
34
|
} else {
|
34
|
-
Some(field.
|
35
|
+
Some(field.into())
|
35
36
|
}
|
36
37
|
},
|
37
38
|
);
|
38
|
-
map.insert(
|
39
|
+
map.insert(header, value);
|
39
40
|
});
|
40
41
|
map
|
41
42
|
}
|
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
|
|
53
54
|
) -> Self::Output {
|
54
55
|
let target_len = headers.len();
|
55
56
|
let mut vec = Vec::with_capacity(target_len);
|
56
|
-
|
57
|
-
|
57
|
+
for field in record.iter() {
|
58
|
+
let value = if Some(field) == null_string {
|
58
59
|
None
|
59
60
|
} else if field.is_empty() {
|
60
61
|
Some(String::new())
|
61
62
|
} else {
|
62
|
-
Some(field.
|
63
|
-
}
|
64
|
-
|
63
|
+
Some(field.into())
|
64
|
+
};
|
65
|
+
vec.push(value);
|
66
|
+
}
|
65
67
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
vec.push(Some(default.to_string()));
|
68
|
+
if vec.len() < target_len {
|
69
|
+
if let Some(default) = flexible_default {
|
70
|
+
vec.resize_with(target_len, || Some(default.to_string()));
|
70
71
|
}
|
71
72
|
}
|
72
73
|
vec
|
@@ -1,6 +1,8 @@
|
|
1
1
|
use super::{header_cache::StringCache, parser::RecordParser};
|
2
2
|
use std::{io::Read, thread};
|
3
3
|
|
4
|
+
pub(crate) const READ_BUFFER_SIZE: usize = 8192;
|
5
|
+
|
4
6
|
pub enum ReadImpl<T: RecordParser> {
|
5
7
|
SingleThreaded {
|
6
8
|
reader: csv::Reader<Box<dyn Read>>,
|
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
|
|
36
38
|
null_string,
|
37
39
|
flexible_default,
|
38
40
|
} => {
|
39
|
-
let mut record = csv::StringRecord::
|
41
|
+
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
40
42
|
match reader.read_record(&mut record) {
|
41
43
|
Ok(true) => Some(T::parse(
|
42
44
|
headers,
|
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
use magnus::{IntoValue, Ruby, Value};
|
2
|
-
use std::collections::HashMap;
|
2
|
+
use std::{collections::HashMap, hash::BuildHasher};
|
3
3
|
|
4
4
|
#[derive(Debug)]
|
5
|
-
pub enum CsvRecord {
|
5
|
+
pub enum CsvRecord<S: BuildHasher + Default> {
|
6
6
|
Vec(Vec<Option<String>>),
|
7
|
-
Map(HashMap<&'static str, Option<String
|
7
|
+
Map(HashMap<&'static str, Option<String>, S>),
|
8
8
|
}
|
9
9
|
|
10
|
-
impl IntoValue for CsvRecord {
|
10
|
+
impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
|
11
11
|
#[inline]
|
12
12
|
fn into_value_with(self, handle: &Ruby) -> Value {
|
13
13
|
match self {
|
data/ext/osv/src/reader.rs
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
use crate::csv::{CsvRecord, RecordReaderBuilder};
|
2
2
|
use crate::utils::*;
|
3
|
+
use csv::Trim;
|
3
4
|
use magnus::value::ReprValue;
|
4
5
|
use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
|
5
6
|
use std::collections::HashMap;
|
7
|
+
use xxhash_rust::xxh3::Xxh3Builder;
|
6
8
|
|
7
9
|
pub fn parse_csv(
|
8
10
|
rb_self: Value,
|
9
11
|
args: &[Value],
|
10
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
12
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
11
13
|
let ruby = unsafe { Ruby::get_unchecked() };
|
12
14
|
|
13
15
|
let CsvArgs {
|
@@ -20,6 +22,7 @@ pub fn parse_csv(
|
|
20
22
|
result_type,
|
21
23
|
flexible,
|
22
24
|
flexible_default,
|
25
|
+
trim,
|
23
26
|
} = parse_csv_args(&ruby, args)?;
|
24
27
|
|
25
28
|
if !ruby.block_given() {
|
@@ -34,27 +37,37 @@ pub fn parse_csv(
|
|
34
37
|
result_type,
|
35
38
|
flexible,
|
36
39
|
flexible_default,
|
40
|
+
trim: match trim {
|
41
|
+
Trim::All => Some("all".to_string()),
|
42
|
+
Trim::Headers => Some("headers".to_string()),
|
43
|
+
Trim::Fields => Some("fields".to_string()),
|
44
|
+
_ => None,
|
45
|
+
},
|
37
46
|
});
|
38
47
|
}
|
39
48
|
|
40
|
-
let iter: Box<dyn Iterator<Item = CsvRecord
|
49
|
+
let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
|
41
50
|
"hash" => Box::new(
|
42
|
-
RecordReaderBuilder::<HashMap<&'static str, Option<String
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
51
|
+
RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
|
52
|
+
&ruby, to_read,
|
53
|
+
)
|
54
|
+
.has_headers(has_headers)
|
55
|
+
.flexible(flexible)
|
56
|
+
.flexible_default(flexible_default)
|
57
|
+
.trim(trim)
|
58
|
+
.delimiter(delimiter)
|
59
|
+
.quote_char(quote_char)
|
60
|
+
.null_string(null_string)
|
61
|
+
.buffer(buffer_size)
|
62
|
+
.build()?
|
63
|
+
.map(CsvRecord::Map),
|
52
64
|
),
|
53
65
|
"array" => Box::new(
|
54
66
|
RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
|
55
67
|
.has_headers(has_headers)
|
56
68
|
.flexible(flexible)
|
57
69
|
.flexible_default(flexible_default)
|
70
|
+
.trim(trim)
|
58
71
|
.delimiter(delimiter)
|
59
72
|
.quote_char(quote_char)
|
60
73
|
.null_string(null_string)
|
@@ -84,11 +97,12 @@ struct EnumeratorArgs {
|
|
84
97
|
result_type: String,
|
85
98
|
flexible: bool,
|
86
99
|
flexible_default: Option<String>,
|
100
|
+
trim: Option<String>,
|
87
101
|
}
|
88
102
|
|
89
103
|
fn create_enumerator(
|
90
104
|
args: EnumeratorArgs,
|
91
|
-
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord
|
105
|
+
) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
|
92
106
|
let kwargs = RHash::new();
|
93
107
|
kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
|
94
108
|
kwargs.aset(
|
@@ -104,6 +118,7 @@ fn create_enumerator(
|
|
104
118
|
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
105
119
|
kwargs.aset(Symbol::new("flexible"), args.flexible)?;
|
106
120
|
kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
|
121
|
+
kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
|
107
122
|
let enumerator = args
|
108
123
|
.rb_self
|
109
124
|
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
|
data/ext/osv/src/utils.rs
CHANGED
@@ -4,6 +4,29 @@ use magnus::{
|
|
4
4
|
Error, RString, Ruby, Symbol, Value,
|
5
5
|
};
|
6
6
|
|
7
|
+
use crate::csv::BUFFER_CHANNEL_SIZE;
|
8
|
+
|
9
|
+
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
|
10
|
+
if value.is_nil() {
|
11
|
+
Ok(None)
|
12
|
+
} else if value.is_kind_of(ruby.class_string()) {
|
13
|
+
RString::from_value(value)
|
14
|
+
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
15
|
+
.to_string()
|
16
|
+
.map(|s| Some(s))
|
17
|
+
} else if value.is_kind_of(ruby.class_symbol()) {
|
18
|
+
Symbol::from_value(value)
|
19
|
+
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
20
|
+
.funcall("to_s", ())
|
21
|
+
.map(|s| Some(s))
|
22
|
+
} else {
|
23
|
+
Err(Error::new(
|
24
|
+
magnus::exception::type_error(),
|
25
|
+
"Value must be a String or Symbol",
|
26
|
+
))
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
7
30
|
#[derive(Debug)]
|
8
31
|
pub struct CsvArgs {
|
9
32
|
pub to_read: Value,
|
@@ -15,6 +38,7 @@ pub struct CsvArgs {
|
|
15
38
|
pub result_type: String,
|
16
39
|
pub flexible: bool,
|
17
40
|
pub flexible_default: Option<String>,
|
41
|
+
pub trim: csv::Trim,
|
18
42
|
}
|
19
43
|
|
20
44
|
/// Parse common arguments for CSV parsing
|
@@ -34,6 +58,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
34
58
|
Option<Value>,
|
35
59
|
Option<bool>,
|
36
60
|
Option<Option<String>>,
|
61
|
+
Option<Value>,
|
37
62
|
),
|
38
63
|
(),
|
39
64
|
>(
|
@@ -48,6 +73,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
48
73
|
"result_type",
|
49
74
|
"flexible",
|
50
75
|
"flexible_default",
|
76
|
+
"trim",
|
51
77
|
],
|
52
78
|
)?;
|
53
79
|
|
@@ -81,38 +107,28 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
81
107
|
|
82
108
|
let null_string = kwargs.optional.3.unwrap_or_default();
|
83
109
|
|
84
|
-
let buffer_size = kwargs.optional.4.unwrap_or(
|
110
|
+
let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
|
85
111
|
|
86
|
-
let result_type = match kwargs
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
} else if value.is_kind_of(ruby.class_symbol()) {
|
95
|
-
Symbol::from_value(value)
|
96
|
-
.ok_or_else(|| {
|
97
|
-
Error::new(magnus::exception::type_error(), "Invalid symbol value")
|
98
|
-
})?
|
99
|
-
.funcall("to_s", ())?
|
100
|
-
} else {
|
112
|
+
let result_type = match kwargs
|
113
|
+
.optional
|
114
|
+
.5
|
115
|
+
.map(|value| parse_string_or_symbol(ruby, value))
|
116
|
+
{
|
117
|
+
Some(Ok(Some(parsed))) => match parsed.as_str() {
|
118
|
+
"hash" | "array" => parsed,
|
119
|
+
_ => {
|
101
120
|
return Err(Error::new(
|
102
|
-
magnus::exception::
|
103
|
-
"result_type must be
|
104
|
-
))
|
105
|
-
};
|
106
|
-
|
107
|
-
match parsed.as_str() {
|
108
|
-
"hash" | "array" => parsed,
|
109
|
-
_ => {
|
110
|
-
return Err(Error::new(
|
111
|
-
magnus::exception::runtime_error(),
|
112
|
-
"result_type must be either 'hash' or 'array'",
|
113
|
-
))
|
114
|
-
}
|
121
|
+
magnus::exception::runtime_error(),
|
122
|
+
"result_type must be either 'hash' or 'array'",
|
123
|
+
))
|
115
124
|
}
|
125
|
+
},
|
126
|
+
Some(Ok(None)) => String::from("hash"),
|
127
|
+
Some(Err(_)) => {
|
128
|
+
return Err(Error::new(
|
129
|
+
magnus::exception::type_error(),
|
130
|
+
"result_type must be a String or Symbol",
|
131
|
+
))
|
116
132
|
}
|
117
133
|
None => String::from("hash"),
|
118
134
|
};
|
@@ -121,6 +137,35 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
121
137
|
|
122
138
|
let flexible_default = kwargs.optional.7.unwrap_or_default();
|
123
139
|
|
140
|
+
let trim = match kwargs
|
141
|
+
.optional
|
142
|
+
.8
|
143
|
+
.map(|value| parse_string_or_symbol(ruby, value))
|
144
|
+
{
|
145
|
+
Some(Ok(Some(parsed))) => match parsed.as_str() {
|
146
|
+
"all" => csv::Trim::All,
|
147
|
+
"headers" => csv::Trim::Headers,
|
148
|
+
"fields" => csv::Trim::Fields,
|
149
|
+
invalid => {
|
150
|
+
return Err(Error::new(
|
151
|
+
magnus::exception::runtime_error(),
|
152
|
+
format!(
|
153
|
+
"trim must be either 'all', 'headers', or 'fields' but got '{}'",
|
154
|
+
invalid
|
155
|
+
),
|
156
|
+
))
|
157
|
+
}
|
158
|
+
},
|
159
|
+
Some(Ok(None)) => csv::Trim::None,
|
160
|
+
Some(Err(_)) => {
|
161
|
+
return Err(Error::new(
|
162
|
+
magnus::exception::type_error(),
|
163
|
+
"trim must be a String or Symbol",
|
164
|
+
))
|
165
|
+
}
|
166
|
+
None => csv::Trim::None,
|
167
|
+
};
|
168
|
+
|
124
169
|
Ok(CsvArgs {
|
125
170
|
to_read,
|
126
171
|
has_headers,
|
@@ -131,5 +176,6 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
|
|
131
176
|
result_type,
|
132
177
|
flexible,
|
133
178
|
flexible_default,
|
179
|
+
trim,
|
134
180
|
})
|
135
181
|
}
|
data/lib/osv/version.rb
CHANGED
data/lib/osv.rbi
CHANGED
@@ -14,12 +14,15 @@ module OSV
|
|
14
14
|
# an empty string.
|
15
15
|
# - `buffer_size`: Integer specifying the read buffer size
|
16
16
|
# - `result_type`: String specifying the output format
|
17
|
-
# ("hash" or "array")
|
17
|
+
# ("hash" or "array" or :hash or :array)
|
18
18
|
# - `flexible`: Boolean specifying if the parser should be flexible
|
19
19
|
# (default: false)
|
20
20
|
# - `flexible_default`: String specifying the default value for missing fields.
|
21
21
|
# Implicitly enables flexible mode if set.
|
22
22
|
# (default: `nil`)
|
23
|
+
# - `trim`: String specifying the trim mode
|
24
|
+
# ("all" or "headers" or "fields" or :all or :headers or :fields)
|
25
|
+
# (default: `nil`)
|
23
26
|
sig do
|
24
27
|
params(
|
25
28
|
input: T.any(String, StringIO, IO),
|
@@ -28,9 +31,10 @@ module OSV
|
|
28
31
|
quote_char: T.nilable(String),
|
29
32
|
nil_string: T.nilable(String),
|
30
33
|
buffer_size: T.nilable(Integer),
|
31
|
-
result_type: T.nilable(String),
|
34
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
32
35
|
flexible: T.nilable(T::Boolean),
|
33
36
|
flexible_default: T.nilable(String),
|
37
|
+
trim: T.nilable(T.any(String, Symbol)),
|
34
38
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
|
35
39
|
).returns(T.any(Enumerator, T.untyped))
|
36
40
|
end
|
@@ -44,6 +48,7 @@ module OSV
|
|
44
48
|
result_type: nil,
|
45
49
|
flexible: nil,
|
46
50
|
flexible_default: nil,
|
51
|
+
trim: nil,
|
47
52
|
&blk
|
48
53
|
)
|
49
54
|
end
|