osv 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/Cargo.lock +21 -0
- data/Gemfile +9 -2
- data/README.md +90 -0
- data/ext/osv/Cargo.toml +1 -0
- data/ext/osv/src/csv/builder.rs +168 -33
- data/ext/osv/src/csv/header_cache.rs +33 -23
- data/ext/osv/src/csv/mod.rs +1 -0
- data/ext/osv/src/csv/parser.rs +33 -15
- data/ext/osv/src/csv/read_impl.rs +65 -0
- data/ext/osv/src/csv/reader.rs +32 -72
- data/ext/osv/src/csv/record.rs +7 -5
- data/lib/osv/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3b8a537d24e23250ff18da51bc59f9b4329b02e2fec9b881d4ee203a766e5514
+  data.tar.gz: dbdb8b1accd5897df7079adb98cc2ae50939f8ffeebf80c5095a06412dc69699
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d32dc649748d62092414047bc5f7666cab3b1a4cbfb58a00edb2c0ec1a634d375e3c522ac716b57a6bb37292436433d867042b922383366a8da44a3500a6e2a4
+  data.tar.gz: deec4d1433d6da2b8242fae9c3ff2f17e5c56143f0c39d1f0ceaadd3351ef05641bb03f219a02c47b50a3e36c41048894963da41a0b1729cd153f579277cf448
data/Cargo.lock
CHANGED
@@ -273,6 +273,7 @@ dependencies = [
  "rb-sys",
  "serde",
  "serde_magnus",
+ "thiserror",
 ]
 
 [[package]]
@@ -436,6 +437,26 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.14"
data/Gemfile
CHANGED
@@ -2,6 +2,13 @@ source "https://rubygems.org"
 
 gem "rb_sys", "~> 0.9.56"
 gem "rake"
-gem "
+gem "csv"
 
-
+# Use local version of osv
+gemspec
+
+group :development, :test do
+  gem "minitest", "~> 5.0"
+  gem "benchmark-ips", "~> 2.12"
+  gem "fastcsv", "~> 0.0.7"
+end
data/README.md
CHANGED
@@ -111,3 +111,93 @@ OSV.for_each(data) { |row| puts row["name"] }
 ## Performance
 
 This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
+
+Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
+
+### 10,000 lines
+
+```
+Benchmarking with 10001 lines of data
+
+ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
+Warming up --------------------------------------
+OSV - Hash output        6.000 i/100ms
+CSV - Hash output        1.000 i/100ms
+OSV - Array output      18.000 i/100ms
+CSV - Array output       2.000 i/100ms
+FastCSV - Array output
+                         9.000 i/100ms
+OSV - StringIO           7.000 i/100ms
+CSV - StringIO           1.000 i/100ms
+FastCSV - StringIO      20.000 i/100ms
+OSV - Gzipped            6.000 i/100ms
+CSV - Gzipped            1.000 i/100ms
+Calculating -------------------------------------
+OSV - Hash output       73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
+CSV - Hash output       11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
+OSV - Array output     189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
+CSV - Array output      25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
+FastCSV - Array output
+                        97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
+OSV - StringIO          80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
+CSV - StringIO          15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
+FastCSV - StringIO     200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
+OSV - Gzipped           55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
+CSV - Gzipped           12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
+
+Comparison:
+  FastCSV - StringIO:     200.5 i/s
+  OSV - Array output:     189.7 i/s - same-ish: difference falls within error
+  FastCSV - Array output:  97.9 i/s - 2.05x slower
+  OSV - StringIO:          80.8 i/s - 2.48x slower
+  OSV - Hash output:       73.4 i/s - 2.73x slower
+  OSV - Gzipped:           55.2 i/s - 3.63x slower
+  CSV - Array output:      25.5 i/s - 7.87x slower
+  CSV - StringIO:          15.9 i/s - 12.63x slower
+  CSV - Gzipped:           12.6 i/s - 15.92x slower
+  CSV - Hash output:       11.9 i/s - 16.80x slower
+```
+
+### 1,000,000 lines
+
+```
+Benchmarking with 1000001 lines of data
+
+ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
+Warming up --------------------------------------
+OSV - Hash output        1.000 i/100ms
+CSV - Hash output        1.000 i/100ms
+OSV - Array output       1.000 i/100ms
+CSV - Array output       1.000 i/100ms
+FastCSV - Array output
+                         1.000 i/100ms
+OSV - StringIO           1.000 i/100ms
+CSV - StringIO           1.000 i/100ms
+FastCSV - StringIO       1.000 i/100ms
+OSV - Gzipped            1.000 i/100ms
+CSV - Gzipped            1.000 i/100ms
+Calculating -------------------------------------
+OSV - Hash output        0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
+CSV - Hash output        0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
+OSV - Array output       1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
+CSV - Array output       0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
+FastCSV - Array output
+                         0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
+OSV - StringIO           0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
+CSV - StringIO           0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
+FastCSV - StringIO       1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
+OSV - Gzipped            0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
+CSV - Gzipped            0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
+
+Comparison:
+  OSV - Array output:      1.1 i/s
+  FastCSV - StringIO:      1.0 i/s - 1.10x slower
+  FastCSV - Array output:  0.8 i/s - 1.49x slower
+  OSV - Hash output:       0.6 i/s - 1.98x slower
+  OSV - StringIO:          0.5 i/s - 2.19x slower
+  OSV - Gzipped:           0.4 i/s - 2.61x slower
+  CSV - Array output:      0.2 i/s - 4.86x slower
+  CSV - StringIO:          0.1 i/s - 8.67x slower
+  CSV - Hash output:       0.1 i/s - 9.79x slower
+  CSV - Gzipped:           0.1 i/s - 9.91x slower
+```
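The numbers above come from benchmark-ips, which this release adds to the Gemfile. As a rough illustration of the kind of comparison being measured, here is a minimal sketch; it is not the gem's benchmark/comparison_benchmark.rb, the `data.csv` path and the `name` column are placeholders, and it assumes `OSV.for_each` accepts a file path (the README usage above passes an IO-or-path style argument).

```ruby
require "benchmark/ips"
require "csv"
require "osv"

path = "data.csv" # placeholder: any CSV file with a header row and a "name" column

Benchmark.ips do |x|
  # Hash-per-row parsing through the Rust extension
  x.report("OSV - Hash output") do
    OSV.for_each(path) { |row| row["name"] }
  end

  # The same loop with Ruby's bundled CSV library
  x.report("CSV - Hash output") do
    CSV.foreach(path, headers: true) { |row| row["name"] }
  end

  x.compare!
end
```

Run against the same file, the relative ordering should roughly match the comparison tables above, though absolute numbers depend on the machine and Ruby build.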
data/ext/osv/Cargo.toml
CHANGED
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -1,11 +1,50 @@
 use super::{
-    header_cache::StringCache,
+    header_cache::{CacheError, StringCache},
     parser::RecordParser,
-
+    read_impl::ReadImpl,
+    reader::RecordReader,
 };
 use flate2::read::GzDecoder;
-use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
-use std::{
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
+use std::{
+    fs::File,
+    io::{self, Read},
+    marker::PhantomData,
+    os::fd::FromRawFd,
+    thread,
+};
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Unsupported GzipReader")]
+    UnsupportedGzipReader,
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
 
 pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
     ruby: &'a Ruby,
@@ -57,36 +96,83 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
         self
     }
 
-    fn
+    fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+        let string: RString = self.to_read.funcall("string", ())?;
+        let content = string.to_string()?;
+        Ok(Box::new(std::io::Cursor::new(content)))
+    }
+
+    fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+        let raw_value = self.to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor);
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        Ok(Box::new(file))
+    }
+
+    fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+        let path = self.to_read.to_r_string()?.to_string()?;
+        let file = File::open(&path)?;
+
+        Ok(if path.ends_with(".gz") {
+            Box::new(GzDecoder::new(file))
+        } else {
+            Box::new(file)
+        })
+    }
+
+    fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
         let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+        let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
 
         if self.to_read.is_kind_of(string_io) {
-
-
-
+            self.handle_string_io()
+        } else if self.to_read.is_kind_of(gzip_reader_class) {
+            Err(ReaderError::UnsupportedGzipReader)
         } else if self.to_read.is_kind_of(self.ruby.class_io()) {
-
-            let file = unsafe { File::from_raw_fd(fd) };
-            Ok(Box::new(file))
+            self.handle_file_descriptor()
         } else {
-
-
-
-
-
-
-
-
-
-
-
-
+            self.handle_file_path()
+        }
+    }
+
+    fn get_single_threaded_reader(&self) -> Result<Box<dyn Read>, ReaderError> {
+        let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+        let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
+
+        if self.to_read.is_kind_of(string_io) {
+            self.handle_string_io().map(|r| -> Box<dyn Read> { r })
+        } else if self.to_read.is_kind_of(gzip_reader_class) {
+            Ok(Box::new(RubyReader::new(self.to_read)))
+        } else if self.to_read.is_kind_of(self.ruby.class_io()) {
+            self.handle_file_descriptor()
+                .map(|r| -> Box<dyn Read> { r })
+        } else {
+            self.handle_file_path().map(|r| -> Box<dyn Read> { r })
+        }
+    }
+
+    pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
+        match self.get_reader() {
+            Ok(readable) => self.build_multi_threaded(readable),
+            Err(_) => {
+                let readable = self.get_single_threaded_reader()?;
+                self.build_single_threaded(readable)
             }
         }
     }
 
-
-
+    fn build_multi_threaded(
+        self,
+        readable: Box<dyn Read + Send + 'static>,
+    ) -> Result<RecordReader<T>, ReaderError> {
         let mut reader = csv::ReaderBuilder::new()
             .has_headers(self.has_headers)
             .delimiter(self.delimiter)
@@ -94,17 +180,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
             .from_reader(readable);
 
         let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
-        let
-
-        let static_headers = StringCache::intern_many(&headers).map_err(|e| {
-            Error::new(
-                self.ruby.exception_runtime_error(),
-                format!("Failed to intern headers: {e}"),
-            )
-        })?;
+        let static_headers = StringCache::intern_many(&headers)?;
         let headers_for_cleanup = static_headers.clone();
 
         let (sender, receiver) = kanal::bounded(self.buffer);
+        let null_string = self.null_string.clone();
+
         let handle = thread::spawn(move || {
             let mut record = csv::StringRecord::new();
             while let Ok(true) = reader.read_record(&mut record) {
@@ -125,4 +206,58 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
             },
         })
     }
+
+    fn build_single_threaded(
+        self,
+        readable: Box<dyn Read>,
+    ) -> Result<RecordReader<T>, ReaderError> {
+        let mut reader = csv::ReaderBuilder::new()
+            .has_headers(self.has_headers)
+            .delimiter(self.delimiter)
+            .quote(self.quote_char)
+            .from_reader(readable);
+
+        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let static_headers = StringCache::intern_many(&headers)?;
+
+        Ok(RecordReader {
+            reader: ReadImpl::SingleThreaded {
+                reader,
+                headers: static_headers,
+                null_string: self.null_string,
+            },
+        })
+    }
+}
+
+struct RubyReader {
+    inner: Value,
+}
+
+impl RubyReader {
+    fn new(inner: Value) -> Self {
+        Self { inner }
+    }
+}
+
+impl Read for RubyReader {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
+        match result {
+            Ok(data) => {
+                if data.is_nil() {
+                    return Ok(0);
+                }
+
+                let string = RString::from_value(data).ok_or_else(|| {
+                    io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
+                })?;
+                let bytes = unsafe { string.as_slice() };
+                let len = bytes.len().min(buf.len());
+                buf[..len].copy_from_slice(&bytes[..len]);
+                Ok(len)
+            }
+            Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
        }
    }
 }
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -4,22 +4,29 @@
 ///
 /// Note: Performance testing on macOS showed minimal speed improvements,
 /// so this optimization could be removed if any issues arise.
-
-
 use std::{
     collections::HashMap,
     sync::{atomic::AtomicU32, LazyLock, Mutex},
 };
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum CacheError {
+    #[error("Failed to acquire lock: {0}")]
+    LockError(String),
+}
 
 static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 
-pub struct StringCache
+pub struct StringCache;
 
 impl StringCache {
     #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<&'static str,
-        let mut cache = STRING_CACHE
+    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
 
         if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
             count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -31,33 +38,36 @@ impl StringCache {
         }
     }
 
-    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>,
-        let mut cache = STRING_CACHE
-
+    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
 
+        let mut result = Vec::with_capacity(strings.len());
         for string in strings {
-            let
-
-
-
-
-
-
-
-            };
-            result.push(static_str);
+            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                result.push(existing);
+            } else {
+                let leaked = Box::leak(string.clone().into_boxed_str());
+                cache.insert(leaked, AtomicU32::new(1));
+                result.push(leaked);
+            }
         }
-
         Ok(result)
     }
 
-    pub fn clear(headers: &[&'static str]) -> Result<(),
-        let cache = STRING_CACHE
+    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
 
         for header in headers {
            if let Some(count) = cache.get(header) {
-
-
+                // Returns the previous value of the counter
+                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+                if was == 1 {
+                    cache.remove(header);
                    let ptr = *header as *const str as *mut str;
                    unsafe {
                        let _ = Box::from_raw(ptr);
data/ext/osv/src/csv/mod.rs
CHANGED
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 pub trait RecordParser {
     type Output;
+
     fn parse(
         headers: &[&'static str],
         record: &csv::StringRecord,
@@ -11,41 +12,58 @@
 
 impl RecordParser for HashMap<&'static str, Option<String>> {
     type Output = Self;
+
+    #[inline]
     fn parse(
         headers: &[&'static str],
         record: &csv::StringRecord,
         null_string: &str,
     ) -> Self::Output {
         let mut map = HashMap::with_capacity(headers.len());
-
-
-
-
-
-
-
-
-
-
+        headers
+            .iter()
+            .zip(record.iter())
+            .for_each(|(header, field)| {
+                map.insert(
+                    *header,
+                    if field == null_string {
+                        None
+                    } else {
+                        // Avoid allocating for empty strings
+                        if field.is_empty() {
+                            Some(String::new())
+                        } else {
+                            Some(field.to_string())
+                        }
+                    },
+                );
+            });
         map
     }
 }
 
 impl RecordParser for Vec<Option<String>> {
     type Output = Self;
+
+    #[inline]
     fn parse(
         _headers: &[&'static str],
         record: &csv::StringRecord,
         null_string: &str,
     ) -> Self::Output {
         let mut vec = Vec::with_capacity(record.len());
-
-
+        vec.extend(record.iter().map(|field| {
+            if field == null_string {
                 None
             } else {
-
-
-
+                // Avoid allocating for empty strings
+                if field.is_empty() {
+                    Some(String::new())
+                } else {
+                    Some(field.to_string())
+                }
+            }
+        }));
         vec
     }
 }
data/ext/osv/src/csv/read_impl.rs
ADDED
@@ -0,0 +1,65 @@
+use super::{header_cache::StringCache, parser::RecordParser};
+use std::{io::Read, thread};
+
+pub enum ReadImpl<T: RecordParser> {
+    SingleThreaded {
+        reader: csv::Reader<Box<dyn Read>>,
+        headers: Vec<&'static str>,
+        null_string: String,
+    },
+    MultiThreaded {
+        headers: Vec<&'static str>,
+        receiver: kanal::Receiver<T::Output>,
+        handle: Option<thread::JoinHandle<()>>,
+    },
+}
+
+impl<T: RecordParser> ReadImpl<T> {
+    #[inline]
+    pub fn next(&mut self) -> Option<T::Output> {
+        match self {
+            Self::MultiThreaded {
+                receiver, handle, ..
+            } => match receiver.recv() {
+                Ok(record) => Some(record),
+                Err(_) => {
+                    if let Some(handle) = handle.take() {
+                        let _ = handle.join();
+                    }
+                    None
+                }
+            },
+            Self::SingleThreaded {
+                reader,
+                headers,
+                null_string,
+            } => {
+                let mut record = csv::StringRecord::new();
+                match reader.read_record(&mut record) {
+                    Ok(true) => Some(T::parse(headers, &record, null_string)),
+                    _ => None,
+                }
+            }
+        }
+    }
+
+    #[inline]
+    pub fn cleanup(&mut self) {
+        match self {
+            Self::MultiThreaded {
+                receiver,
+                handle,
+                headers,
+            } => {
+                receiver.close();
+                if let Some(handle) = handle.take() {
+                    let _ = handle.join();
+                }
+                let _ = StringCache::clear(headers);
+            }
+            Self::SingleThreaded { headers, .. } => {
+                let _ = StringCache::clear(headers);
+            }
+        }
+    }
+}
data/ext/osv/src/csv/reader.rs
CHANGED
@@ -1,66 +1,35 @@
-use super::{
+use super::{parser::RecordParser, read_impl::ReadImpl};
 use magnus::{Error, Ruby};
-use std::{io::Read
+use std::{borrow::Cow, io::Read};
 
 pub struct RecordReader<T: RecordParser> {
     pub(crate) reader: ReadImpl<T>,
 }
 
-impl<T: RecordParser> Drop for RecordReader<T> {
-    fn drop(&mut self) {
-        match &mut self.reader {
-            ReadImpl::MultiThreaded {
-                receiver,
-                handle,
-                headers,
-            } => {
-                receiver.close();
-                if let Some(handle) = handle.take() {
-                    let _ = handle.join();
-                }
-                StringCache::clear(headers).unwrap();
-            }
-            ReadImpl::SingleThreaded { headers, .. } => {
-                StringCache::clear(headers).unwrap();
-            }
-        }
-    }
-}
-
-#[allow(dead_code)]
-pub enum ReadImpl<T: RecordParser> {
-    SingleThreaded {
-        reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<&'static str>,
-        null_string: String,
-    },
-    MultiThreaded {
-        headers: Vec<&'static str>,
-        receiver: kanal::Receiver<T::Output>,
-        handle: Option<thread::JoinHandle<()>>,
-    },
-}
-
 impl<T: RecordParser> RecordReader<T> {
+    #[inline]
     pub(crate) fn get_headers(
         ruby: &Ruby,
         reader: &mut csv::Reader<impl Read>,
         has_headers: bool,
     ) -> Result<Vec<String>, Error> {
-        let first_row = reader
-
-
-
-
-
-            )
-            })?
-            .clone();
+        let first_row = reader.headers().map_err(|e| {
+            Error::new(
+                ruby.exception_runtime_error(),
+                Cow::Owned(format!("Failed to read headers: {e}")),
+            )
+        })?;
 
         Ok(if has_headers {
-
+            // Pre-allocate the vector with exact capacity
+            let mut headers = Vec::with_capacity(first_row.len());
+            headers.extend(first_row.iter().map(String::from));
+            headers
         } else {
-
+            // Pre-allocate the vector with exact capacity
+            let mut headers = Vec::with_capacity(first_row.len());
+            headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
+            headers
         })
     }
 }
@@ -68,30 +37,21 @@ impl<T: RecordParser> RecordReader<T> {
 impl<T: RecordParser> Iterator for RecordReader<T> {
     type Item = T::Output;
 
+    #[inline]
     fn next(&mut self) -> Option<Self::Item> {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                headers,
-                null_string,
-            } => {
-                let mut record = csv::StringRecord::new();
-                match reader.read_record(&mut record) {
-                    Ok(true) => Some(T::parse(headers, &record, null_string)),
-                    _ => None,
-                }
-            }
-        }
+        self.reader.next()
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // We can't know the exact size without reading the whole file
+        (0, None)
+    }
+}
+
+impl<T: RecordParser> Drop for RecordReader<T> {
+    #[inline]
+    fn drop(&mut self) {
+        self.reader.cleanup();
     }
 }
data/ext/osv/src/csv/record.rs
CHANGED
@@ -1,4 +1,4 @@
-use magnus::{IntoValue,
+use magnus::{IntoValue, Ruby, Value};
 use std::collections::HashMap;
 
 #[derive(Debug)]
@@ -8,14 +8,16 @@ pub enum CsvRecord {
 }
 
 impl IntoValue for CsvRecord {
+    #[inline]
     fn into_value_with(self, handle: &Ruby) -> Value {
         match self {
             CsvRecord::Vec(vec) => vec.into_value_with(handle),
             CsvRecord::Map(map) => {
-
-
-
-
+                // Pre-allocate the hash with the known size
+                let hash = handle.hash_new_capa(map.len());
+                map.into_iter()
+                    .try_for_each(|(k, v)| hash.aset(k, v))
+                    .unwrap();
                 hash.into_value_with(handle)
             }
         }
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-12-
+date: 2024-12-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -59,6 +59,7 @@ files:
 - ext/osv/src/csv/header_cache.rs
 - ext/osv/src/csv/mod.rs
 - ext/osv/src/csv/parser.rs
+- ext/osv/src/csv/read_impl.rs
 - ext/osv/src/csv/reader.rs
 - ext/osv/src/csv/record.rs
 - ext/osv/src/lib.rs