osv 0.3.13 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +39 -81
- data/Rakefile +6 -8
- data/ext/osv/src/csv/builder.rs +59 -175
- data/ext/osv/src/csv/mod.rs +4 -3
- data/ext/osv/src/csv/parser.rs +90 -14
- data/ext/osv/src/csv/record.rs +19 -6
- data/ext/osv/src/csv/record_reader.rs +175 -0
- data/ext/osv/src/csv/ruby_reader.rs +181 -0
- data/ext/osv/src/reader.rs +24 -19
- data/lib/osv/version.rb +1 -1
- metadata +11 -15
- data/ext/osv/src/csv/read_impl.rs +0 -75
- data/ext/osv/src/csv/reader.rs +0 -57
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 689f28c935746890aa680fd2f649076a36d6ce233d4cbf2717dc129174b593dc
|
4
|
+
data.tar.gz: 45ddaa6774a9a4e9391d000b30b6e92afb8560b81821d8fec363d54283bac6d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74c2052ea9cbc61ddef5d1c46abdd5e4cdf7c60c946c421e4b8da7c160ba3f3eb761842279cd9f066aa6a1aa2214d0ef9ba9ff11c46294e8e1d4ebbb95161d70
|
7
|
+
data.tar.gz: 5a795e5fa6d84b39082c2754dea655cd5b4f8a00558627fb64f661a14ec32daa8ea7b31a13724291e11531d03c8c5fa1bdb928c6d9422b87fcbb5b5aba7daad5
|
data/Gemfile
CHANGED
@@ -2,12 +2,12 @@ source "https://rubygems.org"
|
|
2
2
|
|
3
3
|
gem "rb_sys", "~> 0.9.56"
|
4
4
|
gem "rake"
|
5
|
-
gem "csv"
|
6
5
|
|
7
6
|
# Use local version of osv
|
8
7
|
gemspec
|
9
8
|
|
10
9
|
group :development, :test do
|
10
|
+
gem "csv"
|
11
11
|
gem "minitest", "~> 5.0"
|
12
12
|
gem "benchmark-ips", "~> 2.12"
|
13
13
|
gem "fastcsv", "~> 0.0.7"
|
data/README.md
CHANGED
@@ -114,104 +114,62 @@ When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
|
|
114
114
|
|
115
115
|
## Performance
|
116
116
|
|
117
|
-
This library is faster than the standard Ruby CSV library
|
117
|
+
This library is faster than the standard Ruby CSV library. It's also faster than any other CSV gem I've been able to find.
|
118
118
|
|
119
119
|
Here's some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
|
120
120
|
|
121
|
-
###
|
121
|
+
### 1,000,000 lines
|
122
122
|
|
123
123
|
```
|
124
|
-
|
124
|
+
🏃 Running benchmarks...
|
125
|
+
Benchmarking with 3000001 lines of data
|
125
126
|
|
126
|
-
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
127
|
+
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
|
127
128
|
Warming up --------------------------------------
|
128
|
-
OSV - Hash output 1.000 i/100ms
|
129
|
-
CSV - Hash output 1.000 i/100ms
|
130
|
-
OSV - Array output 1.000 i/100ms
|
131
|
-
OSV - Direct Open Array output
|
132
|
-
12.719M i/100ms
|
133
|
-
CSV - Array output 1.000 i/100ms
|
134
|
-
FastCSV - Array output
|
135
|
-
1.000 i/100ms
|
136
|
-
OSV - StringIO 1.000 i/100ms
|
137
129
|
CSV - StringIO 1.000 i/100ms
|
138
130
|
FastCSV - StringIO 1.000 i/100ms
|
139
|
-
|
140
|
-
CSV - Gzipped 1.000 i/100ms
|
141
|
-
Calculating -------------------------------------
|
142
|
-
OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
|
143
|
-
CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
|
144
|
-
OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
|
145
|
-
OSV - Direct Open Array output
|
146
|
-
213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
|
147
|
-
CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
|
148
|
-
FastCSV - Array output
|
149
|
-
7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
|
150
|
-
OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
|
151
|
-
CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
|
152
|
-
FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
|
153
|
-
OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
|
154
|
-
CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
|
155
|
-
|
156
|
-
Comparison:
|
157
|
-
OSV - Direct Open Array output: 213629268.6 i/s
|
158
|
-
OSV - Array output: 17.3 i/s - 12360250.79x slower
|
159
|
-
FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
|
160
|
-
FastCSV - Array output: 8.0 i/s - 26727225.72x slower
|
161
|
-
OSV - Hash output: 6.7 i/s - 31780615.83x slower
|
162
|
-
OSV - StringIO: 6.6 i/s - 32239620.60x slower
|
163
|
-
OSV - Gzipped: 5.6 i/s - 37881517.48x slower
|
164
|
-
CSV - Array output: 2.2 i/s - 97400427.87x slower
|
165
|
-
CSV - StringIO: 1.5 i/s - 144580048.04x slower
|
166
|
-
CSV - Hash output: 1.2 i/s - 174666591.31x slower
|
167
|
-
CSV - Gzipped: 1.2 i/s - 181626018.23x slower
|
168
|
-
```
|
169
|
-
|
170
|
-
### 1,000,000 lines
|
171
|
-
|
172
|
-
```
|
173
|
-
Benchmarking with 1000001 lines of data
|
174
|
-
|
175
|
-
ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
|
176
|
-
Warming up --------------------------------------
|
177
|
-
OSV - Hash output 1.000 i/100ms
|
131
|
+
OSV - StringIO 1.000 i/100ms
|
178
132
|
CSV - Hash output 1.000 i/100ms
|
179
|
-
|
180
|
-
OSV - Direct Open Array output
|
181
|
-
1.000 i/100ms
|
133
|
+
OSV - Hash output 1.000 i/100ms
|
182
134
|
CSV - Array output 1.000 i/100ms
|
135
|
+
OSV - Array output 1.000 i/100ms
|
183
136
|
FastCSV - Array output
|
184
137
|
1.000 i/100ms
|
185
|
-
|
186
|
-
|
187
|
-
FastCSV - StringIO 1.000 i/100ms
|
138
|
+
OSV - Direct Open Array output
|
139
|
+
1.000 i/100ms
|
188
140
|
OSV - Gzipped 1.000 i/100ms
|
141
|
+
OSV - Gzipped Direct 1.000 i/100ms
|
142
|
+
FastCSV - Gzipped 1.000 i/100ms
|
189
143
|
CSV - Gzipped 1.000 i/100ms
|
190
144
|
Calculating -------------------------------------
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
CSV - Array output 0.
|
145
|
+
CSV - StringIO 0.083 (± 0.0%) i/s (12.01 s/i) - 3.000 in 36.028672s
|
146
|
+
FastCSV - StringIO 0.366 (± 0.0%) i/s (2.73 s/i) - 11.000 in 30.032350s
|
147
|
+
OSV - StringIO 0.522 (± 0.0%) i/s (1.92 s/i) - 16.000 in 30.655768s
|
148
|
+
CSV - Hash output 0.062 (± 0.0%) i/s (16.16 s/i) - 2.000 in 32.311990s
|
149
|
+
OSV - Hash output 0.273 (± 0.0%) i/s (3.66 s/i) - 9.000 in 32.924970s
|
150
|
+
CSV - Array output 0.069 (± 0.0%) i/s (14.50 s/i) - 3.000 in 43.488185s
|
151
|
+
OSV - Array output 0.601 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.636782s
|
197
152
|
FastCSV - Array output
|
198
|
-
0.
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
153
|
+
0.356 (± 0.0%) i/s (2.81 s/i) - 11.000 in 30.871931s
|
154
|
+
OSV - Direct Open Array output
|
155
|
+
0.604 (± 0.0%) i/s (1.66 s/i) - 19.000 in 31.469190s
|
156
|
+
OSV - Gzipped 0.424 (± 0.0%) i/s (2.36 s/i) - 13.000 in 30.642322s
|
157
|
+
OSV - Gzipped Direct 0.636 (± 0.0%) i/s (1.57 s/i) - 20.000 in 31.424083s
|
158
|
+
FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.10 s/i) - 10.000 in 30.990648s
|
159
|
+
CSV - Gzipped 0.058 (± 0.0%) i/s (17.11 s/i) - 2.000 in 34.228691s
|
204
160
|
|
205
161
|
Comparison:
|
206
|
-
OSV - Direct
|
207
|
-
|
208
|
-
|
209
|
-
OSV - StringIO: 0.
|
210
|
-
|
211
|
-
|
212
|
-
FastCSV - Array output: 0.
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
162
|
+
OSV - Gzipped Direct: 0.6 i/s
|
163
|
+
OSV - Direct Open Array output: 0.6 i/s - 1.05x slower
|
164
|
+
OSV - Array output: 0.6 i/s - 1.06x slower
|
165
|
+
OSV - StringIO: 0.5 i/s - 1.22x slower
|
166
|
+
OSV - Gzipped: 0.4 i/s - 1.50x slower
|
167
|
+
FastCSV - StringIO: 0.4 i/s - 1.74x slower
|
168
|
+
FastCSV - Array output: 0.4 i/s - 1.79x slower
|
169
|
+
FastCSV - Gzipped: 0.3 i/s - 1.97x slower
|
170
|
+
OSV - Hash output: 0.3 i/s - 2.33x slower
|
171
|
+
CSV - StringIO: 0.1 i/s - 7.64x slower
|
172
|
+
CSV - Array output: 0.1 i/s - 9.23x slower
|
173
|
+
CSV - Hash output: 0.1 i/s - 10.28x slower
|
174
|
+
CSV - Gzipped: 0.1 i/s - 10.89x slower
|
217
175
|
```
|
data/Rakefile
CHANGED
@@ -1,21 +1,19 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require "rake/testtask"
|
4
|
-
require "
|
4
|
+
require "rb_sys/extensiontask"
|
5
5
|
|
6
6
|
task default: :test
|
7
7
|
|
8
|
-
|
9
|
-
c.lib_dir = "lib/osv"
|
10
|
-
c.ext_dir = "ext/osv"
|
11
|
-
end
|
8
|
+
GEMSPEC = Gem::Specification.load("osv.gemspec")
|
12
9
|
|
13
|
-
|
14
|
-
|
10
|
+
RbSys::ExtensionTask.new("osv", GEMSPEC) do |ext|
|
11
|
+
ext.lib_dir = "lib/osv"
|
12
|
+
ext.ext_dir = "ext/osv"
|
15
13
|
end
|
16
14
|
|
17
15
|
Rake::TestTask.new do |t|
|
18
|
-
t.deps << :
|
16
|
+
t.deps << :compile
|
19
17
|
t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
|
20
18
|
t.libs << "lib"
|
21
19
|
t.libs << "test"
|
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
use super::{
|
2
2
|
header_cache::{CacheError, StringCache},
|
3
3
|
parser::RecordParser,
|
4
|
-
|
5
|
-
|
6
|
-
READ_BUFFER_SIZE,
|
4
|
+
record_reader::{RecordReader, READ_BUFFER_SIZE},
|
5
|
+
ruby_reader::build_ruby_reader,
|
7
6
|
};
|
8
7
|
use flate2::read::GzDecoder;
|
9
|
-
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError,
|
8
|
+
use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
|
10
9
|
use std::{
|
11
10
|
fs::File,
|
12
11
|
io::{self, BufReader, Read},
|
13
12
|
marker::PhantomData,
|
14
13
|
os::fd::FromRawFd,
|
15
|
-
thread,
|
16
14
|
};
|
15
|
+
|
17
16
|
use thiserror::Error;
|
18
17
|
|
19
18
|
pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
|
@@ -28,8 +27,6 @@ pub enum ReaderError {
|
|
28
27
|
FileOpen(#[from] io::Error),
|
29
28
|
#[error("Failed to intern headers: {0}")]
|
30
29
|
HeaderIntern(#[from] CacheError),
|
31
|
-
#[error("Unsupported GzipReader")]
|
32
|
-
UnsupportedGzipReader,
|
33
30
|
#[error("Ruby error: {0}")]
|
34
31
|
Ruby(String),
|
35
32
|
}
|
@@ -49,7 +46,7 @@ impl From<ReaderError> for MagnusError {
|
|
49
46
|
}
|
50
47
|
}
|
51
48
|
|
52
|
-
pub struct RecordReaderBuilder<'a, T: RecordParser + Send
|
49
|
+
pub struct RecordReaderBuilder<'a, T: RecordParser<'a> + Send> {
|
53
50
|
ruby: &'a Ruby,
|
54
51
|
to_read: Value,
|
55
52
|
has_headers: bool,
|
@@ -58,12 +55,55 @@ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
|
|
58
55
|
null_string: Option<String>,
|
59
56
|
buffer: usize,
|
60
57
|
flexible: bool,
|
61
|
-
flexible_default: Option
|
58
|
+
flexible_default: Option<&'a str>,
|
62
59
|
trim: csv::Trim,
|
63
60
|
_phantom: PhantomData<T>,
|
64
61
|
}
|
65
62
|
|
66
|
-
impl<
|
63
|
+
impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T> {
|
64
|
+
fn build_multi_threaded(
|
65
|
+
self,
|
66
|
+
readable: Box<dyn Read + Send + 'static>,
|
67
|
+
should_forget: bool,
|
68
|
+
) -> Result<RecordReader<'static, T>, ReaderError> {
|
69
|
+
let flexible = self.flexible || self.flexible_default.is_some();
|
70
|
+
let mut reader = csv::ReaderBuilder::new()
|
71
|
+
.has_headers(self.has_headers)
|
72
|
+
.delimiter(self.delimiter)
|
73
|
+
.quote(self.quote_char)
|
74
|
+
.flexible(flexible)
|
75
|
+
.trim(self.trim)
|
76
|
+
.from_reader(readable);
|
77
|
+
|
78
|
+
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
79
|
+
let static_headers = StringCache::intern_many(&headers)?;
|
80
|
+
|
81
|
+
Ok(RecordReader::new_multi_threaded(
|
82
|
+
reader,
|
83
|
+
static_headers,
|
84
|
+
self.buffer,
|
85
|
+
self.null_string,
|
86
|
+
self.flexible_default,
|
87
|
+
should_forget,
|
88
|
+
))
|
89
|
+
}
|
90
|
+
|
91
|
+
pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
|
92
|
+
if self.to_read.is_kind_of(self.ruby.class_io()) {
|
93
|
+
let readable = self.handle_file_descriptor()?;
|
94
|
+
self.build_multi_threaded(readable, true)
|
95
|
+
} else if self.to_read.is_kind_of(self.ruby.class_string()) {
|
96
|
+
let readable = self.handle_file_path()?;
|
97
|
+
self.build_multi_threaded(readable, false)
|
98
|
+
} else {
|
99
|
+
let readable = build_ruby_reader(self.ruby, self.to_read)?;
|
100
|
+
|
101
|
+
self.build_single_threaded(readable)
|
102
|
+
}
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
|
67
107
|
pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
|
68
108
|
Self {
|
69
109
|
ruby,
|
@@ -110,7 +150,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
110
150
|
self
|
111
151
|
}
|
112
152
|
|
113
|
-
pub fn flexible_default(mut self, flexible_default: Option
|
153
|
+
pub fn flexible_default(mut self, flexible_default: Option<&'a str>) -> Self {
|
114
154
|
self.flexible_default = flexible_default;
|
115
155
|
self
|
116
156
|
}
|
@@ -120,12 +160,6 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
120
160
|
self
|
121
161
|
}
|
122
162
|
|
123
|
-
fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
124
|
-
let string: RString = self.to_read.funcall("string", ())?;
|
125
|
-
let content = string.to_string()?;
|
126
|
-
Ok(Box::new(std::io::Cursor::new(content)))
|
127
|
-
}
|
128
|
-
|
129
163
|
fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
|
130
164
|
let raw_value = self.to_read.as_raw();
|
131
165
|
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
@@ -155,101 +189,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
155
189
|
})
|
156
190
|
}
|
157
191
|
|
158
|
-
fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
|
159
|
-
let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
|
160
|
-
let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
|
161
|
-
|
162
|
-
if self.to_read.is_kind_of(string_io) {
|
163
|
-
self.handle_string_io().map(|r| (r, false))
|
164
|
-
} else if self.to_read.is_kind_of(gzip_reader_class) {
|
165
|
-
Err(ReaderError::UnsupportedGzipReader)
|
166
|
-
} else if self.to_read.is_kind_of(self.ruby.class_io()) {
|
167
|
-
self.handle_file_descriptor().map(|r| (r, true))
|
168
|
-
} else {
|
169
|
-
self.handle_file_path().map(|r| (r, false))
|
170
|
-
}
|
171
|
-
}
|
172
|
-
|
173
|
-
fn get_single_threaded_reader(&self) -> Result<Box<dyn Read>, ReaderError> {
|
174
|
-
let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
|
175
|
-
let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
|
176
|
-
|
177
|
-
if self.to_read.is_kind_of(string_io) {
|
178
|
-
self.handle_string_io().map(|r| -> Box<dyn Read> { r })
|
179
|
-
} else if self.to_read.is_kind_of(gzip_reader_class) {
|
180
|
-
Ok(Box::new(RubyReader::new(self.to_read)))
|
181
|
-
} else if self.to_read.is_kind_of(self.ruby.class_io()) {
|
182
|
-
self.handle_file_descriptor()
|
183
|
-
.map(|r| -> Box<dyn Read> { r })
|
184
|
-
} else {
|
185
|
-
self.handle_file_path().map(|r| -> Box<dyn Read> { r })
|
186
|
-
}
|
187
|
-
}
|
188
|
-
|
189
|
-
pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
|
190
|
-
match self.get_reader() {
|
191
|
-
Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
|
192
|
-
Err(_) => {
|
193
|
-
let readable = self.get_single_threaded_reader()?;
|
194
|
-
self.build_single_threaded(readable)
|
195
|
-
}
|
196
|
-
}
|
197
|
-
}
|
198
|
-
|
199
|
-
fn build_multi_threaded(
|
200
|
-
self,
|
201
|
-
readable: Box<dyn Read + Send + 'static>,
|
202
|
-
should_forget: bool,
|
203
|
-
) -> Result<RecordReader<T>, ReaderError> {
|
204
|
-
let flexible = self.flexible || self.flexible_default.is_some();
|
205
|
-
let mut reader = csv::ReaderBuilder::new()
|
206
|
-
.has_headers(self.has_headers)
|
207
|
-
.delimiter(self.delimiter)
|
208
|
-
.quote(self.quote_char)
|
209
|
-
.flexible(flexible)
|
210
|
-
.trim(self.trim)
|
211
|
-
.from_reader(readable);
|
212
|
-
|
213
|
-
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
214
|
-
let static_headers = StringCache::intern_many(&headers)?;
|
215
|
-
let headers_for_cleanup = static_headers.clone();
|
216
|
-
|
217
|
-
let (sender, receiver) = kanal::bounded(self.buffer);
|
218
|
-
let null_string = self.null_string.clone();
|
219
|
-
|
220
|
-
let flexible_default = self.flexible_default.clone();
|
221
|
-
let handle = thread::spawn(move || {
|
222
|
-
let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
|
223
|
-
while let Ok(true) = reader.read_record(&mut record) {
|
224
|
-
let row = T::parse(
|
225
|
-
&static_headers,
|
226
|
-
&record,
|
227
|
-
null_string.as_deref(),
|
228
|
-
flexible_default.as_deref(),
|
229
|
-
);
|
230
|
-
if sender.send(row).is_err() {
|
231
|
-
break;
|
232
|
-
}
|
233
|
-
}
|
234
|
-
if should_forget {
|
235
|
-
let file_to_forget = reader.into_inner();
|
236
|
-
std::mem::forget(file_to_forget);
|
237
|
-
}
|
238
|
-
});
|
239
|
-
|
240
|
-
Ok(RecordReader {
|
241
|
-
reader: ReadImpl::MultiThreaded {
|
242
|
-
headers: headers_for_cleanup,
|
243
|
-
receiver,
|
244
|
-
handle: Some(handle),
|
245
|
-
},
|
246
|
-
})
|
247
|
-
}
|
248
|
-
|
249
192
|
fn build_single_threaded(
|
250
193
|
self,
|
251
|
-
readable: Box<dyn Read>,
|
252
|
-
) -> Result<RecordReader<T>, ReaderError> {
|
194
|
+
readable: Box<dyn Read + 'a>,
|
195
|
+
) -> Result<RecordReader<'a, T>, ReaderError> {
|
253
196
|
let flexible = self.flexible || self.flexible_default.is_some();
|
254
197
|
let mut reader = csv::ReaderBuilder::new()
|
255
198
|
.has_headers(self.has_headers)
|
@@ -262,70 +205,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
|
|
262
205
|
let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
|
263
206
|
let static_headers = StringCache::intern_many(&headers)?;
|
264
207
|
|
265
|
-
Ok(RecordReader
|
266
|
-
reader
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
},
|
272
|
-
})
|
273
|
-
}
|
274
|
-
}
|
275
|
-
|
276
|
-
struct RubyReader {
|
277
|
-
inner: Value,
|
278
|
-
buffer: Option<Vec<u8>>,
|
279
|
-
offset: usize,
|
280
|
-
}
|
281
|
-
|
282
|
-
impl RubyReader {
|
283
|
-
fn new(inner: Value) -> Self {
|
284
|
-
Self {
|
285
|
-
inner,
|
286
|
-
buffer: None,
|
287
|
-
offset: 0,
|
288
|
-
}
|
289
|
-
}
|
290
|
-
}
|
291
|
-
|
292
|
-
// Read the entire inner into a vector and then read future reads from that vector with offset
|
293
|
-
impl Read for RubyReader {
|
294
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
295
|
-
// If we have an existing buffer, read from it
|
296
|
-
if let Some(buffer) = self.buffer.as_ref() {
|
297
|
-
let remaining = buffer.len() - self.offset;
|
298
|
-
let copy_size = remaining.min(buf.len());
|
299
|
-
buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
|
300
|
-
self.offset += copy_size;
|
301
|
-
return Ok(copy_size);
|
302
|
-
}
|
303
|
-
|
304
|
-
// No buffer yet - read the entire content from Ruby
|
305
|
-
let result = self.inner.funcall::<_, _, Value>("read", ());
|
306
|
-
match result {
|
307
|
-
Ok(data) => {
|
308
|
-
if data.is_nil() {
|
309
|
-
return Ok(0); // EOF
|
310
|
-
}
|
311
|
-
|
312
|
-
let string = RString::from_value(data).ok_or_else(|| {
|
313
|
-
io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
|
314
|
-
})?;
|
315
|
-
let bytes = unsafe { string.as_slice() };
|
316
|
-
|
317
|
-
// Store the entire content in the buffer
|
318
|
-
self.buffer = Some(bytes.to_vec());
|
319
|
-
self.offset = 0;
|
320
|
-
|
321
|
-
// Read initial chunk
|
322
|
-
let copy_size = bytes.len().min(buf.len());
|
323
|
-
buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
|
324
|
-
self.offset = copy_size;
|
325
|
-
|
326
|
-
Ok(copy_size)
|
327
|
-
}
|
328
|
-
Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
|
329
|
-
}
|
208
|
+
Ok(RecordReader::new_single_threaded(
|
209
|
+
reader,
|
210
|
+
static_headers,
|
211
|
+
self.null_string,
|
212
|
+
self.flexible_default,
|
213
|
+
))
|
330
214
|
}
|
331
215
|
}
|
data/ext/osv/src/csv/mod.rs
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
mod builder;
|
2
2
|
mod header_cache;
|
3
3
|
mod parser;
|
4
|
-
pub mod read_impl;
|
5
|
-
mod reader;
|
6
4
|
mod record;
|
5
|
+
mod record_reader;
|
6
|
+
mod ruby_reader;
|
7
7
|
|
8
8
|
pub use builder::RecordReaderBuilder;
|
9
9
|
pub(crate) use builder::BUFFER_CHANNEL_SIZE;
|
10
|
-
pub
|
10
|
+
pub use record::CowValue;
|
11
11
|
pub use record::CsvRecord;
|
12
|
+
pub(crate) use record_reader::READ_BUFFER_SIZE;
|