osv 0.3.3 → 0.3.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 33c644fac6e61f8bf3b9f11e646d6706017ece2a386df03a568b5ed06fd91e2a
- data.tar.gz: 0d977fb3a7eaf867663feb76161eb2d3fbe42e758523ef8cafbff51a9acfef0d
+ metadata.gz: 3b8a537d24e23250ff18da51bc59f9b4329b02e2fec9b881d4ee203a766e5514
+ data.tar.gz: dbdb8b1accd5897df7079adb98cc2ae50939f8ffeebf80c5095a06412dc69699
  SHA512:
- metadata.gz: b6fe382c005837fbfc705bd02b1859fdfa9fa9f955c15f7b13ffacb97f1d5dc0714288c23a7e2431a21aada6aacb1952b33ae8d14acd019c96ed719f5580c02d
- data.tar.gz: 4e7c3d783f23af709505a4c182d1ee6744c9f28226aecdf68c532ada9ffa653858bc730bfd879cfa94d8e8f7a1280c49380bb5b6f74ed16b7ac776318aa53d1b
+ metadata.gz: d32dc649748d62092414047bc5f7666cab3b1a4cbfb58a00edb2c0ec1a634d375e3c522ac716b57a6bb37292436433d867042b922383366a8da44a3500a6e2a4
+ data.tar.gz: deec4d1433d6da2b8242fae9c3ff2f17e5c56143f0c39d1f0ceaadd3351ef05641bb03f219a02c47b50a3e36c41048894963da41a0b1729cd153f579277cf448
data/Cargo.lock CHANGED
@@ -273,6 +273,7 @@ dependencies = [
  "rb-sys",
  "serde",
  "serde_magnus",
+ "thiserror",
  ]

  [[package]]
@@ -436,6 +437,26 @@ version = "1.0.1"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"

+ [[package]]
+ name = "thiserror"
+ version = "1.0.69"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+ dependencies = [
+ "thiserror-impl",
+ ]
+
+ [[package]]
+ name = "thiserror-impl"
+ version = "1.0.69"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+ dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ ]
+
  [[package]]
  name = "unicode-ident"
  version = "1.0.14"
data/Gemfile CHANGED
@@ -2,6 +2,13 @@ source "https://rubygems.org"

  gem "rb_sys", "~> 0.9.56"
  gem "rake"
- gem "rake-compiler", "1.2.0"
+ gem "csv"

- gem "minitest", "~> 5.0", group: :test
+ # Use local version of osv
+ gemspec
+
+ group :development, :test do
+ gem "minitest", "~> 5.0"
+ gem "benchmark-ips", "~> 2.12"
+ gem "fastcsv", "~> 0.0.7"
+ end
data/README.md CHANGED
@@ -111,3 +111,93 @@ OSV.for_each(data) { |row| puts row["name"] }
  ## Performance

  This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
+
+ Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
+
+ ### 10,000 lines
+
+ ```
+ Benchmarking with 10001 lines of data
+
+ ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
+ Warming up --------------------------------------
+ OSV - Hash output 6.000 i/100ms
+ CSV - Hash output 1.000 i/100ms
+ OSV - Array output 18.000 i/100ms
+ CSV - Array output 2.000 i/100ms
+ FastCSV - Array output
+ 9.000 i/100ms
+ OSV - StringIO 7.000 i/100ms
+ CSV - StringIO 1.000 i/100ms
+ FastCSV - StringIO 20.000 i/100ms
+ OSV - Gzipped 6.000 i/100ms
+ CSV - Gzipped 1.000 i/100ms
+ Calculating -------------------------------------
+ OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
+ CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
+ OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
+ CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
+ FastCSV - Array output
+ 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
+ OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
+ CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
+ FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
+ OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
+ CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
+
+ Comparison:
+ FastCSV - StringIO: 200.5 i/s
+ OSV - Array output: 189.7 i/s - same-ish: difference falls within error
+ FastCSV - Array output: 97.9 i/s - 2.05x slower
+ OSV - StringIO: 80.8 i/s - 2.48x slower
+ OSV - Hash output: 73.4 i/s - 2.73x slower
+ OSV - Gzipped: 55.2 i/s - 3.63x slower
+ CSV - Array output: 25.5 i/s - 7.87x slower
+ CSV - StringIO: 15.9 i/s - 12.63x slower
+ CSV - Gzipped: 12.6 i/s - 15.92x slower
+ CSV - Hash output: 11.9 i/s - 16.80x slower
+ ```
+
+ ### 1,000,000 lines
+
+ ```
+ Benchmarking with 1000001 lines of data
+
+ ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
+ Warming up --------------------------------------
+ OSV - Hash output 1.000 i/100ms
+ CSV - Hash output 1.000 i/100ms
+ OSV - Array output 1.000 i/100ms
+ CSV - Array output 1.000 i/100ms
+ FastCSV - Array output
+ 1.000 i/100ms
+ OSV - StringIO 1.000 i/100ms
+ CSV - StringIO 1.000 i/100ms
+ FastCSV - StringIO 1.000 i/100ms
+ OSV - Gzipped 1.000 i/100ms
+ CSV - Gzipped 1.000 i/100ms
+ Calculating -------------------------------------
+ OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
+ CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
+ OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
+ CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
+ FastCSV - Array output
+ 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
+ OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
+ CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
+ FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
+ CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
+
+ Comparison:
+ OSV - Array output: 1.1 i/s
+ FastCSV - StringIO: 1.0 i/s - 1.10x slower
+ FastCSV - Array output: 0.8 i/s - 1.49x slower
+ OSV - Hash output: 0.6 i/s - 1.98x slower
+ OSV - StringIO: 0.5 i/s - 2.19x slower
+ OSV - Gzipped: 0.4 i/s - 2.61x slower
+ CSV - Array output: 0.2 i/s - 4.86x slower
+ CSV - StringIO: 0.1 i/s - 8.67x slower
+ CSV - Hash output: 0.1 i/s - 9.79x slower
+ CSV - Gzipped: 0.1 i/s - 9.91x slower
+ ```
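For context, a comparison of this shape can be reproduced with the benchmark-ips gem that this release adds to the Gemfile. The sketch below is hypothetical and is not the contents of benchmark/comparison_benchmark.rb; it assumes the OSV.for_each API shown earlier in the README and pits it against the standard csv gem on an in-memory string.

```ruby
require "benchmark/ips"
require "csv"
require "osv"
require "stringio"

# Build a small CSV payload in memory (header plus 10,000 rows).
data = "id,name\n" + 10_000.times.map { |i| "#{i},user#{i}" }.join("\n")

Benchmark.ips do |x|
  # Assumes OSV.for_each accepts an IO-like input and yields header-keyed rows,
  # as shown in the README usage example.
  x.report("OSV - Hash output") do
    OSV.for_each(StringIO.new(data)) { |row| row }
  end

  # Ruby's bundled CSV parser, also producing header-keyed rows.
  x.report("CSV - Hash output") do
    CSV.parse(data, headers: true).each { |row| row }
  end

  x.compare!
end
```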
data/ext/osv/Cargo.toml CHANGED
@@ -14,3 +14,4 @@ magnus = { version = "0.7", features = ["rb-sys"] }
  rb-sys = "^0.9"
  serde = { version = "1.0", features = ["derive"] }
  serde_magnus = "0.8.1"
+ thiserror = "1.0"
@@ -1,11 +1,50 @@
  use super::{
- header_cache::StringCache,
+ header_cache::{CacheError, StringCache},
  parser::RecordParser,
- reader::{ReadImpl, RecordReader},
+ read_impl::ReadImpl,
+ reader::RecordReader,
  };
  use flate2::read::GzDecoder;
- use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
- use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
+ use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
+ use std::{
+ fs::File,
+ io::{self, Read},
+ marker::PhantomData,
+ os::fd::FromRawFd,
+ thread,
+ };
+ use thiserror::Error;
+
+ #[derive(Error, Debug)]
+ pub enum ReaderError {
+ #[error("Failed to get file descriptor: {0}")]
+ FileDescriptor(String),
+ #[error("Invalid file descriptor")]
+ InvalidFileDescriptor,
+ #[error("Failed to open file: {0}")]
+ FileOpen(#[from] io::Error),
+ #[error("Failed to intern headers: {0}")]
+ HeaderIntern(#[from] CacheError),
+ #[error("Unsupported GzipReader")]
+ UnsupportedGzipReader,
+ #[error("Ruby error: {0}")]
+ Ruby(String),
+ }
+
+ impl From<MagnusError> for ReaderError {
+ fn from(err: MagnusError) -> Self {
+ Self::Ruby(err.to_string())
+ }
+ }
+
+ impl From<ReaderError> for MagnusError {
+ fn from(err: ReaderError) -> Self {
+ MagnusError::new(
+ Ruby::get().unwrap().exception_runtime_error(),
+ err.to_string(),
+ )
+ }
+ }

  pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
  ruby: &'a Ruby,
@@ -57,36 +96,83 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
  self
  }

- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
+ fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+ let string: RString = self.to_read.funcall("string", ())?;
+ let content = string.to_string()?;
+ Ok(Box::new(std::io::Cursor::new(content)))
+ }
+
+ fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+ let raw_value = self.to_read.as_raw();
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+ .map_err(|_| {
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+ })?;
+
+ if fd < 0 {
+ return Err(ReaderError::InvalidFileDescriptor);
+ }
+
+ let file = unsafe { File::from_raw_fd(fd) };
+ Ok(Box::new(file))
+ }
+
+ fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+ let path = self.to_read.to_r_string()?.to_string()?;
+ let file = File::open(&path)?;
+
+ Ok(if path.ends_with(".gz") {
+ Box::new(GzDecoder::new(file))
+ } else {
+ Box::new(file)
+ })
+ }
+
+ fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+ let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;

  if self.to_read.is_kind_of(string_io) {
- let string: RString = self.to_read.funcall("string", ())?;
- let content = string.to_string()?;
- Ok(Box::new(std::io::Cursor::new(content)))
+ self.handle_string_io()
+ } else if self.to_read.is_kind_of(gzip_reader_class) {
+ Err(ReaderError::UnsupportedGzipReader)
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
- let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
- let file = unsafe { File::from_raw_fd(fd) };
- Ok(Box::new(file))
+ self.handle_file_descriptor()
  } else {
- let path = self.to_read.to_r_string()?.to_string()?;
- let file = std::fs::File::open(&path).map_err(|e| {
- Error::new(
- self.ruby.exception_runtime_error(),
- format!("Failed to open file: {e}"),
- )
- })?;
- if path.ends_with(".gz") {
- let file = GzDecoder::new(file);
- Ok(Box::new(file))
- } else {
- Ok(Box::new(file))
+ self.handle_file_path()
+ }
+ }
+
+ fn get_single_threaded_reader(&self) -> Result<Box<dyn Read>, ReaderError> {
+ let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
+ let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
+
+ if self.to_read.is_kind_of(string_io) {
+ self.handle_string_io().map(|r| -> Box<dyn Read> { r })
+ } else if self.to_read.is_kind_of(gzip_reader_class) {
+ Ok(Box::new(RubyReader::new(self.to_read)))
+ } else if self.to_read.is_kind_of(self.ruby.class_io()) {
+ self.handle_file_descriptor()
+ .map(|r| -> Box<dyn Read> { r })
+ } else {
+ self.handle_file_path().map(|r| -> Box<dyn Read> { r })
+ }
+ }
+
+ pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
+ match self.get_reader() {
+ Ok(readable) => self.build_multi_threaded(readable),
+ Err(_) => {
+ let readable = self.get_single_threaded_reader()?;
+ self.build_single_threaded(readable)
  }
  }
  }

- pub fn build(self) -> Result<RecordReader<T>, Error> {
- let readable = self.get_reader()?;
+ fn build_multi_threaded(
+ self,
+ readable: Box<dyn Read + Send + 'static>,
+ ) -> Result<RecordReader<T>, ReaderError> {
  let mut reader = csv::ReaderBuilder::new()
  .has_headers(self.has_headers)
  .delimiter(self.delimiter)
@@ -94,17 +180,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
  .from_reader(readable);

  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
- let null_string = self.null_string;
-
- let static_headers = StringCache::intern_many(&headers).map_err(|e| {
- Error::new(
- self.ruby.exception_runtime_error(),
- format!("Failed to intern headers: {e}"),
- )
- })?;
+ let static_headers = StringCache::intern_many(&headers)?;
  let headers_for_cleanup = static_headers.clone();

  let (sender, receiver) = kanal::bounded(self.buffer);
+ let null_string = self.null_string.clone();
+
  let handle = thread::spawn(move || {
  let mut record = csv::StringRecord::new();
  while let Ok(true) = reader.read_record(&mut record) {
@@ -125,4 +206,58 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
  },
  })
  }
+
+ fn build_single_threaded(
+ self,
+ readable: Box<dyn Read>,
+ ) -> Result<RecordReader<T>, ReaderError> {
+ let mut reader = csv::ReaderBuilder::new()
+ .has_headers(self.has_headers)
+ .delimiter(self.delimiter)
+ .quote(self.quote_char)
+ .from_reader(readable);
+
+ let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+ let static_headers = StringCache::intern_many(&headers)?;
+
+ Ok(RecordReader {
+ reader: ReadImpl::SingleThreaded {
+ reader,
+ headers: static_headers,
+ null_string: self.null_string,
+ },
+ })
+ }
+ }
+
+ struct RubyReader {
+ inner: Value,
+ }
+
+ impl RubyReader {
+ fn new(inner: Value) -> Self {
+ Self { inner }
+ }
+ }
+
+ impl Read for RubyReader {
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+ let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
+ match result {
+ Ok(data) => {
+ if data.is_nil() {
+ return Ok(0);
+ }
+
+ let string = RString::from_value(data).ok_or_else(|| {
+ io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
+ })?;
+ let bytes = unsafe { string.as_slice() };
+ let len = bytes.len().min(buf.len());
+ buf[..len].copy_from_slice(&bytes[..len]);
+ Ok(len)
+ }
+ Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
+ }
+ }
  }
@@ -4,22 +4,29 @@
  ///
  /// Note: Performance testing on macOS showed minimal speed improvements,
  /// so this optimization could be removed if any issues arise.
-
-
  use std::{
  collections::HashMap,
  sync::{atomic::AtomicU32, LazyLock, Mutex},
  };
+ use thiserror::Error;
+
+ #[derive(Debug, Error)]
+ pub enum CacheError {
+ #[error("Failed to acquire lock: {0}")]
+ LockError(String),
+ }

  static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

- pub struct StringCache {}
+ pub struct StringCache;

  impl StringCache {
  #[allow(dead_code)]
- pub fn intern(string: String) -> Result<&'static str, String> {
- let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
+ pub fn intern(string: String) -> Result<&'static str, CacheError> {
+ let mut cache = STRING_CACHE
+ .lock()
+ .map_err(|e| CacheError::LockError(e.to_string()))?;

  if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
  count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -31,33 +38,36 @@ impl StringCache {
  }
  }

- pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
- let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
- let mut result = Vec::with_capacity(strings.len());
+ pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+ let mut cache = STRING_CACHE
+ .lock()
+ .map_err(|e| CacheError::LockError(e.to_string()))?;

+ let mut result = Vec::with_capacity(strings.len());
  for string in strings {
- let static_str: &'static str =
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
- existing
- } else {
- let leaked = Box::leak(string.clone().into_boxed_str());
- cache.insert(leaked, AtomicU32::new(1));
- leaked
- };
- result.push(static_str);
+ if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+ count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+ result.push(existing);
+ } else {
+ let leaked = Box::leak(string.clone().into_boxed_str());
+ cache.insert(leaked, AtomicU32::new(1));
+ result.push(leaked);
+ }
  }
-
  Ok(result)
  }

- pub fn clear(headers: &[&'static str]) -> Result<(), String> {
- let cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
+ pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+ let mut cache = STRING_CACHE
+ .lock()
+ .map_err(|e| CacheError::LockError(e.to_string()))?;

  for header in headers {
  if let Some(count) = cache.get(header) {
- let remaining = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
- if remaining == 0 {
+ // Returns the previous value of the counter
+ let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+ if was == 1 {
+ cache.remove(header);
  let ptr = *header as *const str as *mut str;
  unsafe {
  let _ = Box::from_raw(ptr);
@@ -1,6 +1,7 @@
  mod builder;
  mod header_cache;
  mod parser;
+ pub mod read_impl;
  mod reader;
  mod record;

@@ -2,6 +2,7 @@ use std::collections::HashMap;

  pub trait RecordParser {
  type Output;
+
  fn parse(
  headers: &[&'static str],
  record: &csv::StringRecord,
@@ -11,41 +12,58 @@ pub trait RecordParser {

  impl RecordParser for HashMap<&'static str, Option<String>> {
  type Output = Self;
+
+ #[inline]
  fn parse(
  headers: &[&'static str],
  record: &csv::StringRecord,
  null_string: &str,
  ) -> Self::Output {
  let mut map = HashMap::with_capacity(headers.len());
- for (header, field) in headers.iter().zip(record.iter()) {
- map.insert(
- *header,
- if field == null_string {
- None
- } else {
- Some(field.to_string())
- },
- );
- }
+ headers
+ .iter()
+ .zip(record.iter())
+ .for_each(|(header, field)| {
+ map.insert(
+ *header,
+ if field == null_string {
+ None
+ } else {
+ // Avoid allocating for empty strings
+ if field.is_empty() {
+ Some(String::new())
+ } else {
+ Some(field.to_string())
+ }
+ },
+ );
+ });
  map
  }
  }

  impl RecordParser for Vec<Option<String>> {
  type Output = Self;
+
+ #[inline]
  fn parse(
  _headers: &[&'static str],
  record: &csv::StringRecord,
  null_string: &str,
  ) -> Self::Output {
  let mut vec = Vec::with_capacity(record.len());
- for field in record.iter() {
- vec.push(if field == null_string {
+ vec.extend(record.iter().map(|field| {
+ if field == null_string {
  None
  } else {
- Some(field.to_string())
- });
- }
+ // Avoid allocating for empty strings
+ if field.is_empty() {
+ Some(String::new())
+ } else {
+ Some(field.to_string())
+ }
+ }
+ }));
  vec
  }
  }
@@ -0,0 +1,65 @@
+ use super::{header_cache::StringCache, parser::RecordParser};
+ use std::{io::Read, thread};
+
+ pub enum ReadImpl<T: RecordParser> {
+ SingleThreaded {
+ reader: csv::Reader<Box<dyn Read>>,
+ headers: Vec<&'static str>,
+ null_string: String,
+ },
+ MultiThreaded {
+ headers: Vec<&'static str>,
+ receiver: kanal::Receiver<T::Output>,
+ handle: Option<thread::JoinHandle<()>>,
+ },
+ }
+
+ impl<T: RecordParser> ReadImpl<T> {
+ #[inline]
+ pub fn next(&mut self) -> Option<T::Output> {
+ match self {
+ Self::MultiThreaded {
+ receiver, handle, ..
+ } => match receiver.recv() {
+ Ok(record) => Some(record),
+ Err(_) => {
+ if let Some(handle) = handle.take() {
+ let _ = handle.join();
+ }
+ None
+ }
+ },
+ Self::SingleThreaded {
+ reader,
+ headers,
+ null_string,
+ } => {
+ let mut record = csv::StringRecord::new();
+ match reader.read_record(&mut record) {
+ Ok(true) => Some(T::parse(headers, &record, null_string)),
+ _ => None,
+ }
+ }
+ }
+ }
+
+ #[inline]
+ pub fn cleanup(&mut self) {
+ match self {
+ Self::MultiThreaded {
+ receiver,
+ handle,
+ headers,
+ } => {
+ receiver.close();
+ if let Some(handle) = handle.take() {
+ let _ = handle.join();
+ }
+ let _ = StringCache::clear(headers);
+ }
+ Self::SingleThreaded { headers, .. } => {
+ let _ = StringCache::clear(headers);
+ }
+ }
+ }
+ }
@@ -1,66 +1,35 @@
- use super::{header_cache::StringCache, parser::RecordParser};
+ use super::{parser::RecordParser, read_impl::ReadImpl};
  use magnus::{Error, Ruby};
- use std::{io::Read, thread};
+ use std::{borrow::Cow, io::Read};

  pub struct RecordReader<T: RecordParser> {
  pub(crate) reader: ReadImpl<T>,
  }

- impl<T: RecordParser> Drop for RecordReader<T> {
- fn drop(&mut self) {
- match &mut self.reader {
- ReadImpl::MultiThreaded {
- receiver,
- handle,
- headers,
- } => {
- receiver.close();
- if let Some(handle) = handle.take() {
- let _ = handle.join();
- }
- StringCache::clear(headers).unwrap();
- }
- ReadImpl::SingleThreaded { headers, .. } => {
- StringCache::clear(headers).unwrap();
- }
- }
- }
- }
-
- #[allow(dead_code)]
- pub enum ReadImpl<T: RecordParser> {
- SingleThreaded {
- reader: csv::Reader<Box<dyn Read + Send + 'static>>,
- headers: Vec<&'static str>,
- null_string: String,
- },
- MultiThreaded {
- headers: Vec<&'static str>,
- receiver: kanal::Receiver<T::Output>,
- handle: Option<thread::JoinHandle<()>>,
- },
- }
-
  impl<T: RecordParser> RecordReader<T> {
+ #[inline]
  pub(crate) fn get_headers(
  ruby: &Ruby,
  reader: &mut csv::Reader<impl Read>,
  has_headers: bool,
  ) -> Result<Vec<String>, Error> {
- let first_row = reader
- .headers()
- .map_err(|e| {
- Error::new(
- ruby.exception_runtime_error(),
- format!("Failed to read headers: {e}"),
- )
- })?
- .clone();
+ let first_row = reader.headers().map_err(|e| {
+ Error::new(
+ ruby.exception_runtime_error(),
+ Cow::Owned(format!("Failed to read headers: {e}")),
+ )
+ })?;

  Ok(if has_headers {
- first_row.iter().map(String::from).collect()
+ // Pre-allocate the vector with exact capacity
+ let mut headers = Vec::with_capacity(first_row.len());
+ headers.extend(first_row.iter().map(String::from));
+ headers
  } else {
- (0..first_row.len()).map(|i| format!("c{i}")).collect()
+ // Pre-allocate the vector with exact capacity
+ let mut headers = Vec::with_capacity(first_row.len());
+ headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
+ headers
  })
  }
  }
@@ -68,30 +37,21 @@ impl<T: RecordParser> RecordReader<T> {
  impl<T: RecordParser> Iterator for RecordReader<T> {
  type Item = T::Output;

+ #[inline]
  fn next(&mut self) -> Option<Self::Item> {
- match &mut self.reader {
- ReadImpl::MultiThreaded {
- receiver, handle, ..
- } => match receiver.recv() {
- Ok(record) => Some(record),
- Err(_) => {
- if let Some(handle) = handle.take() {
- let _ = handle.join();
- }
- None
- }
- },
- ReadImpl::SingleThreaded {
- reader,
- headers,
- null_string,
- } => {
- let mut record = csv::StringRecord::new();
- match reader.read_record(&mut record) {
- Ok(true) => Some(T::parse(headers, &record, null_string)),
- _ => None,
- }
- }
- }
+ self.reader.next()
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ // We can't know the exact size without reading the whole file
+ (0, None)
+ }
+ }
+
+ impl<T: RecordParser> Drop for RecordReader<T> {
+ #[inline]
+ fn drop(&mut self) {
+ self.reader.cleanup();
  }
  }
@@ -1,4 +1,4 @@
- use magnus::{IntoValue, RHash, Ruby, Value};
+ use magnus::{IntoValue, Ruby, Value};
  use std::collections::HashMap;

  #[derive(Debug)]
@@ -8,14 +8,16 @@ pub enum CsvRecord {
  }

  impl IntoValue for CsvRecord {
+ #[inline]
  fn into_value_with(self, handle: &Ruby) -> Value {
  match self {
  CsvRecord::Vec(vec) => vec.into_value_with(handle),
  CsvRecord::Map(map) => {
- let hash = RHash::new();
- for (k, v) in map {
- hash.aset(k, v).unwrap();
- }
+ // Pre-allocate the hash with the known size
+ let hash = handle.hash_new_capa(map.len());
+ map.into_iter()
+ .try_for_each(|(k, v)| hash.aset(k, v))
+ .unwrap();
  hash.into_value_with(handle)
  }
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
  module OSV
- VERSION = "0.3.3"
+ VERSION = "0.3.4"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: osv
  version: !ruby/object:Gem::Version
- version: 0.3.3
+ version: 0.3.4
  platform: ruby
  authors:
  - Nathan Jaremko
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-12-23 00:00:00.000000000 Z
+ date: 2024-12-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rb_sys
@@ -59,6 +59,7 @@ files:
  - ext/osv/src/csv/header_cache.rs
  - ext/osv/src/csv/mod.rs
  - ext/osv/src/csv/parser.rs
+ - ext/osv/src/csv/read_impl.rs
  - ext/osv/src/csv/reader.rs
  - ext/osv/src/csv/record.rs
  - ext/osv/src/lib.rs