osv 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 33c644fac6e61f8bf3b9f11e646d6706017ece2a386df03a568b5ed06fd91e2a
4
- data.tar.gz: 0d977fb3a7eaf867663feb76161eb2d3fbe42e758523ef8cafbff51a9acfef0d
3
+ metadata.gz: 3b8a537d24e23250ff18da51bc59f9b4329b02e2fec9b881d4ee203a766e5514
4
+ data.tar.gz: dbdb8b1accd5897df7079adb98cc2ae50939f8ffeebf80c5095a06412dc69699
5
5
  SHA512:
6
- metadata.gz: b6fe382c005837fbfc705bd02b1859fdfa9fa9f955c15f7b13ffacb97f1d5dc0714288c23a7e2431a21aada6aacb1952b33ae8d14acd019c96ed719f5580c02d
7
- data.tar.gz: 4e7c3d783f23af709505a4c182d1ee6744c9f28226aecdf68c532ada9ffa653858bc730bfd879cfa94d8e8f7a1280c49380bb5b6f74ed16b7ac776318aa53d1b
6
+ metadata.gz: d32dc649748d62092414047bc5f7666cab3b1a4cbfb58a00edb2c0ec1a634d375e3c522ac716b57a6bb37292436433d867042b922383366a8da44a3500a6e2a4
7
+ data.tar.gz: deec4d1433d6da2b8242fae9c3ff2f17e5c56143f0c39d1f0ceaadd3351ef05641bb03f219a02c47b50a3e36c41048894963da41a0b1729cd153f579277cf448
data/Cargo.lock CHANGED
@@ -273,6 +273,7 @@ dependencies = [
273
273
  "rb-sys",
274
274
  "serde",
275
275
  "serde_magnus",
276
+ "thiserror",
276
277
  ]
277
278
 
278
279
  [[package]]
@@ -436,6 +437,26 @@ version = "1.0.1"
436
437
  source = "registry+https://github.com/rust-lang/crates.io-index"
437
438
  checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
438
439
 
440
+ [[package]]
441
+ name = "thiserror"
442
+ version = "1.0.69"
443
+ source = "registry+https://github.com/rust-lang/crates.io-index"
444
+ checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
445
+ dependencies = [
446
+ "thiserror-impl",
447
+ ]
448
+
449
+ [[package]]
450
+ name = "thiserror-impl"
451
+ version = "1.0.69"
452
+ source = "registry+https://github.com/rust-lang/crates.io-index"
453
+ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
454
+ dependencies = [
455
+ "proc-macro2",
456
+ "quote",
457
+ "syn",
458
+ ]
459
+
439
460
  [[package]]
440
461
  name = "unicode-ident"
441
462
  version = "1.0.14"
data/Gemfile CHANGED
@@ -2,6 +2,13 @@ source "https://rubygems.org"
2
2
 
3
3
  gem "rb_sys", "~> 0.9.56"
4
4
  gem "rake"
5
- gem "rake-compiler", "1.2.0"
5
+ gem "csv"
6
6
 
7
- gem "minitest", "~> 5.0", group: :test
7
+ # Use local version of osv
8
+ gemspec
9
+
10
+ group :development, :test do
11
+ gem "minitest", "~> 5.0"
12
+ gem "benchmark-ips", "~> 2.12"
13
+ gem "fastcsv", "~> 0.0.7"
14
+ end
data/README.md CHANGED
@@ -111,3 +111,93 @@ OSV.for_each(data) { |row| puts row["name"] }
111
111
  ## Performance
112
112
 
113
113
  This library is faster than the standard Ruby CSV library, and is comparable to the fastest CSV parser gems I've used.
114
+
115
+ Here are some unscientific benchmarks. You can find the code in the [benchmark/comparison_benchmark.rb](benchmark/comparison_benchmark.rb) file.
116
+
117
+ ### 10,000 lines
118
+
119
+ ```
120
+ Benchmarking with 10001 lines of data
121
+
122
+ ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
123
+ Warming up --------------------------------------
124
+ OSV - Hash output 6.000 i/100ms
125
+ CSV - Hash output 1.000 i/100ms
126
+ OSV - Array output 18.000 i/100ms
127
+ CSV - Array output 2.000 i/100ms
128
+ FastCSV - Array output
129
+ 9.000 i/100ms
130
+ OSV - StringIO 7.000 i/100ms
131
+ CSV - StringIO 1.000 i/100ms
132
+ FastCSV - StringIO 20.000 i/100ms
133
+ OSV - Gzipped 6.000 i/100ms
134
+ CSV - Gzipped 1.000 i/100ms
135
+ Calculating -------------------------------------
136
+ OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
137
+ CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
138
+ OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
139
+ CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
140
+ FastCSV - Array output
141
+ 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
142
+ OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
143
+ CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
144
+ FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
145
+ OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
146
+ CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
147
+
148
+ Comparison:
149
+ FastCSV - StringIO: 200.5 i/s
150
+ OSV - Array output: 189.7 i/s - same-ish: difference falls within error
151
+ FastCSV - Array output: 97.9 i/s - 2.05x slower
152
+ OSV - StringIO: 80.8 i/s - 2.48x slower
153
+ OSV - Hash output: 73.4 i/s - 2.73x slower
154
+ OSV - Gzipped: 55.2 i/s - 3.63x slower
155
+ CSV - Array output: 25.5 i/s - 7.87x slower
156
+ CSV - StringIO: 15.9 i/s - 12.63x slower
157
+ CSV - Gzipped: 12.6 i/s - 15.92x slower
158
+ CSV - Hash output: 11.9 i/s - 16.80x slower
159
+ ```
160
+
161
+ ### 1,000,000 lines
162
+
163
+ ```
164
+ Benchmarking with 1000001 lines of data
165
+
166
+ ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
167
+ Warming up --------------------------------------
168
+ OSV - Hash output 1.000 i/100ms
169
+ CSV - Hash output 1.000 i/100ms
170
+ OSV - Array output 1.000 i/100ms
171
+ CSV - Array output 1.000 i/100ms
172
+ FastCSV - Array output
173
+ 1.000 i/100ms
174
+ OSV - StringIO 1.000 i/100ms
175
+ CSV - StringIO 1.000 i/100ms
176
+ FastCSV - StringIO 1.000 i/100ms
177
+ OSV - Gzipped 1.000 i/100ms
178
+ CSV - Gzipped 1.000 i/100ms
179
+ Calculating -------------------------------------
180
+ OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
181
+ CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
182
+ OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
183
+ CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
184
+ FastCSV - Array output
185
+ 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
186
+ OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
187
+ CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
188
+ FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
189
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
190
+ CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
191
+
192
+ Comparison:
193
+ OSV - Array output: 1.1 i/s
194
+ FastCSV - StringIO: 1.0 i/s - 1.10x slower
195
+ FastCSV - Array output: 0.8 i/s - 1.49x slower
196
+ OSV - Hash output: 0.6 i/s - 1.98x slower
197
+ OSV - StringIO: 0.5 i/s - 2.19x slower
198
+ OSV - Gzipped: 0.4 i/s - 2.61x slower
199
+ CSV - Array output: 0.2 i/s - 4.86x slower
200
+ CSV - StringIO: 0.1 i/s - 8.67x slower
201
+ CSV - Hash output: 0.1 i/s - 9.79x slower
202
+ CSV - Gzipped: 0.1 i/s - 9.91x slower
203
+ ```
data/ext/osv/Cargo.toml CHANGED
@@ -14,3 +14,4 @@ magnus = { version = "0.7", features = ["rb-sys"] }
14
14
  rb-sys = "^0.9"
15
15
  serde = { version = "1.0", features = ["derive"] }
16
16
  serde_magnus = "0.8.1"
17
+ thiserror = "1.0"
@@ -1,11 +1,50 @@
1
1
  use super::{
2
- header_cache::StringCache,
2
+ header_cache::{CacheError, StringCache},
3
3
  parser::RecordParser,
4
- reader::{ReadImpl, RecordReader},
4
+ read_impl::ReadImpl,
5
+ reader::RecordReader,
5
6
  };
6
7
  use flate2::read::GzDecoder;
7
- use magnus::{rb_sys::AsRawValue, value::ReprValue, Error, RString, Ruby, Value};
8
- use std::{fs::File, io::Read, marker::PhantomData, os::fd::FromRawFd, thread};
8
+ use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
+ use std::{
10
+ fs::File,
11
+ io::{self, Read},
12
+ marker::PhantomData,
13
+ os::fd::FromRawFd,
14
+ thread,
15
+ };
16
+ use thiserror::Error;
17
+
18
+ #[derive(Error, Debug)]
19
+ pub enum ReaderError {
20
+ #[error("Failed to get file descriptor: {0}")]
21
+ FileDescriptor(String),
22
+ #[error("Invalid file descriptor")]
23
+ InvalidFileDescriptor,
24
+ #[error("Failed to open file: {0}")]
25
+ FileOpen(#[from] io::Error),
26
+ #[error("Failed to intern headers: {0}")]
27
+ HeaderIntern(#[from] CacheError),
28
+ #[error("Unsupported GzipReader")]
29
+ UnsupportedGzipReader,
30
+ #[error("Ruby error: {0}")]
31
+ Ruby(String),
32
+ }
33
+
34
+ impl From<MagnusError> for ReaderError {
35
+ fn from(err: MagnusError) -> Self {
36
+ Self::Ruby(err.to_string())
37
+ }
38
+ }
39
+
40
+ impl From<ReaderError> for MagnusError {
41
+ fn from(err: ReaderError) -> Self {
42
+ MagnusError::new(
43
+ Ruby::get().unwrap().exception_runtime_error(),
44
+ err.to_string(),
45
+ )
46
+ }
47
+ }
9
48
 
10
49
  pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
11
50
  ruby: &'a Ruby,
@@ -57,36 +96,83 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
57
96
  self
58
97
  }
59
98
 
60
- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, Error> {
99
+ fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
100
+ let string: RString = self.to_read.funcall("string", ())?;
101
+ let content = string.to_string()?;
102
+ Ok(Box::new(std::io::Cursor::new(content)))
103
+ }
104
+
105
+ fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
106
+ let raw_value = self.to_read.as_raw();
107
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
108
+ .map_err(|_| {
109
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
110
+ })?;
111
+
112
+ if fd < 0 {
113
+ return Err(ReaderError::InvalidFileDescriptor);
114
+ }
115
+
116
+ let file = unsafe { File::from_raw_fd(fd) };
117
+ Ok(Box::new(file))
118
+ }
119
+
120
+ fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
121
+ let path = self.to_read.to_r_string()?.to_string()?;
122
+ let file = File::open(&path)?;
123
+
124
+ Ok(if path.ends_with(".gz") {
125
+ Box::new(GzDecoder::new(file))
126
+ } else {
127
+ Box::new(file)
128
+ })
129
+ }
130
+
131
+ fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
61
132
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
133
+ let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
62
134
 
63
135
  if self.to_read.is_kind_of(string_io) {
64
- let string: RString = self.to_read.funcall("string", ())?;
65
- let content = string.to_string()?;
66
- Ok(Box::new(std::io::Cursor::new(content)))
136
+ self.handle_string_io()
137
+ } else if self.to_read.is_kind_of(gzip_reader_class) {
138
+ Err(ReaderError::UnsupportedGzipReader)
67
139
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
68
- let fd = unsafe { rb_sys::rb_io_descriptor(self.to_read.as_raw()) };
69
- let file = unsafe { File::from_raw_fd(fd) };
70
- Ok(Box::new(file))
140
+ self.handle_file_descriptor()
71
141
  } else {
72
- let path = self.to_read.to_r_string()?.to_string()?;
73
- let file = std::fs::File::open(&path).map_err(|e| {
74
- Error::new(
75
- self.ruby.exception_runtime_error(),
76
- format!("Failed to open file: {e}"),
77
- )
78
- })?;
79
- if path.ends_with(".gz") {
80
- let file = GzDecoder::new(file);
81
- Ok(Box::new(file))
82
- } else {
83
- Ok(Box::new(file))
142
+ self.handle_file_path()
143
+ }
144
+ }
145
+
146
+ fn get_single_threaded_reader(&self) -> Result<Box<dyn Read>, ReaderError> {
147
+ let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
148
+ let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
149
+
150
+ if self.to_read.is_kind_of(string_io) {
151
+ self.handle_string_io().map(|r| -> Box<dyn Read> { r })
152
+ } else if self.to_read.is_kind_of(gzip_reader_class) {
153
+ Ok(Box::new(RubyReader::new(self.to_read)))
154
+ } else if self.to_read.is_kind_of(self.ruby.class_io()) {
155
+ self.handle_file_descriptor()
156
+ .map(|r| -> Box<dyn Read> { r })
157
+ } else {
158
+ self.handle_file_path().map(|r| -> Box<dyn Read> { r })
159
+ }
160
+ }
161
+
162
+ pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
163
+ match self.get_reader() {
164
+ Ok(readable) => self.build_multi_threaded(readable),
165
+ Err(_) => {
166
+ let readable = self.get_single_threaded_reader()?;
167
+ self.build_single_threaded(readable)
84
168
  }
85
169
  }
86
170
  }
87
171
 
88
- pub fn build(self) -> Result<RecordReader<T>, Error> {
89
- let readable = self.get_reader()?;
172
+ fn build_multi_threaded(
173
+ self,
174
+ readable: Box<dyn Read + Send + 'static>,
175
+ ) -> Result<RecordReader<T>, ReaderError> {
90
176
  let mut reader = csv::ReaderBuilder::new()
91
177
  .has_headers(self.has_headers)
92
178
  .delimiter(self.delimiter)
@@ -94,17 +180,12 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
94
180
  .from_reader(readable);
95
181
 
96
182
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
97
- let null_string = self.null_string;
98
-
99
- let static_headers = StringCache::intern_many(&headers).map_err(|e| {
100
- Error::new(
101
- self.ruby.exception_runtime_error(),
102
- format!("Failed to intern headers: {e}"),
103
- )
104
- })?;
183
+ let static_headers = StringCache::intern_many(&headers)?;
105
184
  let headers_for_cleanup = static_headers.clone();
106
185
 
107
186
  let (sender, receiver) = kanal::bounded(self.buffer);
187
+ let null_string = self.null_string.clone();
188
+
108
189
  let handle = thread::spawn(move || {
109
190
  let mut record = csv::StringRecord::new();
110
191
  while let Ok(true) = reader.read_record(&mut record) {
@@ -125,4 +206,58 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
125
206
  },
126
207
  })
127
208
  }
209
+
210
+ fn build_single_threaded(
211
+ self,
212
+ readable: Box<dyn Read>,
213
+ ) -> Result<RecordReader<T>, ReaderError> {
214
+ let mut reader = csv::ReaderBuilder::new()
215
+ .has_headers(self.has_headers)
216
+ .delimiter(self.delimiter)
217
+ .quote(self.quote_char)
218
+ .from_reader(readable);
219
+
220
+ let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
221
+ let static_headers = StringCache::intern_many(&headers)?;
222
+
223
+ Ok(RecordReader {
224
+ reader: ReadImpl::SingleThreaded {
225
+ reader,
226
+ headers: static_headers,
227
+ null_string: self.null_string,
228
+ },
229
+ })
230
+ }
231
+ }
232
+
233
+ struct RubyReader {
234
+ inner: Value,
235
+ }
236
+
237
+ impl RubyReader {
238
+ fn new(inner: Value) -> Self {
239
+ Self { inner }
240
+ }
241
+ }
242
+
243
+ impl Read for RubyReader {
244
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
245
+ let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
246
+ match result {
247
+ Ok(data) => {
248
+ if data.is_nil() {
249
+ return Ok(0);
250
+ }
251
+
252
+ let string = RString::from_value(data).ok_or_else(|| {
253
+ io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
254
+ })?;
255
+ let bytes = unsafe { string.as_slice() };
256
+ let len = bytes.len().min(buf.len());
257
+ buf[..len].copy_from_slice(&bytes[..len]);
258
+ Ok(len)
259
+ }
260
+ Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
261
+ }
262
+ }
128
263
  }
@@ -4,22 +4,29 @@
4
4
  ///
5
5
  /// Note: Performance testing on macOS showed minimal speed improvements,
6
6
  /// so this optimization could be removed if any issues arise.
7
-
8
-
9
7
  use std::{
10
8
  collections::HashMap,
11
9
  sync::{atomic::AtomicU32, LazyLock, Mutex},
12
10
  };
11
+ use thiserror::Error;
12
+
13
+ #[derive(Debug, Error)]
14
+ pub enum CacheError {
15
+ #[error("Failed to acquire lock: {0}")]
16
+ LockError(String),
17
+ }
13
18
 
14
19
  static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
15
20
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
16
21
 
17
- pub struct StringCache {}
22
+ pub struct StringCache;
18
23
 
19
24
  impl StringCache {
20
25
  #[allow(dead_code)]
21
- pub fn intern(string: String) -> Result<&'static str, String> {
22
- let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
26
+ pub fn intern(string: String) -> Result<&'static str, CacheError> {
27
+ let mut cache = STRING_CACHE
28
+ .lock()
29
+ .map_err(|e| CacheError::LockError(e.to_string()))?;
23
30
 
24
31
  if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
25
32
  count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -31,33 +38,36 @@ impl StringCache {
31
38
  }
32
39
  }
33
40
 
34
- pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, String> {
35
- let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
36
- let mut result = Vec::with_capacity(strings.len());
41
+ pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
42
+ let mut cache = STRING_CACHE
43
+ .lock()
44
+ .map_err(|e| CacheError::LockError(e.to_string()))?;
37
45
 
46
+ let mut result = Vec::with_capacity(strings.len());
38
47
  for string in strings {
39
- let static_str: &'static str =
40
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
41
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
42
- existing
43
- } else {
44
- let leaked = Box::leak(string.clone().into_boxed_str());
45
- cache.insert(leaked, AtomicU32::new(1));
46
- leaked
47
- };
48
- result.push(static_str);
48
+ if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
49
+ count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
50
+ result.push(existing);
51
+ } else {
52
+ let leaked = Box::leak(string.clone().into_boxed_str());
53
+ cache.insert(leaked, AtomicU32::new(1));
54
+ result.push(leaked);
55
+ }
49
56
  }
50
-
51
57
  Ok(result)
52
58
  }
53
59
 
54
- pub fn clear(headers: &[&'static str]) -> Result<(), String> {
55
- let cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
60
+ pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
61
+ let mut cache = STRING_CACHE
62
+ .lock()
63
+ .map_err(|e| CacheError::LockError(e.to_string()))?;
56
64
 
57
65
  for header in headers {
58
66
  if let Some(count) = cache.get(header) {
59
- let remaining = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
60
- if remaining == 0 {
67
+ // Returns the previous value of the counter
68
+ let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
69
+ if was == 1 {
70
+ cache.remove(header);
61
71
  let ptr = *header as *const str as *mut str;
62
72
  unsafe {
63
73
  let _ = Box::from_raw(ptr);
@@ -1,6 +1,7 @@
1
1
  mod builder;
2
2
  mod header_cache;
3
3
  mod parser;
4
+ pub mod read_impl;
4
5
  mod reader;
5
6
  mod record;
6
7
 
@@ -2,6 +2,7 @@ use std::collections::HashMap;
2
2
 
3
3
  pub trait RecordParser {
4
4
  type Output;
5
+
5
6
  fn parse(
6
7
  headers: &[&'static str],
7
8
  record: &csv::StringRecord,
@@ -11,41 +12,58 @@ pub trait RecordParser {
11
12
 
12
13
  impl RecordParser for HashMap<&'static str, Option<String>> {
13
14
  type Output = Self;
15
+
16
+ #[inline]
14
17
  fn parse(
15
18
  headers: &[&'static str],
16
19
  record: &csv::StringRecord,
17
20
  null_string: &str,
18
21
  ) -> Self::Output {
19
22
  let mut map = HashMap::with_capacity(headers.len());
20
- for (header, field) in headers.iter().zip(record.iter()) {
21
- map.insert(
22
- *header,
23
- if field == null_string {
24
- None
25
- } else {
26
- Some(field.to_string())
27
- },
28
- );
29
- }
23
+ headers
24
+ .iter()
25
+ .zip(record.iter())
26
+ .for_each(|(header, field)| {
27
+ map.insert(
28
+ *header,
29
+ if field == null_string {
30
+ None
31
+ } else {
32
+ // Avoid allocating for empty strings
33
+ if field.is_empty() {
34
+ Some(String::new())
35
+ } else {
36
+ Some(field.to_string())
37
+ }
38
+ },
39
+ );
40
+ });
30
41
  map
31
42
  }
32
43
  }
33
44
 
34
45
  impl RecordParser for Vec<Option<String>> {
35
46
  type Output = Self;
47
+
48
+ #[inline]
36
49
  fn parse(
37
50
  _headers: &[&'static str],
38
51
  record: &csv::StringRecord,
39
52
  null_string: &str,
40
53
  ) -> Self::Output {
41
54
  let mut vec = Vec::with_capacity(record.len());
42
- for field in record.iter() {
43
- vec.push(if field == null_string {
55
+ vec.extend(record.iter().map(|field| {
56
+ if field == null_string {
44
57
  None
45
58
  } else {
46
- Some(field.to_string())
47
- });
48
- }
59
+ // Avoid allocating for empty strings
60
+ if field.is_empty() {
61
+ Some(String::new())
62
+ } else {
63
+ Some(field.to_string())
64
+ }
65
+ }
66
+ }));
49
67
  vec
50
68
  }
51
69
  }
@@ -0,0 +1,65 @@
1
+ use super::{header_cache::StringCache, parser::RecordParser};
2
+ use std::{io::Read, thread};
3
+
4
+ pub enum ReadImpl<T: RecordParser> {
5
+ SingleThreaded {
6
+ reader: csv::Reader<Box<dyn Read>>,
7
+ headers: Vec<&'static str>,
8
+ null_string: String,
9
+ },
10
+ MultiThreaded {
11
+ headers: Vec<&'static str>,
12
+ receiver: kanal::Receiver<T::Output>,
13
+ handle: Option<thread::JoinHandle<()>>,
14
+ },
15
+ }
16
+
17
+ impl<T: RecordParser> ReadImpl<T> {
18
+ #[inline]
19
+ pub fn next(&mut self) -> Option<T::Output> {
20
+ match self {
21
+ Self::MultiThreaded {
22
+ receiver, handle, ..
23
+ } => match receiver.recv() {
24
+ Ok(record) => Some(record),
25
+ Err(_) => {
26
+ if let Some(handle) = handle.take() {
27
+ let _ = handle.join();
28
+ }
29
+ None
30
+ }
31
+ },
32
+ Self::SingleThreaded {
33
+ reader,
34
+ headers,
35
+ null_string,
36
+ } => {
37
+ let mut record = csv::StringRecord::new();
38
+ match reader.read_record(&mut record) {
39
+ Ok(true) => Some(T::parse(headers, &record, null_string)),
40
+ _ => None,
41
+ }
42
+ }
43
+ }
44
+ }
45
+
46
+ #[inline]
47
+ pub fn cleanup(&mut self) {
48
+ match self {
49
+ Self::MultiThreaded {
50
+ receiver,
51
+ handle,
52
+ headers,
53
+ } => {
54
+ receiver.close();
55
+ if let Some(handle) = handle.take() {
56
+ let _ = handle.join();
57
+ }
58
+ let _ = StringCache::clear(headers);
59
+ }
60
+ Self::SingleThreaded { headers, .. } => {
61
+ let _ = StringCache::clear(headers);
62
+ }
63
+ }
64
+ }
65
+ }
@@ -1,66 +1,35 @@
1
- use super::{header_cache::StringCache, parser::RecordParser};
1
+ use super::{parser::RecordParser, read_impl::ReadImpl};
2
2
  use magnus::{Error, Ruby};
3
- use std::{io::Read, thread};
3
+ use std::{borrow::Cow, io::Read};
4
4
 
5
5
  pub struct RecordReader<T: RecordParser> {
6
6
  pub(crate) reader: ReadImpl<T>,
7
7
  }
8
8
 
9
- impl<T: RecordParser> Drop for RecordReader<T> {
10
- fn drop(&mut self) {
11
- match &mut self.reader {
12
- ReadImpl::MultiThreaded {
13
- receiver,
14
- handle,
15
- headers,
16
- } => {
17
- receiver.close();
18
- if let Some(handle) = handle.take() {
19
- let _ = handle.join();
20
- }
21
- StringCache::clear(headers).unwrap();
22
- }
23
- ReadImpl::SingleThreaded { headers, .. } => {
24
- StringCache::clear(headers).unwrap();
25
- }
26
- }
27
- }
28
- }
29
-
30
- #[allow(dead_code)]
31
- pub enum ReadImpl<T: RecordParser> {
32
- SingleThreaded {
33
- reader: csv::Reader<Box<dyn Read + Send + 'static>>,
34
- headers: Vec<&'static str>,
35
- null_string: String,
36
- },
37
- MultiThreaded {
38
- headers: Vec<&'static str>,
39
- receiver: kanal::Receiver<T::Output>,
40
- handle: Option<thread::JoinHandle<()>>,
41
- },
42
- }
43
-
44
9
  impl<T: RecordParser> RecordReader<T> {
10
+ #[inline]
45
11
  pub(crate) fn get_headers(
46
12
  ruby: &Ruby,
47
13
  reader: &mut csv::Reader<impl Read>,
48
14
  has_headers: bool,
49
15
  ) -> Result<Vec<String>, Error> {
50
- let first_row = reader
51
- .headers()
52
- .map_err(|e| {
53
- Error::new(
54
- ruby.exception_runtime_error(),
55
- format!("Failed to read headers: {e}"),
56
- )
57
- })?
58
- .clone();
16
+ let first_row = reader.headers().map_err(|e| {
17
+ Error::new(
18
+ ruby.exception_runtime_error(),
19
+ Cow::Owned(format!("Failed to read headers: {e}")),
20
+ )
21
+ })?;
59
22
 
60
23
  Ok(if has_headers {
61
- first_row.iter().map(String::from).collect()
24
+ // Pre-allocate the vector with exact capacity
25
+ let mut headers = Vec::with_capacity(first_row.len());
26
+ headers.extend(first_row.iter().map(String::from));
27
+ headers
62
28
  } else {
63
- (0..first_row.len()).map(|i| format!("c{i}")).collect()
29
+ // Pre-allocate the vector with exact capacity
30
+ let mut headers = Vec::with_capacity(first_row.len());
31
+ headers.extend((0..first_row.len()).map(|i| format!("c{i}")));
32
+ headers
64
33
  })
65
34
  }
66
35
  }
@@ -68,30 +37,21 @@ impl<T: RecordParser> RecordReader<T> {
68
37
  impl<T: RecordParser> Iterator for RecordReader<T> {
69
38
  type Item = T::Output;
70
39
 
40
+ #[inline]
71
41
  fn next(&mut self) -> Option<Self::Item> {
72
- match &mut self.reader {
73
- ReadImpl::MultiThreaded {
74
- receiver, handle, ..
75
- } => match receiver.recv() {
76
- Ok(record) => Some(record),
77
- Err(_) => {
78
- if let Some(handle) = handle.take() {
79
- let _ = handle.join();
80
- }
81
- None
82
- }
83
- },
84
- ReadImpl::SingleThreaded {
85
- reader,
86
- headers,
87
- null_string,
88
- } => {
89
- let mut record = csv::StringRecord::new();
90
- match reader.read_record(&mut record) {
91
- Ok(true) => Some(T::parse(headers, &record, null_string)),
92
- _ => None,
93
- }
94
- }
95
- }
42
+ self.reader.next()
43
+ }
44
+
45
+ #[inline]
46
+ fn size_hint(&self) -> (usize, Option<usize>) {
47
+ // We can't know the exact size without reading the whole file
48
+ (0, None)
49
+ }
50
+ }
51
+
52
+ impl<T: RecordParser> Drop for RecordReader<T> {
53
+ #[inline]
54
+ fn drop(&mut self) {
55
+ self.reader.cleanup();
96
56
  }
97
57
  }
@@ -1,4 +1,4 @@
1
- use magnus::{IntoValue, RHash, Ruby, Value};
1
+ use magnus::{IntoValue, Ruby, Value};
2
2
  use std::collections::HashMap;
3
3
 
4
4
  #[derive(Debug)]
@@ -8,14 +8,16 @@ pub enum CsvRecord {
8
8
  }
9
9
 
10
10
  impl IntoValue for CsvRecord {
11
+ #[inline]
11
12
  fn into_value_with(self, handle: &Ruby) -> Value {
12
13
  match self {
13
14
  CsvRecord::Vec(vec) => vec.into_value_with(handle),
14
15
  CsvRecord::Map(map) => {
15
- let hash = RHash::new();
16
- for (k, v) in map {
17
- hash.aset(k, v).unwrap();
18
- }
16
+ // Pre-allocate the hash with the known size
17
+ let hash = handle.hash_new_capa(map.len());
18
+ map.into_iter()
19
+ .try_for_each(|(k, v)| hash.aset(k, v))
20
+ .unwrap();
19
21
  hash.into_value_with(handle)
20
22
  }
21
23
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.3"
2
+ VERSION = "0.3.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-12-23 00:00:00.000000000 Z
11
+ date: 2024-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -59,6 +59,7 @@ files:
59
59
  - ext/osv/src/csv/header_cache.rs
60
60
  - ext/osv/src/csv/mod.rs
61
61
  - ext/osv/src/csv/parser.rs
62
+ - ext/osv/src/csv/read_impl.rs
62
63
  - ext/osv/src/csv/reader.rs
63
64
  - ext/osv/src/csv/record.rs
64
65
  - ext/osv/src/lib.rs