osv 0.3.8 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02205de8cef4d5f7633c06720a9e925a2b608116354da4a1678d4746d2197d23
4
- data.tar.gz: 3e1d63323fdaad1b6a60e0a0a63801f98710615d6616c882f0cdce00e36c6e2e
3
+ metadata.gz: aed16dbfb14e6caebceb388104731091a5354394cd174804982dbce8f4b95963
4
+ data.tar.gz: 232497c8ec55ab15559f126c4bc90c6bcd0dd4296efadcbf23d07d6a24c7969d
5
5
  SHA512:
6
- metadata.gz: df6a4a4b86c41010ea671ac0e98c2ee6307e62ceff35dab125868f0ee7edb6d14984348ecd4ac9f913489e5a6be0b364240b461334554385aabe5b3374fe798d
7
- data.tar.gz: d931b888ce9d0ad1cdb1fa3d0be8cd0e526292206742f5adde718f414e9feca97eff3af6d4139d144c18a50e4807650ea9f7582153bcee80cea1e6ed4ce4ef49
6
+ metadata.gz: 36e1bab13ede785e2f41f0a56b37a2ff7448deff182b614c71e15ccd16799abe94f1f7db63034cd1c04195666c85a12fa0284dde75753b16e05e8767e9c87b18
7
+ data.tar.gz: '0990fee8e41dd9eb046d6b3733770adb4ef9825165bb0a012b32dac03ddd9d8f2107860d75ef05437aa8d820d42d16c8148642fd3faab326dcab8007731bcf61'
data/Cargo.lock CHANGED
@@ -274,6 +274,7 @@ dependencies = [
274
274
  "serde",
275
275
  "serde_magnus",
276
276
  "thiserror",
277
+ "xxhash-rust",
277
278
  ]
278
279
 
279
280
  [[package]]
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
526
527
  version = "0.52.6"
527
528
  source = "registry+https://github.com/rust-lang/crates.io-index"
528
529
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
+
531
+ [[package]]
532
+ name = "xxhash-rust"
533
+ version = "0.8.14"
534
+ source = "registry+https://github.com/rust-lang/crates.io-index"
535
+ checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
data/README.md CHANGED
@@ -70,9 +70,10 @@ Both methods support the following options:
70
70
  - by default, empty strings are interpreted as empty strings
71
71
  - if you want to interpret empty strings as nil, set this to an empty string
72
72
  - `buffer_size`: Integer specifying the read buffer size
73
- - `result_type`: String specifying the output format ("hash" or "array")
73
+ - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
74
74
  - `flexible`: Boolean specifying if the parser should be flexible (default: false)
75
75
  - `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
76
+ - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
76
77
 
77
78
  ### Input Sources
78
79
 
@@ -112,45 +113,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
112
113
  ### 10,000 lines
113
114
 
114
115
  ```
115
- Benchmarking with 10001 lines of data
116
+ Benchmarking with 100001 lines of data
116
117
 
117
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
118
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
118
119
  Warming up --------------------------------------
119
- OSV - Hash output 6.000 i/100ms
120
+ OSV - Hash output 1.000 i/100ms
120
121
  CSV - Hash output 1.000 i/100ms
121
- OSV - Array output 18.000 i/100ms
122
- CSV - Array output 2.000 i/100ms
122
+ OSV - Array output 1.000 i/100ms
123
+ OSV - Direct Open Array output
124
+ 12.719M i/100ms
125
+ CSV - Array output 1.000 i/100ms
123
126
  FastCSV - Array output
124
- 9.000 i/100ms
125
- OSV - StringIO 7.000 i/100ms
127
+ 1.000 i/100ms
128
+ OSV - StringIO 1.000 i/100ms
126
129
  CSV - StringIO 1.000 i/100ms
127
- FastCSV - StringIO 20.000 i/100ms
128
- OSV - Gzipped 6.000 i/100ms
130
+ FastCSV - StringIO 1.000 i/100ms
131
+ OSV - Gzipped 1.000 i/100ms
129
132
  CSV - Gzipped 1.000 i/100ms
130
133
  Calculating -------------------------------------
131
- OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
132
- CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
133
- OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
134
- CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
134
+ OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
135
+ CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
136
+ OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
137
+ OSV - Direct Open Array output
138
+ 213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
139
+ CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
135
140
  FastCSV - Array output
136
- 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
137
- OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
138
- CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
139
- FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
140
- OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
141
- CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
141
+ 7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
142
+ OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
143
+ CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
144
+ FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
145
+ OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
146
+ CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
142
147
 
143
148
  Comparison:
144
- FastCSV - StringIO: 200.5 i/s
145
- OSV - Array output: 189.7 i/s - same-ish: difference falls within error
146
- FastCSV - Array output: 97.9 i/s - 2.05x slower
147
- OSV - StringIO: 80.8 i/s - 2.48x slower
148
- OSV - Hash output: 73.4 i/s - 2.73x slower
149
- OSV - Gzipped: 55.2 i/s - 3.63x slower
150
- CSV - Array output: 25.5 i/s - 7.87x slower
151
- CSV - StringIO: 15.9 i/s - 12.63x slower
152
- CSV - Gzipped: 12.6 i/s - 15.92x slower
153
- CSV - Hash output: 11.9 i/s - 16.80x slower
149
+ OSV - Direct Open Array output: 213629268.6 i/s
150
+ OSV - Array output: 17.3 i/s - 12360250.79x slower
151
+ FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
152
+ FastCSV - Array output: 8.0 i/s - 26727225.72x slower
153
+ OSV - Hash output: 6.7 i/s - 31780615.83x slower
154
+ OSV - StringIO: 6.6 i/s - 32239620.60x slower
155
+ OSV - Gzipped: 5.6 i/s - 37881517.48x slower
156
+ CSV - Array output: 2.2 i/s - 97400427.87x slower
157
+ CSV - StringIO: 1.5 i/s - 144580048.04x slower
158
+ CSV - Hash output: 1.2 i/s - 174666591.31x slower
159
+ CSV - Gzipped: 1.2 i/s - 181626018.23x slower
154
160
  ```
155
161
 
156
162
  ### 1,000,000 lines
@@ -158,11 +164,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
158
164
  ```
159
165
  Benchmarking with 1000001 lines of data
160
166
 
161
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
167
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
162
168
  Warming up --------------------------------------
163
169
  OSV - Hash output 1.000 i/100ms
164
170
  CSV - Hash output 1.000 i/100ms
165
171
  OSV - Array output 1.000 i/100ms
172
+ OSV - Direct Open Array output
173
+ 1.000 i/100ms
166
174
  CSV - Array output 1.000 i/100ms
167
175
  FastCSV - Array output
168
176
  1.000 i/100ms
@@ -172,27 +180,30 @@ FastCSV - Array output
172
180
  OSV - Gzipped 1.000 i/100ms
173
181
  CSV - Gzipped 1.000 i/100ms
174
182
  Calculating -------------------------------------
175
- OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
176
- CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
177
- OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
178
- CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
183
+ OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
184
+ CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
185
+ OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
186
+ OSV - Direct Open Array output
187
+ 1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
188
+ CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
179
189
  FastCSV - Array output
180
- 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
181
- OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
182
- CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
183
- FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
184
- OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
185
- CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
190
+ 0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
191
+ OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
192
+ CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
193
+ FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
194
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
195
+ CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
186
196
 
187
197
  Comparison:
188
- OSV - Array output: 1.1 i/s
189
- FastCSV - StringIO: 1.0 i/s - 1.10x slower
190
- FastCSV - Array output: 0.8 i/s - 1.49x slower
191
- OSV - Hash output: 0.6 i/s - 1.98x slower
192
- OSV - StringIO: 0.5 i/s - 2.19x slower
193
- OSV - Gzipped: 0.4 i/s - 2.61x slower
194
- CSV - Array output: 0.2 i/s - 4.86x slower
195
- CSV - StringIO: 0.1 i/s - 8.67x slower
196
- CSV - Hash output: 0.1 i/s - 9.79x slower
197
- CSV - Gzipped: 0.1 i/s - 9.91x slower
198
+ OSV - Direct Open Array output: 1.6 i/s
199
+ OSV - Array output: 1.5 i/s - 1.08x slower
200
+ FastCSV - StringIO: 0.9 i/s - 1.76x slower
201
+ OSV - StringIO: 0.6 i/s - 2.87x slower
202
+ OSV - Hash output: 0.5 i/s - 3.30x slower
203
+ OSV - Gzipped: 0.4 i/s - 3.72x slower
204
+ FastCSV - Array output: 0.3 i/s - 4.99x slower
205
+ CSV - Array output: 0.2 i/s - 8.88x slower
206
+ CSV - StringIO: 0.1 i/s - 11.55x slower
207
+ CSV - Hash output: 0.1 i/s - 14.24x slower
208
+ CSV - Gzipped: 0.1 i/s - 15.68x slower
198
209
  ```
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Rake::ExtensionTask.new("osv") do |c|
11
11
  end
12
12
 
13
13
  task :dev do
14
- ENV["RB_SYS_CARGO_PROFILE"] = "dev"
14
+ ENV["RB_SYS_CARGO_PROFILE"] = "release"
15
15
  end
16
16
 
17
17
  Rake::TestTask.new do |t|
@@ -20,3 +20,9 @@ Rake::TestTask.new do |t|
20
20
  t.libs << "lib"
21
21
  t.libs << "test"
22
22
  end
23
+
24
+ task :release do
25
+ sh "bundle exec rake test"
26
+ sh "gem build osv.gemspec"
27
+ sh "gem push osv-#{OSV::VERSION}.gem"
28
+ end
data/ext/osv/Cargo.toml CHANGED
@@ -15,3 +15,4 @@ rb-sys = "^0.9"
15
15
  serde = { version = "1.0", features = ["derive"] }
16
16
  serde_magnus = "0.8.1"
17
17
  thiserror = "2.0"
18
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
@@ -3,18 +3,21 @@ use super::{
3
3
  parser::RecordParser,
4
4
  read_impl::ReadImpl,
5
5
  reader::RecordReader,
6
+ READ_BUFFER_SIZE,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
10
  use std::{
10
11
  fs::File,
11
- io::{self, Read},
12
+ io::{self, BufReader, Read},
12
13
  marker::PhantomData,
13
14
  os::fd::FromRawFd,
14
15
  thread,
15
16
  };
16
17
  use thiserror::Error;
17
18
 
19
+ pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
20
+
18
21
  #[derive(Error, Debug)]
19
22
  pub enum ReaderError {
20
23
  #[error("Failed to get file descriptor: {0}")]
@@ -56,6 +59,7 @@ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
56
59
  buffer: usize,
57
60
  flexible: bool,
58
61
  flexible_default: Option<String>,
62
+ trim: csv::Trim,
59
63
  _phantom: PhantomData<T>,
60
64
  }
61
65
 
@@ -68,9 +72,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
68
72
  delimiter: b',',
69
73
  quote_char: b'"',
70
74
  null_string: None,
71
- buffer: 1000,
75
+ buffer: BUFFER_CHANNEL_SIZE,
72
76
  flexible: false,
73
77
  flexible_default: None,
78
+ trim: csv::Trim::None,
74
79
  _phantom: PhantomData,
75
80
  }
76
81
  }
@@ -110,6 +115,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
110
115
  self
111
116
  }
112
117
 
118
+ pub fn trim(mut self, trim: csv::Trim) -> Self {
119
+ self.trim = trim;
120
+ self
121
+ }
122
+
113
123
  fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
114
124
  let string: RString = self.to_read.funcall("string", ())?;
115
125
  let content = string.to_string()?;
@@ -128,7 +138,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
128
138
  }
129
139
 
130
140
  let file = unsafe { File::from_raw_fd(fd) };
131
- Ok(Box::new(file))
141
+ Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
132
142
  }
133
143
 
134
144
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -136,24 +146,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
136
146
  let file = File::open(&path)?;
137
147
 
138
148
  Ok(if path.ends_with(".gz") {
139
- Box::new(GzDecoder::new(file))
149
+ Box::new(GzDecoder::new(BufReader::with_capacity(
150
+ READ_BUFFER_SIZE,
151
+ file,
152
+ )))
140
153
  } else {
141
- Box::new(file)
154
+ Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
142
155
  })
143
156
  }
144
157
 
145
- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
158
+ fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
146
159
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
147
160
  let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
148
161
 
149
162
  if self.to_read.is_kind_of(string_io) {
150
- self.handle_string_io()
163
+ self.handle_string_io().map(|r| (r, false))
151
164
  } else if self.to_read.is_kind_of(gzip_reader_class) {
152
165
  Err(ReaderError::UnsupportedGzipReader)
153
166
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
154
- self.handle_file_descriptor()
167
+ self.handle_file_descriptor().map(|r| (r, true))
155
168
  } else {
156
- self.handle_file_path()
169
+ self.handle_file_path().map(|r| (r, false))
157
170
  }
158
171
  }
159
172
 
@@ -175,7 +188,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
175
188
 
176
189
  pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
177
190
  match self.get_reader() {
178
- Ok(readable) => self.build_multi_threaded(readable),
191
+ Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
179
192
  Err(_) => {
180
193
  let readable = self.get_single_threaded_reader()?;
181
194
  self.build_single_threaded(readable)
@@ -186,6 +199,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
186
199
  fn build_multi_threaded(
187
200
  self,
188
201
  readable: Box<dyn Read + Send + 'static>,
202
+ should_forget: bool,
189
203
  ) -> Result<RecordReader<T>, ReaderError> {
190
204
  let flexible = self.flexible || self.flexible_default.is_some();
191
205
  let mut reader = csv::ReaderBuilder::new()
@@ -193,6 +207,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
193
207
  .delimiter(self.delimiter)
194
208
  .quote(self.quote_char)
195
209
  .flexible(flexible)
210
+ .trim(self.trim)
196
211
  .from_reader(readable);
197
212
 
198
213
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
@@ -204,7 +219,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
204
219
 
205
220
  let flexible_default = self.flexible_default.clone();
206
221
  let handle = thread::spawn(move || {
207
- let mut record = csv::StringRecord::new();
222
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
208
223
  while let Ok(true) = reader.read_record(&mut record) {
209
224
  let row = T::parse(
210
225
  &static_headers,
@@ -216,8 +231,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
216
231
  break;
217
232
  }
218
233
  }
219
- let file_to_forget = reader.into_inner();
220
- std::mem::forget(file_to_forget);
234
+ if should_forget {
235
+ let file_to_forget = reader.into_inner();
236
+ std::mem::forget(file_to_forget);
237
+ }
221
238
  });
222
239
 
223
240
  Ok(RecordReader {
@@ -239,6 +256,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
239
256
  .delimiter(self.delimiter)
240
257
  .quote(self.quote_char)
241
258
  .flexible(flexible)
259
+ .trim(self.trim)
242
260
  .from_reader(readable);
243
261
 
244
262
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
@@ -257,30 +275,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
257
275
 
258
276
  struct RubyReader {
259
277
  inner: Value,
278
+ buffer: Option<Vec<u8>>,
279
+ offset: usize,
260
280
  }
261
281
 
262
282
  impl RubyReader {
263
283
  fn new(inner: Value) -> Self {
264
- Self { inner }
284
+ Self {
285
+ inner,
286
+ buffer: None,
287
+ offset: 0,
288
+ }
265
289
  }
266
290
  }
267
291
 
292
+ // Read the entire inner into a vector and then read future reads from that vector with offset
268
293
  impl Read for RubyReader {
269
294
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
270
- let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
295
+ // If we have an existing buffer, read from it
296
+ if let Some(buffer) = self.buffer.as_ref() {
297
+ let remaining = buffer.len() - self.offset;
298
+ let copy_size = remaining.min(buf.len());
299
+ buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
300
+ self.offset += copy_size;
301
+ return Ok(copy_size);
302
+ }
303
+
304
+ // No buffer yet - read the entire content from Ruby
305
+ let result = self.inner.funcall::<_, _, Value>("read", ());
271
306
  match result {
272
307
  Ok(data) => {
273
308
  if data.is_nil() {
274
- return Ok(0);
309
+ return Ok(0); // EOF
275
310
  }
276
311
 
277
312
  let string = RString::from_value(data).ok_or_else(|| {
278
313
  io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
279
314
  })?;
280
315
  let bytes = unsafe { string.as_slice() };
281
- let len = bytes.len().min(buf.len());
282
- buf[..len].copy_from_slice(&bytes[..len]);
283
- Ok(len)
316
+
317
+ // Store the entire content in the buffer
318
+ self.buffer = Some(bytes.to_vec());
319
+ self.offset = 0;
320
+
321
+ // Read initial chunk
322
+ let copy_size = bytes.len().min(buf.len());
323
+ buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
324
+ self.offset = copy_size;
325
+
326
+ Ok(copy_size)
284
327
  }
285
328
  Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
286
329
  }
@@ -6,4 +6,6 @@ mod reader;
6
6
  mod record;
7
7
 
8
8
  pub use builder::RecordReaderBuilder;
9
+ pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
+ pub(crate) use read_impl::READ_BUFFER_SIZE;
9
11
  pub use record::CsvRecord;
@@ -1,4 +1,5 @@
1
1
  use std::collections::HashMap;
2
+ use std::hash::BuildHasher;
2
3
 
3
4
  pub trait RecordParser {
4
5
  type Output;
@@ -11,7 +12,7 @@ pub trait RecordParser {
11
12
  ) -> Self::Output;
12
13
  }
13
14
 
14
- impl RecordParser for HashMap<&'static str, Option<String>> {
15
+ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
15
16
  type Output = Self;
16
17
 
17
18
  #[inline]
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
21
22
  null_string: Option<&str>,
22
23
  flexible_default: Option<&str>,
23
24
  ) -> Self::Output {
24
- let mut map = HashMap::with_capacity(headers.len());
25
- headers.iter().enumerate().for_each(|(i, header)| {
25
+ let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
26
+ headers.iter().enumerate().for_each(|(i, &header)| {
26
27
  let value = record.get(i).map_or_else(
27
- || flexible_default.map(|s| s.to_string()),
28
+ || flexible_default.map(ToString::to_string),
28
29
  |field| {
29
30
  if null_string == Some(field) {
30
31
  None
31
32
  } else if field.is_empty() {
32
33
  Some(String::new())
33
34
  } else {
34
- Some(field.to_string())
35
+ Some(field.into())
35
36
  }
36
37
  },
37
38
  );
38
- map.insert(*header, value);
39
+ map.insert(header, value);
39
40
  });
40
41
  map
41
42
  }
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
53
54
  ) -> Self::Output {
54
55
  let target_len = headers.len();
55
56
  let mut vec = Vec::with_capacity(target_len);
56
- vec.extend(record.iter().map(|field| {
57
- if null_string == Some(field) {
57
+ for field in record.iter() {
58
+ let value = if Some(field) == null_string {
58
59
  None
59
60
  } else if field.is_empty() {
60
61
  Some(String::new())
61
62
  } else {
62
- Some(field.to_string())
63
- }
64
- }));
63
+ Some(field.into())
64
+ };
65
+ vec.push(value);
66
+ }
65
67
 
66
- // Fill remaining slots with flexible_default if needed
67
- if let Some(default) = flexible_default {
68
- while vec.len() < target_len {
69
- vec.push(Some(default.to_string()));
68
+ if vec.len() < target_len {
69
+ if let Some(default) = flexible_default {
70
+ vec.resize_with(target_len, || Some(default.to_string()));
70
71
  }
71
72
  }
72
73
  vec
@@ -1,6 +1,8 @@
1
1
  use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use std::{io::Read, thread};
3
3
 
4
+ pub(crate) const READ_BUFFER_SIZE: usize = 8192;
5
+
4
6
  pub enum ReadImpl<T: RecordParser> {
5
7
  SingleThreaded {
6
8
  reader: csv::Reader<Box<dyn Read>>,
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
36
38
  null_string,
37
39
  flexible_default,
38
40
  } => {
39
- let mut record = csv::StringRecord::new();
41
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
40
42
  match reader.read_record(&mut record) {
41
43
  Ok(true) => Some(T::parse(
42
44
  headers,
@@ -1,13 +1,13 @@
1
1
  use magnus::{IntoValue, Ruby, Value};
2
- use std::collections::HashMap;
2
+ use std::{collections::HashMap, hash::BuildHasher};
3
3
 
4
4
  #[derive(Debug)]
5
- pub enum CsvRecord {
5
+ pub enum CsvRecord<S: BuildHasher + Default> {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<&'static str, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>, S>),
8
8
  }
9
9
 
10
- impl IntoValue for CsvRecord {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
@@ -1,13 +1,15 @@
1
1
  use crate::csv::{CsvRecord, RecordReaderBuilder};
2
2
  use crate::utils::*;
3
+ use csv::Trim;
3
4
  use magnus::value::ReprValue;
4
5
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
6
  use std::collections::HashMap;
7
+ use xxhash_rust::xxh3::Xxh3Builder;
6
8
 
7
9
  pub fn parse_csv(
8
10
  rb_self: Value,
9
11
  args: &[Value],
10
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
11
13
  let ruby = unsafe { Ruby::get_unchecked() };
12
14
 
13
15
  let CsvArgs {
@@ -20,6 +22,7 @@ pub fn parse_csv(
20
22
  result_type,
21
23
  flexible,
22
24
  flexible_default,
25
+ trim,
23
26
  } = parse_csv_args(&ruby, args)?;
24
27
 
25
28
  if !ruby.block_given() {
@@ -34,27 +37,37 @@ pub fn parse_csv(
34
37
  result_type,
35
38
  flexible,
36
39
  flexible_default,
40
+ trim: match trim {
41
+ Trim::All => Some("all".to_string()),
42
+ Trim::Headers => Some("headers".to_string()),
43
+ Trim::Fields => Some("fields".to_string()),
44
+ _ => None,
45
+ },
37
46
  });
38
47
  }
39
48
 
40
- let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
49
+ let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
41
50
  "hash" => Box::new(
42
- RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
43
- .has_headers(has_headers)
44
- .flexible(flexible)
45
- .flexible_default(flexible_default)
46
- .delimiter(delimiter)
47
- .quote_char(quote_char)
48
- .null_string(null_string)
49
- .buffer(buffer_size)
50
- .build()?
51
- .map(CsvRecord::Map),
51
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
52
+ &ruby, to_read,
53
+ )
54
+ .has_headers(has_headers)
55
+ .flexible(flexible)
56
+ .flexible_default(flexible_default)
57
+ .trim(trim)
58
+ .delimiter(delimiter)
59
+ .quote_char(quote_char)
60
+ .null_string(null_string)
61
+ .buffer(buffer_size)
62
+ .build()?
63
+ .map(CsvRecord::Map),
52
64
  ),
53
65
  "array" => Box::new(
54
66
  RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
55
67
  .has_headers(has_headers)
56
68
  .flexible(flexible)
57
69
  .flexible_default(flexible_default)
70
+ .trim(trim)
58
71
  .delimiter(delimiter)
59
72
  .quote_char(quote_char)
60
73
  .null_string(null_string)
@@ -84,11 +97,12 @@ struct EnumeratorArgs {
84
97
  result_type: String,
85
98
  flexible: bool,
86
99
  flexible_default: Option<String>,
100
+ trim: Option<String>,
87
101
  }
88
102
 
89
103
  fn create_enumerator(
90
104
  args: EnumeratorArgs,
91
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
105
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
92
106
  let kwargs = RHash::new();
93
107
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
94
108
  kwargs.aset(
@@ -104,6 +118,7 @@ fn create_enumerator(
104
118
  kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
105
119
  kwargs.aset(Symbol::new("flexible"), args.flexible)?;
106
120
  kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
121
+ kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
107
122
  let enumerator = args
108
123
  .rb_self
109
124
  .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
data/ext/osv/src/utils.rs CHANGED
@@ -4,6 +4,29 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::csv::BUFFER_CHANNEL_SIZE;
8
+
9
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
10
+ if value.is_nil() {
11
+ Ok(None)
12
+ } else if value.is_kind_of(ruby.class_string()) {
13
+ RString::from_value(value)
14
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
15
+ .to_string()
16
+ .map(|s| Some(s))
17
+ } else if value.is_kind_of(ruby.class_symbol()) {
18
+ Symbol::from_value(value)
19
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
20
+ .funcall("to_s", ())
21
+ .map(|s| Some(s))
22
+ } else {
23
+ Err(Error::new(
24
+ magnus::exception::type_error(),
25
+ "Value must be a String or Symbol",
26
+ ))
27
+ }
28
+ }
29
+
7
30
  #[derive(Debug)]
8
31
  pub struct CsvArgs {
9
32
  pub to_read: Value,
@@ -15,6 +38,7 @@ pub struct CsvArgs {
15
38
  pub result_type: String,
16
39
  pub flexible: bool,
17
40
  pub flexible_default: Option<String>,
41
+ pub trim: csv::Trim,
18
42
  }
19
43
 
20
44
  /// Parse common arguments for CSV parsing
@@ -34,6 +58,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
34
58
  Option<Value>,
35
59
  Option<bool>,
36
60
  Option<Option<String>>,
61
+ Option<Value>,
37
62
  ),
38
63
  (),
39
64
  >(
@@ -48,6 +73,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
48
73
  "result_type",
49
74
  "flexible",
50
75
  "flexible_default",
76
+ "trim",
51
77
  ],
52
78
  )?;
53
79
 
@@ -81,38 +107,28 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
81
107
 
82
108
  let null_string = kwargs.optional.3.unwrap_or_default();
83
109
 
84
- let buffer_size = kwargs.optional.4.unwrap_or(1000);
110
+ let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
85
111
 
86
- let result_type = match kwargs.optional.5 {
87
- Some(value) => {
88
- let parsed = if value.is_kind_of(ruby.class_string()) {
89
- RString::from_value(value)
90
- .ok_or_else(|| {
91
- Error::new(magnus::exception::type_error(), "Invalid string value")
92
- })?
93
- .to_string()?
94
- } else if value.is_kind_of(ruby.class_symbol()) {
95
- Symbol::from_value(value)
96
- .ok_or_else(|| {
97
- Error::new(magnus::exception::type_error(), "Invalid symbol value")
98
- })?
99
- .funcall("to_s", ())?
100
- } else {
112
+ let result_type = match kwargs
113
+ .optional
114
+ .5
115
+ .map(|value| parse_string_or_symbol(ruby, value))
116
+ {
117
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
118
+ "hash" | "array" => parsed,
119
+ _ => {
101
120
  return Err(Error::new(
102
- magnus::exception::type_error(),
103
- "result_type must be a String or Symbol",
104
- ));
105
- };
106
-
107
- match parsed.as_str() {
108
- "hash" | "array" => parsed,
109
- _ => {
110
- return Err(Error::new(
111
- magnus::exception::runtime_error(),
112
- "result_type must be either 'hash' or 'array'",
113
- ))
114
- }
121
+ magnus::exception::runtime_error(),
122
+ "result_type must be either 'hash' or 'array'",
123
+ ))
115
124
  }
125
+ },
126
+ Some(Ok(None)) => String::from("hash"),
127
+ Some(Err(_)) => {
128
+ return Err(Error::new(
129
+ magnus::exception::type_error(),
130
+ "result_type must be a String or Symbol",
131
+ ))
116
132
  }
117
133
  None => String::from("hash"),
118
134
  };
@@ -121,6 +137,35 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
121
137
 
122
138
  let flexible_default = kwargs.optional.7.unwrap_or_default();
123
139
 
140
+ let trim = match kwargs
141
+ .optional
142
+ .8
143
+ .map(|value| parse_string_or_symbol(ruby, value))
144
+ {
145
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
146
+ "all" => csv::Trim::All,
147
+ "headers" => csv::Trim::Headers,
148
+ "fields" => csv::Trim::Fields,
149
+ invalid => {
150
+ return Err(Error::new(
151
+ magnus::exception::runtime_error(),
152
+ format!(
153
+ "trim must be either 'all', 'headers', or 'fields' but got '{}'",
154
+ invalid
155
+ ),
156
+ ))
157
+ }
158
+ },
159
+ Some(Ok(None)) => csv::Trim::None,
160
+ Some(Err(_)) => {
161
+ return Err(Error::new(
162
+ magnus::exception::type_error(),
163
+ "trim must be a String or Symbol",
164
+ ))
165
+ }
166
+ None => csv::Trim::None,
167
+ };
168
+
124
169
  Ok(CsvArgs {
125
170
  to_read,
126
171
  has_headers,
@@ -131,5 +176,6 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
131
176
  result_type,
132
177
  flexible,
133
178
  flexible_default,
179
+ trim,
134
180
  })
135
181
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.10"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -14,12 +14,15 @@ module OSV
14
14
  # an empty string.
15
15
  # - `buffer_size`: Integer specifying the read buffer size
16
16
  # - `result_type`: String specifying the output format
17
- # ("hash" or "array")
17
+ # ("hash" or "array" or :hash or :array)
18
18
  # - `flexible`: Boolean specifying if the parser should be flexible
19
19
  # (default: false)
20
20
  # - `flexible_default`: String specifying the default value for missing fields.
21
21
  # Implicitly enables flexible mode if set.
22
22
  # (default: `nil`)
23
+ # - `trim`: String specifying the trim mode
24
+ # ("all" or "headers" or "fields" or :all or :headers or :fields)
25
+ # (default: `nil`)
23
26
  sig do
24
27
  params(
25
28
  input: T.any(String, StringIO, IO),
@@ -28,9 +31,10 @@ module OSV
28
31
  quote_char: T.nilable(String),
29
32
  nil_string: T.nilable(String),
30
33
  buffer_size: T.nilable(Integer),
31
- result_type: T.nilable(String),
34
+ result_type: T.nilable(T.any(String, Symbol)),
32
35
  flexible: T.nilable(T::Boolean),
33
36
  flexible_default: T.nilable(String),
37
+ trim: T.nilable(T.any(String, Symbol)),
34
38
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
35
39
  ).returns(T.any(Enumerator, T.untyped))
36
40
  end
@@ -44,6 +48,7 @@ module OSV
44
48
  result_type: nil,
45
49
  flexible: nil,
46
50
  flexible_default: nil,
51
+ trim: nil,
47
52
  &blk
48
53
  )
49
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.8
4
+ version: 0.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko