osv 0.3.8 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 02205de8cef4d5f7633c06720a9e925a2b608116354da4a1678d4746d2197d23
4
- data.tar.gz: 3e1d63323fdaad1b6a60e0a0a63801f98710615d6616c882f0cdce00e36c6e2e
3
+ metadata.gz: aed16dbfb14e6caebceb388104731091a5354394cd174804982dbce8f4b95963
4
+ data.tar.gz: 232497c8ec55ab15559f126c4bc90c6bcd0dd4296efadcbf23d07d6a24c7969d
5
5
  SHA512:
6
- metadata.gz: df6a4a4b86c41010ea671ac0e98c2ee6307e62ceff35dab125868f0ee7edb6d14984348ecd4ac9f913489e5a6be0b364240b461334554385aabe5b3374fe798d
7
- data.tar.gz: d931b888ce9d0ad1cdb1fa3d0be8cd0e526292206742f5adde718f414e9feca97eff3af6d4139d144c18a50e4807650ea9f7582153bcee80cea1e6ed4ce4ef49
6
+ metadata.gz: 36e1bab13ede785e2f41f0a56b37a2ff7448deff182b614c71e15ccd16799abe94f1f7db63034cd1c04195666c85a12fa0284dde75753b16e05e8767e9c87b18
7
+ data.tar.gz: '0990fee8e41dd9eb046d6b3733770adb4ef9825165bb0a012b32dac03ddd9d8f2107860d75ef05437aa8d820d42d16c8148642fd3faab326dcab8007731bcf61'
data/Cargo.lock CHANGED
@@ -274,6 +274,7 @@ dependencies = [
274
274
  "serde",
275
275
  "serde_magnus",
276
276
  "thiserror",
277
+ "xxhash-rust",
277
278
  ]
278
279
 
279
280
  [[package]]
@@ -526,3 +527,9 @@ name = "windows_x86_64_msvc"
526
527
  version = "0.52.6"
527
528
  source = "registry+https://github.com/rust-lang/crates.io-index"
528
529
  checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
530
+
531
+ [[package]]
532
+ name = "xxhash-rust"
533
+ version = "0.8.14"
534
+ source = "registry+https://github.com/rust-lang/crates.io-index"
535
+ checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
data/README.md CHANGED
@@ -70,9 +70,10 @@ Both methods support the following options:
70
70
  - by default, empty strings are interpreted as empty strings
71
71
  - if you want to interpret empty strings as nil, set this to an empty string
72
72
  - `buffer_size`: Integer specifying the read buffer size
73
- - `result_type`: String specifying the output format ("hash" or "array")
73
+ - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
74
74
  - `flexible`: Boolean specifying if the parser should be flexible (default: false)
75
75
  - `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
76
+ - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
76
77
 
77
78
  ### Input Sources
78
79
 
@@ -112,45 +113,50 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
112
113
  ### 10,000 lines
113
114
 
114
115
  ```
115
- Benchmarking with 10001 lines of data
116
+ Benchmarking with 100001 lines of data
116
117
 
117
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
118
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
118
119
  Warming up --------------------------------------
119
- OSV - Hash output 6.000 i/100ms
120
+ OSV - Hash output 1.000 i/100ms
120
121
  CSV - Hash output 1.000 i/100ms
121
- OSV - Array output 18.000 i/100ms
122
- CSV - Array output 2.000 i/100ms
122
+ OSV - Array output 1.000 i/100ms
123
+ OSV - Direct Open Array output
124
+ 12.719M i/100ms
125
+ CSV - Array output 1.000 i/100ms
123
126
  FastCSV - Array output
124
- 9.000 i/100ms
125
- OSV - StringIO 7.000 i/100ms
127
+ 1.000 i/100ms
128
+ OSV - StringIO 1.000 i/100ms
126
129
  CSV - StringIO 1.000 i/100ms
127
- FastCSV - StringIO 20.000 i/100ms
128
- OSV - Gzipped 6.000 i/100ms
130
+ FastCSV - StringIO 1.000 i/100ms
131
+ OSV - Gzipped 1.000 i/100ms
129
132
  CSV - Gzipped 1.000 i/100ms
130
133
  Calculating -------------------------------------
131
- OSV - Hash output 73.360 (± 4.1%) i/s (13.63 ms/i) - 366.000 in 5.000390s
132
- CSV - Hash output 11.937 (±25.1%) i/s (83.78 ms/i) - 52.000 in 5.036297s
133
- OSV - Array output 189.738 (± 8.4%) i/s (5.27 ms/i) - 954.000 in 5.071018s
134
- CSV - Array output 25.471 (±11.8%) i/s (39.26 ms/i) - 120.000 in 5.015289s
134
+ OSV - Hash output 6.722 (±14.9%) i/s (148.77 ms/i) - 59.000 in 10.074753s
135
+ CSV - Hash output 1.223 (± 0.0%) i/s (817.62 ms/i) - 13.000 in 10.788284s
136
+ OSV - Array output 17.284 (±11.6%) i/s (57.86 ms/i) - 171.000 in 10.007321s
137
+ OSV - Direct Open Array output
138
+ 213.629M (±13.5%) i/s (4.68 ns/i) - 1.921B in 10.005506s
139
+ CSV - Array output 2.193 (± 0.0%) i/s (455.93 ms/i) - 22.000 in 10.052607s
135
140
  FastCSV - Array output
136
- 97.867 (± 2.0%) i/s (10.22 ms/i) - 495.000 in 5.060957s
137
- OSV - StringIO 80.784 (± 6.2%) i/s (12.38 ms/i) - 406.000 in 5.046696s
138
- CSV - StringIO 15.872 (± 0.0%) i/s (63.01 ms/i) - 80.000 in 5.043361s
139
- FastCSV - StringIO 200.511 (± 2.0%) i/s (4.99 ms/i) - 1.020k in 5.088592s
140
- OSV - Gzipped 55.220 (±12.7%) i/s (18.11 ms/i) - 258.000 in 5.030928s
141
- CSV - Gzipped 12.591 (±15.9%) i/s (79.42 ms/i) - 59.000 in 5.039709s
141
+ 7.993 (± 0.0%) i/s (125.11 ms/i) - 80.000 in 10.053729s
142
+ OSV - StringIO 6.626 (±15.1%) i/s (150.91 ms/i) - 66.000 in 10.103646s
143
+ CSV - StringIO 1.478 (± 0.0%) i/s (676.78 ms/i) - 15.000 in 10.158640s
144
+ FastCSV - StringIO 17.074 (± 5.9%) i/s (58.57 ms/i) - 171.000 in 10.059266s
145
+ OSV - Gzipped 5.639 (± 0.0%) i/s (177.32 ms/i) - 57.000 in 10.152487s
146
+ CSV - Gzipped 1.176 (± 0.0%) i/s (850.19 ms/i) - 12.000 in 10.233398s
142
147
 
143
148
  Comparison:
144
- FastCSV - StringIO: 200.5 i/s
145
- OSV - Array output: 189.7 i/s - same-ish: difference falls within error
146
- FastCSV - Array output: 97.9 i/s - 2.05x slower
147
- OSV - StringIO: 80.8 i/s - 2.48x slower
148
- OSV - Hash output: 73.4 i/s - 2.73x slower
149
- OSV - Gzipped: 55.2 i/s - 3.63x slower
150
- CSV - Array output: 25.5 i/s - 7.87x slower
151
- CSV - StringIO: 15.9 i/s - 12.63x slower
152
- CSV - Gzipped: 12.6 i/s - 15.92x slower
153
- CSV - Hash output: 11.9 i/s - 16.80x slower
149
+ OSV - Direct Open Array output: 213629268.6 i/s
150
+ OSV - Array output: 17.3 i/s - 12360250.79x slower
151
+ FastCSV - StringIO: 17.1 i/s - 12511956.50x slower
152
+ FastCSV - Array output: 8.0 i/s - 26727225.72x slower
153
+ OSV - Hash output: 6.7 i/s - 31780615.83x slower
154
+ OSV - StringIO: 6.6 i/s - 32239620.60x slower
155
+ OSV - Gzipped: 5.6 i/s - 37881517.48x slower
156
+ CSV - Array output: 2.2 i/s - 97400427.87x slower
157
+ CSV - StringIO: 1.5 i/s - 144580048.04x slower
158
+ CSV - Hash output: 1.2 i/s - 174666591.31x slower
159
+ CSV - Gzipped: 1.2 i/s - 181626018.23x slower
154
160
  ```
155
161
 
156
162
  ### 1,000,000 lines
@@ -158,11 +164,13 @@ FastCSV - Array output: 97.9 i/s - 2.05x slower
158
164
  ```
159
165
  Benchmarking with 1000001 lines of data
160
166
 
161
- ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23]
167
+ ruby 3.3.6 (2024-11-05 revision 75015d4c1f) [arm64-darwin24]
162
168
  Warming up --------------------------------------
163
169
  OSV - Hash output 1.000 i/100ms
164
170
  CSV - Hash output 1.000 i/100ms
165
171
  OSV - Array output 1.000 i/100ms
172
+ OSV - Direct Open Array output
173
+ 1.000 i/100ms
166
174
  CSV - Array output 1.000 i/100ms
167
175
  FastCSV - Array output
168
176
  1.000 i/100ms
@@ -172,27 +180,30 @@ FastCSV - Array output
172
180
  OSV - Gzipped 1.000 i/100ms
173
181
  CSV - Gzipped 1.000 i/100ms
174
182
  Calculating -------------------------------------
175
- OSV - Hash output 0.578 (± 0.0%) i/s (1.73 s/i) - 3.000 in 5.287845s
176
- CSV - Hash output 0.117 (± 0.0%) i/s (8.57 s/i) - 1.000 in 8.571770s
177
- OSV - Array output 1.142 (± 0.0%) i/s (875.97 ms/i) - 5.000 in 5.234694s
178
- CSV - Array output 0.235 (± 0.0%) i/s (4.25 s/i) - 2.000 in 8.561144s
183
+ OSV - Hash output 0.492 (± 0.0%) i/s (2.03 s/i) - 5.000 in 10.463278s
184
+ CSV - Hash output 0.114 (± 0.0%) i/s (8.75 s/i) - 2.000 in 17.573877s
185
+ OSV - Array output 1.502 (± 0.0%) i/s (665.58 ms/i) - 14.000 in 10.217551s
186
+ OSV - Direct Open Array output
187
+ 1.626 (± 0.0%) i/s (614.90 ms/i) - 16.000 in 10.190323s
188
+ CSV - Array output 0.183 (± 0.0%) i/s (5.46 s/i) - 2.000 in 10.951943s
179
189
  FastCSV - Array output
180
- 0.768 (± 0.0%) i/s (1.30 s/i) - 4.000 in 6.924574s
181
- OSV - StringIO 0.522 (± 0.0%) i/s (1.91 s/i) - 3.000 in 5.803969s
182
- CSV - StringIO 0.132 (± 0.0%) i/s (7.59 s/i) - 1.000 in 7.593243s
183
- FastCSV - StringIO 1.039 (± 0.0%) i/s (962.53 ms/i) - 6.000 in 5.806644s
184
- OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 3.000 in 6.885125s
185
- CSV - Gzipped 0.115 (± 0.0%) i/s (8.68 s/i) - 1.000 in 8.684069s
190
+ 0.326 (± 0.0%) i/s (3.07 s/i) - 4.000 in 12.340605s
191
+ OSV - StringIO 0.567 (± 0.0%) i/s (1.76 s/i) - 6.000 in 10.698027s
192
+ CSV - StringIO 0.141 (± 0.0%) i/s (7.10 s/i) - 2.000 in 14.237144s
193
+ FastCSV - StringIO 0.923 (± 0.0%) i/s (1.08 s/i) - 10.000 in 11.567775s
194
+ OSV - Gzipped 0.437 (± 0.0%) i/s (2.29 s/i) - 5.000 in 11.452764s
195
+ CSV - Gzipped 0.104 (± 0.0%) i/s (9.64 s/i) - 2.000 in 19.373423s
186
196
 
187
197
  Comparison:
188
- OSV - Array output: 1.1 i/s
189
- FastCSV - StringIO: 1.0 i/s - 1.10x slower
190
- FastCSV - Array output: 0.8 i/s - 1.49x slower
191
- OSV - Hash output: 0.6 i/s - 1.98x slower
192
- OSV - StringIO: 0.5 i/s - 2.19x slower
193
- OSV - Gzipped: 0.4 i/s - 2.61x slower
194
- CSV - Array output: 0.2 i/s - 4.86x slower
195
- CSV - StringIO: 0.1 i/s - 8.67x slower
196
- CSV - Hash output: 0.1 i/s - 9.79x slower
197
- CSV - Gzipped: 0.1 i/s - 9.91x slower
198
+ OSV - Direct Open Array output: 1.6 i/s
199
+ OSV - Array output: 1.5 i/s - 1.08x slower
200
+ FastCSV - StringIO: 0.9 i/s - 1.76x slower
201
+ OSV - StringIO: 0.6 i/s - 2.87x slower
202
+ OSV - Hash output: 0.5 i/s - 3.30x slower
203
+ OSV - Gzipped: 0.4 i/s - 3.72x slower
204
+ FastCSV - Array output: 0.3 i/s - 4.99x slower
205
+ CSV - Array output: 0.2 i/s - 8.88x slower
206
+ CSV - StringIO: 0.1 i/s - 11.55x slower
207
+ CSV - Hash output: 0.1 i/s - 14.24x slower
208
+ CSV - Gzipped: 0.1 i/s - 15.68x slower
198
209
  ```
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Rake::ExtensionTask.new("osv") do |c|
11
11
  end
12
12
 
13
13
  task :dev do
14
- ENV["RB_SYS_CARGO_PROFILE"] = "dev"
14
+ ENV["RB_SYS_CARGO_PROFILE"] = "release"
15
15
  end
16
16
 
17
17
  Rake::TestTask.new do |t|
@@ -20,3 +20,9 @@ Rake::TestTask.new do |t|
20
20
  t.libs << "lib"
21
21
  t.libs << "test"
22
22
  end
23
+
24
+ task :release do
25
+ sh "bundle exec rake test"
26
+ sh "gem build osv.gemspec"
27
+ sh "gem push osv-#{OSV::VERSION}.gem"
28
+ end
data/ext/osv/Cargo.toml CHANGED
@@ -15,3 +15,4 @@ rb-sys = "^0.9"
15
15
  serde = { version = "1.0", features = ["derive"] }
16
16
  serde_magnus = "0.8.1"
17
17
  thiserror = "2.0"
18
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
@@ -3,18 +3,21 @@ use super::{
3
3
  parser::RecordParser,
4
4
  read_impl::ReadImpl,
5
5
  reader::RecordReader,
6
+ READ_BUFFER_SIZE,
6
7
  };
7
8
  use flate2::read::GzDecoder;
8
9
  use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
9
10
  use std::{
10
11
  fs::File,
11
- io::{self, Read},
12
+ io::{self, BufReader, Read},
12
13
  marker::PhantomData,
13
14
  os::fd::FromRawFd,
14
15
  thread,
15
16
  };
16
17
  use thiserror::Error;
17
18
 
19
+ pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
20
+
18
21
  #[derive(Error, Debug)]
19
22
  pub enum ReaderError {
20
23
  #[error("Failed to get file descriptor: {0}")]
@@ -56,6 +59,7 @@ pub struct RecordReaderBuilder<'a, T: RecordParser + Send + 'static> {
56
59
  buffer: usize,
57
60
  flexible: bool,
58
61
  flexible_default: Option<String>,
62
+ trim: csv::Trim,
59
63
  _phantom: PhantomData<T>,
60
64
  }
61
65
 
@@ -68,9 +72,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
68
72
  delimiter: b',',
69
73
  quote_char: b'"',
70
74
  null_string: None,
71
- buffer: 1000,
75
+ buffer: BUFFER_CHANNEL_SIZE,
72
76
  flexible: false,
73
77
  flexible_default: None,
78
+ trim: csv::Trim::None,
74
79
  _phantom: PhantomData,
75
80
  }
76
81
  }
@@ -110,6 +115,11 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
110
115
  self
111
116
  }
112
117
 
118
+ pub fn trim(mut self, trim: csv::Trim) -> Self {
119
+ self.trim = trim;
120
+ self
121
+ }
122
+
113
123
  fn handle_string_io(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
114
124
  let string: RString = self.to_read.funcall("string", ())?;
115
125
  let content = string.to_string()?;
@@ -128,7 +138,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
128
138
  }
129
139
 
130
140
  let file = unsafe { File::from_raw_fd(fd) };
131
- Ok(Box::new(file))
141
+ Ok(Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file)))
132
142
  }
133
143
 
134
144
  fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
@@ -136,24 +146,27 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
136
146
  let file = File::open(&path)?;
137
147
 
138
148
  Ok(if path.ends_with(".gz") {
139
- Box::new(GzDecoder::new(file))
149
+ Box::new(GzDecoder::new(BufReader::with_capacity(
150
+ READ_BUFFER_SIZE,
151
+ file,
152
+ )))
140
153
  } else {
141
- Box::new(file)
154
+ Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
142
155
  })
143
156
  }
144
157
 
145
- fn get_reader(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
158
+ fn get_reader(&self) -> Result<(Box<dyn Read + Send + 'static>, bool), ReaderError> {
146
159
  let string_io: magnus::RClass = self.ruby.eval("StringIO")?;
147
160
  let gzip_reader_class: magnus::RClass = self.ruby.eval("Zlib::GzipReader")?;
148
161
 
149
162
  if self.to_read.is_kind_of(string_io) {
150
- self.handle_string_io()
163
+ self.handle_string_io().map(|r| (r, false))
151
164
  } else if self.to_read.is_kind_of(gzip_reader_class) {
152
165
  Err(ReaderError::UnsupportedGzipReader)
153
166
  } else if self.to_read.is_kind_of(self.ruby.class_io()) {
154
- self.handle_file_descriptor()
167
+ self.handle_file_descriptor().map(|r| (r, true))
155
168
  } else {
156
- self.handle_file_path()
169
+ self.handle_file_path().map(|r| (r, false))
157
170
  }
158
171
  }
159
172
 
@@ -175,7 +188,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
175
188
 
176
189
  pub fn build(self) -> Result<RecordReader<T>, ReaderError> {
177
190
  match self.get_reader() {
178
- Ok(readable) => self.build_multi_threaded(readable),
191
+ Ok((readable, should_forget)) => self.build_multi_threaded(readable, should_forget),
179
192
  Err(_) => {
180
193
  let readable = self.get_single_threaded_reader()?;
181
194
  self.build_single_threaded(readable)
@@ -186,6 +199,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
186
199
  fn build_multi_threaded(
187
200
  self,
188
201
  readable: Box<dyn Read + Send + 'static>,
202
+ should_forget: bool,
189
203
  ) -> Result<RecordReader<T>, ReaderError> {
190
204
  let flexible = self.flexible || self.flexible_default.is_some();
191
205
  let mut reader = csv::ReaderBuilder::new()
@@ -193,6 +207,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
193
207
  .delimiter(self.delimiter)
194
208
  .quote(self.quote_char)
195
209
  .flexible(flexible)
210
+ .trim(self.trim)
196
211
  .from_reader(readable);
197
212
 
198
213
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
@@ -204,7 +219,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
204
219
 
205
220
  let flexible_default = self.flexible_default.clone();
206
221
  let handle = thread::spawn(move || {
207
- let mut record = csv::StringRecord::new();
222
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
208
223
  while let Ok(true) = reader.read_record(&mut record) {
209
224
  let row = T::parse(
210
225
  &static_headers,
@@ -216,8 +231,10 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
216
231
  break;
217
232
  }
218
233
  }
219
- let file_to_forget = reader.into_inner();
220
- std::mem::forget(file_to_forget);
234
+ if should_forget {
235
+ let file_to_forget = reader.into_inner();
236
+ std::mem::forget(file_to_forget);
237
+ }
221
238
  });
222
239
 
223
240
  Ok(RecordReader {
@@ -239,6 +256,7 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
239
256
  .delimiter(self.delimiter)
240
257
  .quote(self.quote_char)
241
258
  .flexible(flexible)
259
+ .trim(self.trim)
242
260
  .from_reader(readable);
243
261
 
244
262
  let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
@@ -257,30 +275,55 @@ impl<'a, T: RecordParser + Send + 'static> RecordReaderBuilder<'a, T> {
257
275
 
258
276
  struct RubyReader {
259
277
  inner: Value,
278
+ buffer: Option<Vec<u8>>,
279
+ offset: usize,
260
280
  }
261
281
 
262
282
  impl RubyReader {
263
283
  fn new(inner: Value) -> Self {
264
- Self { inner }
284
+ Self {
285
+ inner,
286
+ buffer: None,
287
+ offset: 0,
288
+ }
265
289
  }
266
290
  }
267
291
 
292
+ // Read the entire inner into a vector and then read future reads from that vector with offset
268
293
  impl Read for RubyReader {
269
294
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
270
- let result = self.inner.funcall::<_, _, Value>("read", (buf.len(),));
295
+ // If we have an existing buffer, read from it
296
+ if let Some(buffer) = self.buffer.as_ref() {
297
+ let remaining = buffer.len() - self.offset;
298
+ let copy_size = remaining.min(buf.len());
299
+ buf[..copy_size].copy_from_slice(&buffer[self.offset..self.offset + copy_size]);
300
+ self.offset += copy_size;
301
+ return Ok(copy_size);
302
+ }
303
+
304
+ // No buffer yet - read the entire content from Ruby
305
+ let result = self.inner.funcall::<_, _, Value>("read", ());
271
306
  match result {
272
307
  Ok(data) => {
273
308
  if data.is_nil() {
274
- return Ok(0);
309
+ return Ok(0); // EOF
275
310
  }
276
311
 
277
312
  let string = RString::from_value(data).ok_or_else(|| {
278
313
  io::Error::new(io::ErrorKind::Other, "Failed to convert to RString")
279
314
  })?;
280
315
  let bytes = unsafe { string.as_slice() };
281
- let len = bytes.len().min(buf.len());
282
- buf[..len].copy_from_slice(&bytes[..len]);
283
- Ok(len)
316
+
317
+ // Store the entire content in the buffer
318
+ self.buffer = Some(bytes.to_vec());
319
+ self.offset = 0;
320
+
321
+ // Read initial chunk
322
+ let copy_size = bytes.len().min(buf.len());
323
+ buf[..copy_size].copy_from_slice(&bytes[..copy_size]);
324
+ self.offset = copy_size;
325
+
326
+ Ok(copy_size)
284
327
  }
285
328
  Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
286
329
  }
@@ -6,4 +6,6 @@ mod reader;
6
6
  mod record;
7
7
 
8
8
  pub use builder::RecordReaderBuilder;
9
+ pub(crate) use builder::BUFFER_CHANNEL_SIZE;
10
+ pub(crate) use read_impl::READ_BUFFER_SIZE;
9
11
  pub use record::CsvRecord;
@@ -1,4 +1,5 @@
1
1
  use std::collections::HashMap;
2
+ use std::hash::BuildHasher;
2
3
 
3
4
  pub trait RecordParser {
4
5
  type Output;
@@ -11,7 +12,7 @@ pub trait RecordParser {
11
12
  ) -> Self::Output;
12
13
  }
13
14
 
14
- impl RecordParser for HashMap<&'static str, Option<String>> {
15
+ impl<S: BuildHasher + Default> RecordParser for HashMap<&'static str, Option<String>, S> {
15
16
  type Output = Self;
16
17
 
17
18
  #[inline]
@@ -21,21 +22,21 @@ impl RecordParser for HashMap<&'static str, Option<String>> {
21
22
  null_string: Option<&str>,
22
23
  flexible_default: Option<&str>,
23
24
  ) -> Self::Output {
24
- let mut map = HashMap::with_capacity(headers.len());
25
- headers.iter().enumerate().for_each(|(i, header)| {
25
+ let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
26
+ headers.iter().enumerate().for_each(|(i, &header)| {
26
27
  let value = record.get(i).map_or_else(
27
- || flexible_default.map(|s| s.to_string()),
28
+ || flexible_default.map(ToString::to_string),
28
29
  |field| {
29
30
  if null_string == Some(field) {
30
31
  None
31
32
  } else if field.is_empty() {
32
33
  Some(String::new())
33
34
  } else {
34
- Some(field.to_string())
35
+ Some(field.into())
35
36
  }
36
37
  },
37
38
  );
38
- map.insert(*header, value);
39
+ map.insert(header, value);
39
40
  });
40
41
  map
41
42
  }
@@ -53,20 +54,20 @@ impl RecordParser for Vec<Option<String>> {
53
54
  ) -> Self::Output {
54
55
  let target_len = headers.len();
55
56
  let mut vec = Vec::with_capacity(target_len);
56
- vec.extend(record.iter().map(|field| {
57
- if null_string == Some(field) {
57
+ for field in record.iter() {
58
+ let value = if Some(field) == null_string {
58
59
  None
59
60
  } else if field.is_empty() {
60
61
  Some(String::new())
61
62
  } else {
62
- Some(field.to_string())
63
- }
64
- }));
63
+ Some(field.into())
64
+ };
65
+ vec.push(value);
66
+ }
65
67
 
66
- // Fill remaining slots with flexible_default if needed
67
- if let Some(default) = flexible_default {
68
- while vec.len() < target_len {
69
- vec.push(Some(default.to_string()));
68
+ if vec.len() < target_len {
69
+ if let Some(default) = flexible_default {
70
+ vec.resize_with(target_len, || Some(default.to_string()));
70
71
  }
71
72
  }
72
73
  vec
@@ -1,6 +1,8 @@
1
1
  use super::{header_cache::StringCache, parser::RecordParser};
2
2
  use std::{io::Read, thread};
3
3
 
4
+ pub(crate) const READ_BUFFER_SIZE: usize = 8192;
5
+
4
6
  pub enum ReadImpl<T: RecordParser> {
5
7
  SingleThreaded {
6
8
  reader: csv::Reader<Box<dyn Read>>,
@@ -36,7 +38,7 @@ impl<T: RecordParser> ReadImpl<T> {
36
38
  null_string,
37
39
  flexible_default,
38
40
  } => {
39
- let mut record = csv::StringRecord::new();
41
+ let mut record = csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers.len());
40
42
  match reader.read_record(&mut record) {
41
43
  Ok(true) => Some(T::parse(
42
44
  headers,
@@ -1,13 +1,13 @@
1
1
  use magnus::{IntoValue, Ruby, Value};
2
- use std::collections::HashMap;
2
+ use std::{collections::HashMap, hash::BuildHasher};
3
3
 
4
4
  #[derive(Debug)]
5
- pub enum CsvRecord {
5
+ pub enum CsvRecord<S: BuildHasher + Default> {
6
6
  Vec(Vec<Option<String>>),
7
- Map(HashMap<&'static str, Option<String>>),
7
+ Map(HashMap<&'static str, Option<String>, S>),
8
8
  }
9
9
 
10
- impl IntoValue for CsvRecord {
10
+ impl<S: BuildHasher + Default> IntoValue for CsvRecord<S> {
11
11
  #[inline]
12
12
  fn into_value_with(self, handle: &Ruby) -> Value {
13
13
  match self {
@@ -1,13 +1,15 @@
1
1
  use crate::csv::{CsvRecord, RecordReaderBuilder};
2
2
  use crate::utils::*;
3
+ use csv::Trim;
3
4
  use magnus::value::ReprValue;
4
5
  use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
5
6
  use std::collections::HashMap;
7
+ use xxhash_rust::xxh3::Xxh3Builder;
6
8
 
7
9
  pub fn parse_csv(
8
10
  rb_self: Value,
9
11
  args: &[Value],
10
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
12
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
11
13
  let ruby = unsafe { Ruby::get_unchecked() };
12
14
 
13
15
  let CsvArgs {
@@ -20,6 +22,7 @@ pub fn parse_csv(
20
22
  result_type,
21
23
  flexible,
22
24
  flexible_default,
25
+ trim,
23
26
  } = parse_csv_args(&ruby, args)?;
24
27
 
25
28
  if !ruby.block_given() {
@@ -34,27 +37,37 @@ pub fn parse_csv(
34
37
  result_type,
35
38
  flexible,
36
39
  flexible_default,
40
+ trim: match trim {
41
+ Trim::All => Some("all".to_string()),
42
+ Trim::Headers => Some("headers".to_string()),
43
+ Trim::Fields => Some("fields".to_string()),
44
+ _ => None,
45
+ },
37
46
  });
38
47
  }
39
48
 
40
- let iter: Box<dyn Iterator<Item = CsvRecord>> = match result_type.as_str() {
49
+ let iter: Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>> = match result_type.as_str() {
41
50
  "hash" => Box::new(
42
- RecordReaderBuilder::<HashMap<&'static str, Option<String>>>::new(&ruby, to_read)
43
- .has_headers(has_headers)
44
- .flexible(flexible)
45
- .flexible_default(flexible_default)
46
- .delimiter(delimiter)
47
- .quote_char(quote_char)
48
- .null_string(null_string)
49
- .buffer(buffer_size)
50
- .build()?
51
- .map(CsvRecord::Map),
51
+ RecordReaderBuilder::<HashMap<&'static str, Option<String>, Xxh3Builder>>::new(
52
+ &ruby, to_read,
53
+ )
54
+ .has_headers(has_headers)
55
+ .flexible(flexible)
56
+ .flexible_default(flexible_default)
57
+ .trim(trim)
58
+ .delimiter(delimiter)
59
+ .quote_char(quote_char)
60
+ .null_string(null_string)
61
+ .buffer(buffer_size)
62
+ .build()?
63
+ .map(CsvRecord::Map),
52
64
  ),
53
65
  "array" => Box::new(
54
66
  RecordReaderBuilder::<Vec<Option<String>>>::new(&ruby, to_read)
55
67
  .has_headers(has_headers)
56
68
  .flexible(flexible)
57
69
  .flexible_default(flexible_default)
70
+ .trim(trim)
58
71
  .delimiter(delimiter)
59
72
  .quote_char(quote_char)
60
73
  .null_string(null_string)
@@ -84,11 +97,12 @@ struct EnumeratorArgs {
84
97
  result_type: String,
85
98
  flexible: bool,
86
99
  flexible_default: Option<String>,
100
+ trim: Option<String>,
87
101
  }
88
102
 
89
103
  fn create_enumerator(
90
104
  args: EnumeratorArgs,
91
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord>>>, Error> {
105
+ ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<Xxh3Builder>>>>, Error> {
92
106
  let kwargs = RHash::new();
93
107
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
94
108
  kwargs.aset(
@@ -104,6 +118,7 @@ fn create_enumerator(
104
118
  kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
105
119
  kwargs.aset(Symbol::new("flexible"), args.flexible)?;
106
120
  kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
121
+ kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
107
122
  let enumerator = args
108
123
  .rb_self
109
124
  .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
data/ext/osv/src/utils.rs CHANGED
@@ -4,6 +4,29 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::csv::BUFFER_CHANNEL_SIZE;
8
+
9
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
10
+ if value.is_nil() {
11
+ Ok(None)
12
+ } else if value.is_kind_of(ruby.class_string()) {
13
+ RString::from_value(value)
14
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
15
+ .to_string()
16
+ .map(|s| Some(s))
17
+ } else if value.is_kind_of(ruby.class_symbol()) {
18
+ Symbol::from_value(value)
19
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
20
+ .funcall("to_s", ())
21
+ .map(|s| Some(s))
22
+ } else {
23
+ Err(Error::new(
24
+ magnus::exception::type_error(),
25
+ "Value must be a String or Symbol",
26
+ ))
27
+ }
28
+ }
29
+
7
30
  #[derive(Debug)]
8
31
  pub struct CsvArgs {
9
32
  pub to_read: Value,
@@ -15,6 +38,7 @@ pub struct CsvArgs {
15
38
  pub result_type: String,
16
39
  pub flexible: bool,
17
40
  pub flexible_default: Option<String>,
41
+ pub trim: csv::Trim,
18
42
  }
19
43
 
20
44
  /// Parse common arguments for CSV parsing
@@ -34,6 +58,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
34
58
  Option<Value>,
35
59
  Option<bool>,
36
60
  Option<Option<String>>,
61
+ Option<Value>,
37
62
  ),
38
63
  (),
39
64
  >(
@@ -48,6 +73,7 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
48
73
  "result_type",
49
74
  "flexible",
50
75
  "flexible_default",
76
+ "trim",
51
77
  ],
52
78
  )?;
53
79
 
@@ -81,38 +107,28 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
81
107
 
82
108
  let null_string = kwargs.optional.3.unwrap_or_default();
83
109
 
84
- let buffer_size = kwargs.optional.4.unwrap_or(1000);
110
+ let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
85
111
 
86
- let result_type = match kwargs.optional.5 {
87
- Some(value) => {
88
- let parsed = if value.is_kind_of(ruby.class_string()) {
89
- RString::from_value(value)
90
- .ok_or_else(|| {
91
- Error::new(magnus::exception::type_error(), "Invalid string value")
92
- })?
93
- .to_string()?
94
- } else if value.is_kind_of(ruby.class_symbol()) {
95
- Symbol::from_value(value)
96
- .ok_or_else(|| {
97
- Error::new(magnus::exception::type_error(), "Invalid symbol value")
98
- })?
99
- .funcall("to_s", ())?
100
- } else {
112
+ let result_type = match kwargs
113
+ .optional
114
+ .5
115
+ .map(|value| parse_string_or_symbol(ruby, value))
116
+ {
117
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
118
+ "hash" | "array" => parsed,
119
+ _ => {
101
120
  return Err(Error::new(
102
- magnus::exception::type_error(),
103
- "result_type must be a String or Symbol",
104
- ));
105
- };
106
-
107
- match parsed.as_str() {
108
- "hash" | "array" => parsed,
109
- _ => {
110
- return Err(Error::new(
111
- magnus::exception::runtime_error(),
112
- "result_type must be either 'hash' or 'array'",
113
- ))
114
- }
121
+ magnus::exception::runtime_error(),
122
+ "result_type must be either 'hash' or 'array'",
123
+ ))
115
124
  }
125
+ },
126
+ Some(Ok(None)) => String::from("hash"),
127
+ Some(Err(_)) => {
128
+ return Err(Error::new(
129
+ magnus::exception::type_error(),
130
+ "result_type must be a String or Symbol",
131
+ ))
116
132
  }
117
133
  None => String::from("hash"),
118
134
  };
@@ -121,6 +137,35 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
121
137
 
122
138
  let flexible_default = kwargs.optional.7.unwrap_or_default();
123
139
 
140
+ let trim = match kwargs
141
+ .optional
142
+ .8
143
+ .map(|value| parse_string_or_symbol(ruby, value))
144
+ {
145
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
146
+ "all" => csv::Trim::All,
147
+ "headers" => csv::Trim::Headers,
148
+ "fields" => csv::Trim::Fields,
149
+ invalid => {
150
+ return Err(Error::new(
151
+ magnus::exception::runtime_error(),
152
+ format!(
153
+ "trim must be either 'all', 'headers', or 'fields' but got '{}'",
154
+ invalid
155
+ ),
156
+ ))
157
+ }
158
+ },
159
+ Some(Ok(None)) => csv::Trim::None,
160
+ Some(Err(_)) => {
161
+ return Err(Error::new(
162
+ magnus::exception::type_error(),
163
+ "trim must be a String or Symbol",
164
+ ))
165
+ }
166
+ None => csv::Trim::None,
167
+ };
168
+
124
169
  Ok(CsvArgs {
125
170
  to_read,
126
171
  has_headers,
@@ -131,5 +176,6 @@ pub fn parse_csv_args(ruby: &Ruby, args: &[Value]) -> Result<CsvArgs, Error> {
131
176
  result_type,
132
177
  flexible,
133
178
  flexible_default,
179
+ trim,
134
180
  })
135
181
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.10"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -14,12 +14,15 @@ module OSV
14
14
  # an empty string.
15
15
  # - `buffer_size`: Integer specifying the read buffer size
16
16
  # - `result_type`: String specifying the output format
17
- # ("hash" or "array")
17
+ # ("hash" or "array" or :hash or :array)
18
18
  # - `flexible`: Boolean specifying if the parser should be flexible
19
19
  # (default: false)
20
20
  # - `flexible_default`: String specifying the default value for missing fields.
21
21
  # Implicitly enables flexible mode if set.
22
22
  # (default: `nil`)
23
+ # - `trim`: String specifying the trim mode
24
+ # ("all" or "headers" or "fields" or :all or :headers or :fields)
25
+ # (default: `nil`)
23
26
  sig do
24
27
  params(
25
28
  input: T.any(String, StringIO, IO),
@@ -28,9 +31,10 @@ module OSV
28
31
  quote_char: T.nilable(String),
29
32
  nil_string: T.nilable(String),
30
33
  buffer_size: T.nilable(Integer),
31
- result_type: T.nilable(String),
34
+ result_type: T.nilable(T.any(String, Symbol)),
32
35
  flexible: T.nilable(T::Boolean),
33
36
  flexible_default: T.nilable(String),
37
+ trim: T.nilable(T.any(String, Symbol)),
34
38
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
35
39
  ).returns(T.any(Enumerator, T.untyped))
36
40
  end
@@ -44,6 +48,7 @@ module OSV
44
48
  result_type: nil,
45
49
  flexible: nil,
46
50
  flexible_default: nil,
51
+ trim: nil,
47
52
  &blk
48
53
  )
49
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.8
4
+ version: 0.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko