osv 0.3.16 → 0.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -26
- data/ext/osv/src/csv/builder.rs +92 -85
- data/ext/osv/src/csv/header_cache.rs +28 -2
- data/ext/osv/src/csv/mod.rs +1 -2
- data/ext/osv/src/csv/parser.rs +16 -80
- data/ext/osv/src/csv/record.rs +4 -4
- data/ext/osv/src/csv/record_reader.rs +51 -117
- data/ext/osv/src/csv/ruby_integration.rs +10 -21
- data/ext/osv/src/csv/ruby_reader.rs +8 -1
- data/ext/osv/src/reader.rs +64 -46
- data/ext/osv/src/utils.rs +15 -19
- data/lib/osv/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: da944a5af1cc88630fe0952e6e710d2acb8ac420ae8708a107064f5ecf444dec
+  data.tar.gz: bd6de3860ff2f47eb03b9019d307d647fa8c2e8f366543fbe95604f284871b62
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a130687fb25aaae3734f2e69c2258ccf893c584cd0c2893b751282b393ee4d52b2317a338f1ef68a864222e4947614ffdca7e6b98d8d37dc934dfede61f7bc1
+  data.tar.gz: 332a5dc1c6ce6df721b22f9e66b54d48426da3a0148917f9ec13036edd63e1fb70a950a2971964d289e076536af47d090c89fd95961d8ca4b51f1f1b8a221a98
data/README.md
CHANGED
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
 FastCSV - Gzipped 1.000 i/100ms
 CSV - Gzipped 1.000 i/100ms
 Calculating -------------------------------------
-CSV - StringIO 0.
-FastCSV - StringIO 0.
-OSV - StringIO 0.
-CSV - Hash output 0.
-OSV - Hash output 0.
-CSV - Array output 0.
-OSV - Array output 0.
+CSV - StringIO 0.083 (± 0.0%) i/s (12.06 s/i) - 3.000 in 36.304469s
+FastCSV - StringIO 0.335 (± 0.0%) i/s (2.98 s/i) - 10.000 in 31.019521s
+OSV - StringIO 0.705 (± 0.0%) i/s (1.42 s/i) - 21.000 in 30.629511s
+CSV - Hash output 0.060 (± 0.0%) i/s (16.74 s/i) - 2.000 in 33.475977s
+OSV - Hash output 0.434 (± 0.0%) i/s (2.30 s/i) - 13.000 in 30.071679s
+CSV - Array output 0.063 (± 0.0%) i/s (15.88 s/i) - 2.000 in 32.229906s
+OSV - Array output 0.406 (± 0.0%) i/s (2.47 s/i) - 12.000 in 31.072600s
 FastCSV - Array output
-0.
+0.321 (± 0.0%) i/s (3.11 s/i) - 10.000 in 31.458966s
 OSV - Direct Open Array output
-0.
-OSV - Gzipped 0.
-OSV - Gzipped Direct 0.
-FastCSV - Gzipped 0.
-CSV - Gzipped 0.
+0.686 (± 0.0%) i/s (1.46 s/i) - 21.000 in 30.639715s
+OSV - Gzipped 0.524 (± 0.0%) i/s (1.91 s/i) - 16.000 in 30.695259s
+OSV - Gzipped Direct 0.519 (± 0.0%) i/s (1.93 s/i) - 16.000 in 30.830005s
+FastCSV - Gzipped 0.313 (± 0.0%) i/s (3.20 s/i) - 10.000 in 32.031002s
+CSV - Gzipped 0.057 (± 0.0%) i/s (17.55 s/i) - 2.000 in 35.107808s
 
 Comparison:
-OSV -
-
-OSV -
-
-
-
-FastCSV -
-
-FastCSV - Gzipped:
-CSV - StringIO:
-CSV - Array output:
-CSV - Hash output:
-CSV - Gzipped:
+OSV - StringIO:                 0.7 i/s
+OSV - Direct Open Array output: 0.7 i/s - 1.03x slower
+OSV - Gzipped:                  0.5 i/s - 1.34x slower
+OSV - Gzipped Direct:           0.5 i/s - 1.36x slower
+OSV - Hash output:              0.4 i/s - 1.62x slower
+OSV - Array output:             0.4 i/s - 1.74x slower
+FastCSV - StringIO:             0.3 i/s - 2.10x slower
+FastCSV - Array output:         0.3 i/s - 2.20x slower
+FastCSV - Gzipped:              0.3 i/s - 2.26x slower
+CSV - StringIO:                 0.1 i/s - 8.50x slower
+CSV - Array output:             0.1 i/s - 11.20x slower
+CSV - Hash output:              0.1 i/s - 11.80x slower
+CSV - Gzipped:                  0.1 i/s - 12.37x slower
 ```
data/ext/osv/src/csv/builder.rs
CHANGED
@@ -6,8 +6,10 @@ use super::{
     ForgottenFileHandle,
 };
 use flate2::read::GzDecoder;
-use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
 use std::{
+    borrow::Cow,
+    fmt::Debug,
     fs::File,
     io::{self, BufReader, Read},
     marker::PhantomData,
@@ -17,18 +19,21 @@ use std::{
 
 use thiserror::Error;
 
-
-
+/// Errors that can occur when building a RecordReader
 #[derive(Error, Debug)]
 pub enum ReaderError {
     #[error("Failed to get file descriptor: {0}")]
     FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
+    #[error("Invalid file descriptor: {0}")]
+    InvalidFileDescriptor(i32),
     #[error("Failed to open file: {0}")]
     FileOpen(#[from] io::Error),
     #[error("Failed to intern headers: {0}")]
     HeaderIntern(#[from] CacheError),
+    #[error("Invalid flexible default value: {0}")]
+    InvalidFlexibleDefault(String),
+    #[error("Invalid null string value: {0}")]
+    InvalidNullString(String),
     #[error("Ruby error: {0}")]
     Ruby(String),
 }
@@ -48,63 +53,27 @@ impl From<ReaderError> for MagnusError {
     }
 }
 
-
-
+/// Builder for configuring and creating a RecordReader instance.
+///
+/// This struct provides a fluent interface for setting up CSV parsing options
+/// and creating a RecordReader with the specified configuration.
+pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
+    ruby: Ruby,
     to_read: Value,
     has_headers: bool,
     delimiter: u8,
     quote_char: u8,
     null_string: Option<String>,
-    buffer: usize,
     flexible: bool,
-    flexible_default: Option
+    flexible_default: Option<String>,
     trim: csv::Trim,
     _phantom: PhantomData<T>,
+    _phantom_a: PhantomData<&'a ()>,
 }
 
-impl<T: RecordParser<'
-
-
-        readable: Box<dyn Read + Send + 'static>,
-    ) -> Result<RecordReader<'static, T>, ReaderError> {
-        let flexible = self.flexible || self.flexible_default.is_some();
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(self.has_headers)
-            .delimiter(self.delimiter)
-            .quote(self.quote_char)
-            .flexible(flexible)
-            .trim(self.trim)
-            .from_reader(readable);
-
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
-        let static_headers = StringCache::intern_many(&headers)?;
-
-        Ok(RecordReader::new_multi_threaded(
-            reader,
-            static_headers,
-            self.buffer,
-            self.null_string,
-            self.flexible_default,
-        ))
-    }
-
-    pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
-        if self.to_read.is_kind_of(self.ruby.class_io()) {
-            let readable = self.handle_file_descriptor()?;
-            self.build_multi_threaded(readable)
-        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
-            let readable = self.handle_file_path()?;
-            self.build_multi_threaded(readable)
-        } else {
-            let readable = build_ruby_reader(self.ruby, self.to_read)?;
-            let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
-            self.build_single_threaded(buffered_reader)
-        }
-    }
-}
-
-impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
-    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
+    /// Creates a new builder instance with default settings.
+    pub fn new(ruby: Ruby, to_read: Value) -> Self {
         Self {
            ruby,
            to_read,
@@ -112,92 +81,107 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
            delimiter: b',',
            quote_char: b'"',
            null_string: None,
-           buffer: BUFFER_CHANNEL_SIZE,
            flexible: false,
            flexible_default: None,
            trim: csv::Trim::None,
            _phantom: PhantomData,
+           _phantom_a: PhantomData,
         }
     }
 
+    /// Sets whether the CSV file has headers.
+    #[must_use]
     pub fn has_headers(mut self, has_headers: bool) -> Self {
         self.has_headers = has_headers;
         self
     }
 
+    /// Sets the delimiter character for the CSV.
+    #[must_use]
     pub fn delimiter(mut self, delimiter: u8) -> Self {
         self.delimiter = delimiter;
         self
     }
 
+    /// Sets the quote character for the CSV.
+    #[must_use]
     pub fn quote_char(mut self, quote_char: u8) -> Self {
         self.quote_char = quote_char;
         self
     }
 
+    /// Sets the string that should be interpreted as null.
+    #[must_use]
     pub fn null_string(mut self, null_string: Option<String>) -> Self {
         self.null_string = null_string;
         self
     }
 
-
-
-        self
-    }
-
+    /// Sets whether the reader should be flexible with field counts.
+    #[must_use]
     pub fn flexible(mut self, flexible: bool) -> Self {
         self.flexible = flexible;
         self
     }
 
-
+    /// Sets the default value for missing fields when in flexible mode.
+    #[must_use]
+    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
         self.flexible_default = flexible_default;
         self
     }
 
+    /// Sets the trimming mode for fields.
+    #[must_use]
     pub fn trim(mut self, trim: csv::Trim) -> Self {
         self.trim = trim;
         self
     }
 
-
+    /// Handles reading from a file descriptor.
+    fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let raw_value = self.to_read.as_raw();
         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
+            .map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
 
         if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor);
+            return Err(ReaderError::InvalidFileDescriptor(fd));
         }
 
         let file = unsafe { File::from_raw_fd(fd) };
         let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
-        Ok(Box::new(
-            READ_BUFFER_SIZE,
-            forgotten,
-        )))
+        Ok(Box::new(forgotten))
     }
 
-
+    /// Handles reading from a file path.
+    fn handle_file_path(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let path = self.to_read.to_r_string()?.to_string()?;
         let file = File::open(&path)?;
 
-
-
-
-
-            )
+        if path.ends_with(".gz") {
+            // For gzipped files, we need to decompress them into memory first
+            // since GzDecoder doesn't support seeking
+            let mut decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file));
+            let mut contents = Vec::new();
+            decoder.read_to_end(&mut contents)?;
+            Ok(Box::new(std::io::Cursor::new(contents)))
         } else {
-            Box::new(
-        }
+            Ok(Box::new(file))
+        }
     }
 
-
-
-            readable
-
+    /// Builds the RecordReader with the configured options.
+    pub fn build(self) -> Result<RecordReader<'a, T>, ReaderError> {
+        let readable = if self.to_read.is_kind_of(self.ruby.class_io()) {
+            self.handle_file_descriptor()?
+        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
+            self.handle_file_path()?
+        } else {
+            build_ruby_reader(&self.ruby, self.to_read)?
+        };
+
         let flexible = self.flexible || self.flexible_default.is_some();
+        let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
 
         let mut reader = csv::ReaderBuilder::new()
             .has_headers(self.has_headers)
@@ -205,16 +189,39 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             .quote(self.quote_char)
             .flexible(flexible)
             .trim(self.trim)
-            .from_reader(
+            .from_reader(reader);
 
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
         let static_headers = StringCache::intern_many(&headers)?;
 
-
+        // We intern both of these to get static string references we can reuse throughout the parser.
+        let flexible_default = self
+            .flexible_default
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        let null_string = self
+            .null_string
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+
+        Ok(RecordReader::new(
             reader,
             static_headers,
-
-
+            null_string,
+            flexible_default,
         ))
     }
 }
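The builder diff above collapses the old threaded/non-threaded build paths into a single `build()` and marks every fluent setter `#[must_use]`. As a rough, self-contained illustration of that consuming-setter style — using a made-up `CsvOptions` type rather than the gem's actual `RecordReaderBuilder` — the pattern looks like this:

```rust
// Minimal sketch of a consuming fluent builder; names are illustrative only.
#[derive(Debug)]
struct CsvOptions {
    has_headers: bool,
    delimiter: u8,
    quote_char: u8,
    flexible_default: Option<String>,
}

impl CsvOptions {
    fn new() -> Self {
        Self { has_headers: true, delimiter: b',', quote_char: b'"', flexible_default: None }
    }

    // Each setter takes `self` by value and returns it, so calls chain.
    // `#[must_use]` (as added in the diff) makes the compiler warn if a
    // chained result is accidentally discarded.
    #[must_use]
    fn has_headers(mut self, v: bool) -> Self { self.has_headers = v; self }
    #[must_use]
    fn delimiter(mut self, v: u8) -> Self { self.delimiter = v; self }
    #[must_use]
    fn quote_char(mut self, v: u8) -> Self { self.quote_char = v; self }
    #[must_use]
    fn flexible_default(mut self, v: Option<String>) -> Self { self.flexible_default = v; self }
}

fn main() {
    let opts = CsvOptions::new()
        .has_headers(false)
        .delimiter(b'\t')
        .flexible_default(Some(String::new()));
    println!("{opts:?}");
}
```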
data/ext/osv/src/csv/header_cache.rs
CHANGED
@@ -1,4 +1,3 @@
-use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 /// This module exists to avoid cloning header keys in returned HashMaps.
 /// Since the underlying RString creation already involves cloning,
 /// this caching layer aims to reduce redundant allocations.
@@ -7,8 +6,14 @@ use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 /// so this optimization could be removed if any issues arise.
 use std::{
     collections::HashMap,
-    sync::{
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        LazyLock, Mutex, OnceLock,
+    },
 };
+
+use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
+
 use thiserror::Error;
 
 #[derive(Debug, Error)]
@@ -132,3 +137,24 @@ impl StringCache {
         Ok(())
     }
 }
+
+pub struct HeaderCacheCleanupIter<I> {
+    pub inner: I,
+    pub headers: OnceLock<Vec<StringCacheKey>>,
+}
+
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}
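The `HeaderCacheCleanupIter` added above ties cache cleanup to `Drop`, so interned header strings are released even when iteration stops early. A self-contained sketch of that pattern, with a plain `Vec<String>` and a stub `release` function standing in for the real `StringCache`:

```rust
// Illustrative only: `CleanupIter` and `release` are stand-ins, not the gem's types.
struct CleanupIter<I> {
    inner: I,
    headers: Vec<String>,
}

impl<I: Iterator> Iterator for CleanupIter<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        // Pure delegation; the wrapper adds no per-item cost.
        self.inner.next()
    }
}

impl<I> Drop for CleanupIter<I> {
    fn drop(&mut self) {
        // Runs whether the iterator was exhausted or abandoned early,
        // which is the point of tying cleanup to Drop.
        release(&self.headers);
    }
}

fn release(headers: &[String]) {
    // Placeholder for decrementing interned-string reference counts.
    let _ = headers;
}

fn main() {
    let it = CleanupIter { inner: 0..3, headers: vec!["a".into(), "b".into()] };
    // Stop early: Drop still runs and releases the headers.
    for n in it.take(2) {
        println!("{n}");
    }
}
```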
data/ext/osv/src/csv/mod.rs
CHANGED
@@ -7,8 +7,7 @@ mod ruby_integration;
 mod ruby_reader;
 
 pub use builder::RecordReaderBuilder;
-pub(crate) use builder::BUFFER_CHANNEL_SIZE;
 pub use header_cache::StringCacheKey;
-pub use record::
+pub use record::CowStr;
 pub use record::CsvRecord;
 pub use ruby_integration::*;
data/ext/osv/src/csv/parser.rs
CHANGED
@@ -3,21 +3,21 @@ use std::collections::HashMap;
 use std::hash::BuildHasher;
 
 use super::header_cache::StringCacheKey;
-use super::
+use super::CowStr;
 
 pub trait RecordParser<'a> {
-    type Output
+    type Output;
 
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output;
 }
 
-impl<'a, S: BuildHasher + Default
-    for HashMap<StringCacheKey, Option<
+impl<'a, S: BuildHasher + Default> RecordParser<'a>
+    for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
 {
     type Output = Self;
 
@@ -25,23 +25,23 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output {
         let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
 
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(
+        let shared_default = flexible_default.map(CowStr);
         headers.iter().enumerate().for_each(|(i, ref header)| {
             let value = record.get(i).map_or_else(
                 || shared_default.clone(),
                 |field| {
-                    if null_string == Some(field) {
+                    if null_string.as_deref() == Some(field) {
                         None
                     } else if field.is_empty() {
-                        Some(
+                        Some(CowStr(shared_empty.clone()))
                     } else {
-                        Some(
+                        Some(CowStr(Cow::Owned(field.to_string())))
                     }
                 },
             );
@@ -51,29 +51,29 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
     }
 }
 
-impl<'a> RecordParser<'a> for Vec<Option<
+impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
     type Output = Self;
 
     #[inline]
     fn parse(
         headers: &[StringCacheKey],
         record: &csv::StringRecord,
-        null_string: Option
+        null_string: Option<Cow<'a, str>>,
         flexible_default: Option<Cow<'a, str>>,
     ) -> Self::Output {
         let target_len = headers.len();
         let mut vec = Vec::with_capacity(target_len);
 
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(
+        let shared_default = flexible_default.map(CowStr);
 
         for field in record.iter() {
-            let value = if Some(field) == null_string {
+            let value = if Some(field) == null_string.as_deref() {
                 None
             } else if field.is_empty() {
-                Some(
+                Some(CowStr(shared_empty.clone()))
             } else {
-                Some(
+                Some(CowStr(Cow::Owned(field.to_string())))
             };
             vec.push(value);
         }
@@ -86,67 +86,3 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
         vec
     }
 }
-
-// impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
-//     for HashMap<&'static str, Option<String>, S>
-// {
-//     type Output = Self;
-
-//     #[inline]
-//     fn parse(
-//         headers: &[&'static str],
-//         record: &csv::StringRecord,
-//         null_string: Option<&str>,
-//         flexible_default: Option<Cow<'a, str>>,
-//     ) -> Self::Output {
-//         let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
-//         headers.iter().enumerate().for_each(|(i, &header)| {
-//             let value = record.get(i).map_or_else(
-//                 || flexible_default.clone(),
-//                 |field| {
-//                     if null_string == Some(field) {
-//                         None
-//                     } else if field.is_empty() {
-//                         Some(String::new())
-//                     } else {
-//                         Some(field.into())
-//                     }
-//                 },
-//             );
-//             map.insert(header, value);
-//         });
-//         map
-//     }
-// }
-
-// impl<'a> RecordParser<'a> for Vec<Option<String>> {
-//     type Output = Self;
-
-//     #[inline]
-//     fn parse(
-//         headers: &[&'static str],
-//         record: &csv::StringRecord,
-//         null_string: Option<&str>,
-//         flexible_default: Option<Cow<'a, str>>,
-//     ) -> Self::Output {
-//         let target_len = headers.len();
-//         let mut vec = Vec::with_capacity(target_len);
-//         for field in record.iter() {
-//             let value = if Some(field) == null_string {
-//                 None
-//             } else if field.is_empty() {
-//                 Some(String::new())
-//             } else {
-//                 Some(field.into())
-//             };
-//             vec.push(value);
-//         }
-
-//         if vec.len() < target_len {
-//             if let Some(default) = flexible_default {
-//                 vec.resize_with(target_len, || Some(default.to_string()));
-//             }
-//         }
-//         vec
-//     }
-// }
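Both parser impls above follow the same field-handling scheme: the configured null marker becomes `None`, empty fields clone one borrowed `Cow<str>` (no allocation), and only non-empty fields pay for an owned `String`. A minimal self-contained sketch of that logic, with illustrative names and without the `CowStr` newtype:

```rust
// Sketch only; `parse_fields` is not part of the gem's API.
use std::borrow::Cow;

fn parse_fields<'a>(fields: &[&str], null_string: Option<&str>) -> Vec<Option<Cow<'a, str>>> {
    let shared_empty: Cow<'a, str> = Cow::Borrowed("");
    fields
        .iter()
        .map(|&field| {
            if null_string == Some(field) {
                None // the configured null marker becomes nil on the Ruby side
            } else if field.is_empty() {
                Some(shared_empty.clone()) // cloning a Borrowed Cow copies no string data
            } else {
                Some(Cow::Owned(field.to_string()))
            }
        })
        .collect()
}

fn main() {
    let row = ["alice", "", "NULL", "42"];
    let parsed = parse_fields(&row, Some("NULL"));
    assert_eq!(parsed[1].as_deref(), Some(""));
    assert_eq!(parsed[2], None);
    println!("{parsed:?}");
}
```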
data/ext/osv/src/csv/record.rs
CHANGED
@@ -6,8 +6,8 @@ use super::StringCacheKey;
 
 #[derive(Debug)]
 pub enum CsvRecord<'a, S: BuildHasher + Default> {
-    Vec(Vec<Option<
-    Map(HashMap<StringCacheKey, Option<
+    Vec(Vec<Option<CowStr<'a>>>),
+    Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
 }
 
 impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -46,9 +46,9 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
 }
 
 #[derive(Debug, Clone)]
-pub struct
+pub struct CowStr<'a>(pub Cow<'a, str>);
 
-impl IntoValue for
+impl IntoValue for CowStr<'_> {
     fn into_value_with(self, handle: &Ruby) -> Value {
         self.0.into_value_with(handle)
     }
data/ext/osv/src/csv/record_reader.rs
CHANGED
@@ -2,32 +2,34 @@ use super::header_cache::StringCacheKey;
 use super::parser::RecordParser;
 use super::{header_cache::StringCache, ruby_reader::SeekableRead};
 use magnus::{Error, Ruby};
-use std::
-use std::{
+use std::borrow::Cow;
+use std::io::{BufReader, Read};
 
+/// Size of the internal buffer used for reading CSV records
 pub(crate) const READ_BUFFER_SIZE: usize = 16384;
 
+/// A reader that processes CSV records using a specified parser.
+///
+/// This struct implements Iterator to provide a streaming interface for CSV records.
 pub struct RecordReader<'a, T: RecordParser<'a>> {
-
-
-
-
-
-
-        reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<StringCacheKey>,
-        null_string: Option<String>,
-        flexible_default: Option<Cow<'a, str>>,
-        string_record: csv::StringRecord,
-    },
-    MultiThreaded {
-        headers: Vec<StringCacheKey>,
-        receiver: kanal::Receiver<T::Output>,
-        handle: Option<thread::JoinHandle<()>>,
-    },
+    reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
+    headers: Vec<StringCacheKey>,
+    null_string: Option<Cow<'a, str>>,
+    flexible_default: Option<Cow<'a, str>>,
+    string_record: csv::StringRecord,
+    parser: std::marker::PhantomData<T>,
 }
 
 impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
+    /// Reads and processes headers from a CSV reader.
+    ///
+    /// # Arguments
+    /// * `ruby` - Ruby VM context for error handling
+    /// * `reader` - CSV reader instance
+    /// * `has_headers` - Whether the CSV file contains headers
+    ///
+    /// # Returns
+    /// A vector of header strings or generated column names if `has_headers` is false
     #[inline]
     pub(crate) fn get_headers(
         ruby: &Ruby,
@@ -41,67 +43,41 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
             )
         })?;
 
-
-
-            headers.extend(first_row.iter().map(String::from));
+        Ok(if has_headers {
+            first_row.iter().map(String::from).collect()
         } else {
-
-        }
-        Ok(headers)
+            (0..first_row.len()).map(|i| format!("c{i}")).collect()
+        })
     }
 
-
+    /// Creates a new RecordReader instance.
+    pub(crate) fn new(
         reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
         headers: Vec<StringCacheKey>,
-        null_string: Option<
-        flexible_default: Option
+        null_string: Option<Cow<'a, str>>,
+        flexible_default: Option<Cow<'a, str>>,
     ) -> Self {
         let headers_len = headers.len();
         Self {
-
-
-
-
-
-
-            },
+            reader,
+            headers,
+            null_string,
+            flexible_default,
+            string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
+            parser: std::marker::PhantomData,
         }
     }
-}
-
-impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
-    pub(crate) fn new_multi_threaded(
-        mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<StringCacheKey>,
-        buffer_size: usize,
-        null_string: Option<String>,
-        flexible_default: Option<&'static str>,
-    ) -> Self {
-        let (sender, receiver) = kanal::bounded(buffer_size);
-        let headers_for_thread = headers.clone();
 
-
-
-
-
-
-
-
-
-
-
-                if sender.send(row).is_err() {
-                    break;
-                }
-            }
-        });
-
-        Self {
-            inner: ReaderImpl::MultiThreaded {
-                headers,
-                receiver,
-                handle: Some(handle),
-            },
+    /// Attempts to read the next record, returning any errors encountered.
+    fn try_next(&mut self) -> csv::Result<Option<T::Output>> {
+        match self.reader.read_record(&mut self.string_record)? {
+            true => Ok(Some(T::parse(
+                &self.headers,
+                &self.string_record,
+                self.null_string.clone(),
+                self.flexible_default.clone(),
+            ))),
+            false => Ok(None),
         }
     }
 }
@@ -111,63 +87,21 @@ impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
 
     #[inline]
     fn next(&mut self) -> Option<Self::Item> {
-
-
-
-            } => match receiver.recv() {
-                Ok(record) => Some(record),
-                Err(_) => {
-                    if let Some(handle) = handle.take() {
-                        let _ = handle.join();
-                    }
-                    None
-                }
-            },
-            ReaderImpl::SingleThreaded {
-                reader,
-                headers,
-                null_string,
-                flexible_default,
-                ref mut string_record,
-            } => match reader.read_record(string_record) {
-                Ok(true) => Some(T::parse(
-                    headers,
-                    string_record,
-                    null_string.as_deref(),
-                    flexible_default.clone(),
-                )),
-                Ok(false) => None,
-                Err(_e) => None,
-            },
-        }
+        // Note: We intentionally swallow errors here to maintain Iterator contract.
+        // Errors can be handled by using try_next() directly if needed.
+        self.try_next().ok().flatten()
     }
 
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-
-        (0, None)
+        (0, None) // Cannot determine size without reading entire file
    }
 }
 
 impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
     #[inline]
     fn drop(&mut self) {
-
-
-                receiver,
-                handle,
-                headers,
-                ..
-            } => {
-                receiver.close();
-                if let Some(handle) = handle.take() {
-                    let _ = handle.join();
-                }
-                let _ = StringCache::clear(&headers);
-            }
-            ReaderImpl::SingleThreaded { headers, .. } => {
-                let _ = StringCache::clear(&headers);
-            }
-        }
+        // Intentionally ignore errors during cleanup as there's no meaningful way to handle them
+        let _ = StringCache::clear(&self.headers);
     }
 }
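The reader above splits record fetching into a fallible `try_next()` returning `csv::Result<Option<_>>` and an `Iterator::next` that flattens errors to `None`. A simplified, self-contained sketch of that split, with a toy `LineReader` standing in for the csv reader:

```rust
// Illustrative only; `LineReader` is not the gem's type.
struct LineReader {
    lines: Vec<Result<String, String>>, // pretend source; Err models a read/parse error
    pos: usize,
}

impl LineReader {
    /// Callers who care about errors use this path directly.
    fn try_next(&mut self) -> Result<Option<String>, String> {
        match self.lines.get(self.pos) {
            None => Ok(None),
            Some(item) => {
                self.pos += 1;
                item.clone().map(Some)
            }
        }
    }
}

impl Iterator for LineReader {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        // Errors are swallowed here, mirroring the Iterator impl in the diff.
        self.try_next().ok().flatten()
    }
}

fn main() {
    let reader = LineReader {
        lines: vec![Ok("a,b".into()), Err("bad utf-8".into()), Ok("c,d".into())],
        pos: 0,
    };
    // Iteration stops at the error because next() maps Err to None.
    let rows: Vec<String> = reader.collect();
    assert_eq!(rows, vec!["a,b".to_string()]);
}
```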
data/ext/osv/src/csv/ruby_integration.rs
CHANGED
@@ -1,30 +1,19 @@
-use std::{
+use std::{
+    fs::File,
+    io::{self, Read, Seek, SeekFrom},
+    mem::ManuallyDrop,
+};
 
 pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
 
-impl
+impl Read for ForgottenFileHandle {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         self.0.read(buf)
     }
+}
 
-
-
-
-
-    // fn read_buf(&mut self, cursor: BorrowedCursor<'_>) -> io::Result<()> {
-    //     self.0.read_buf(cursor)
-    // }
-
-    // #[inline]
-    // fn is_read_vectored(&self) -> bool {
-    //     self.0.is_read_vectored()
-    // }
-
-    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
-        self.0.read_to_end(buf)
-    }
-
-    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
-        self.0.read_to_string(buf)
+impl Seek for ForgottenFileHandle {
+    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
+        self.0.seek(pos)
     }
 }
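`ForgottenFileHandle` wraps the `File` built from Ruby's raw descriptor in `ManuallyDrop`, so dropping the wrapper never closes an fd that the Ruby `IO` object still owns. A runnable sketch of the idea, assuming a Unix-like `/dev/null` and duplicating an fd the example opens itself so it stays safe to run (the extension instead builds the `File` from Ruby's descriptor):

```rust
// Illustrative sketch; `BorrowedFile` is a stand-in for ForgottenFileHandle.
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom};
use std::mem::ManuallyDrop;

struct BorrowedFile(ManuallyDrop<File>);

impl Read for BorrowedFile {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.0.read(buf)
    }
}

impl Seek for BorrowedFile {
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        self.0.seek(pos)
    }
}

fn main() -> io::Result<()> {
    let owner = File::open("/dev/null")?; // stand-in for the descriptor owned by Ruby
    // The duplicated descriptor is intentionally "forgotten" here, mirroring the wrapper.
    let borrowed = BorrowedFile(ManuallyDrop::new(owner.try_clone()?));
    drop(borrowed); // the wrapper is gone, but ManuallyDrop prevented any close(2)
    // `owner` is still usable and is closed normally when it goes out of scope.
    let _ = owner.metadata()?;
    Ok(())
}
```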
data/ext/osv/src/csv/ruby_reader.rs
CHANGED
@@ -2,9 +2,12 @@ use magnus::{
     value::{Opaque, ReprValue},
     RClass, RString, Ruby, Value,
 };
-use std::
+use std::fs::File;
+use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
 use std::sync::OnceLock;
 
+use super::ForgottenFileHandle;
+
 static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
 
 /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
@@ -17,6 +20,10 @@ pub struct RubyReader<T> {
 pub trait SeekableRead: std::io::Read + Seek {}
 impl SeekableRead for RubyReader<Value> {}
 impl SeekableRead for RubyReader<RString> {}
+impl SeekableRead for File {}
+impl<T: Read + Seek> SeekableRead for BufReader<T> {}
+impl SeekableRead for std::io::Cursor<Vec<u8>> {}
+impl SeekableRead for ForgottenFileHandle {}
 
 pub fn build_ruby_reader(
     ruby: &Ruby,
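`SeekableRead` above is a marker trait (`Read + Seek` supertraits, no methods of its own) that lets files, in-memory cursors, buffered readers, and the forgotten file handle all sit behind one `Box<dyn SeekableRead>`. A self-contained sketch of the same pattern, using an illustrative `SeekableSource` name:

```rust
// Sketch only; the trait and function names here are not the gem's API.
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek};

trait SeekableSource: Read + Seek {}

// Each concrete source type opts in explicitly, as in the diff.
impl SeekableSource for File {}
impl SeekableSource for Cursor<Vec<u8>> {}
impl<T: Read + Seek> SeekableSource for BufReader<T> {}

fn open_in_memory(bytes: Vec<u8>) -> Box<dyn SeekableSource> {
    Box::new(Cursor::new(bytes))
}

fn main() -> std::io::Result<()> {
    let mut src = open_in_memory(b"a,b\n1,2\n".to_vec());
    let mut text = String::new();
    src.read_to_string(&mut text)?;
    // Rewinding works because the trait object is also Seek.
    src.rewind()?;
    assert_eq!(text, "a,b\n1,2\n");
    Ok(())
}
```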
data/ext/osv/src/reader.rs
CHANGED
@@ -1,4 +1,4 @@
-use crate::csv::{
+use crate::csv::{CowStr, CsvRecord, RecordReaderBuilder, StringCacheKey};
 use crate::utils::*;
 use ahash::RandomState;
 use csv::Trim;
@@ -6,12 +6,49 @@ use magnus::value::ReprValue;
 use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
 use std::collections::HashMap;
 
+/// Valid result types for CSV parsing
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum ResultType {
+    Hash,
+    Array,
+}
+
+impl ResultType {
+    fn from_str(s: &str) -> Option<Self> {
+        match s {
+            "hash" => Some(Self::Hash),
+            "array" => Some(Self::Array),
+            _ => None,
+        }
+    }
+}
+
+/// Arguments for creating an enumerator
+#[derive(Debug)]
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    has_headers: bool,
+    delimiter: u8,
+    quote_char: u8,
+    null_string: Option<String>,
+    result_type: String,
+    flexible: bool,
+    flexible_default: Option<String>,
+    trim: Option<String>,
+}
+
+/// Parses a CSV file with the given configuration.
+///
+/// # Safety
+/// This function uses unsafe code to get the Ruby runtime and leak memory for static references.
+/// This is necessary for Ruby integration but should be used with caution.
 pub fn parse_csv(
     rb_self: Value,
     args: &[Value],
 ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
-
-    let ruby
+    // SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized
+    let ruby = unsafe { Ruby::get_unchecked() };
 
     let ReadCsvArgs {
         to_read,
@@ -19,16 +56,11 @@ pub fn parse_csv(
         delimiter,
         quote_char,
         null_string,
-        buffer_size,
         result_type,
         flexible,
         flexible_default,
         trim,
-    } = parse_read_csv_args(ruby, args)?;
-
-    let flexible_default: &'static Option<String> = Box::leak(Box::new(flexible_default));
-    let leaked_flexible_default: &'static Option<&str> =
-        Box::leak(Box::new(flexible_default.as_deref()));
+    } = parse_read_csv_args(&ruby, args)?;
 
     if !ruby.block_given() {
         return create_enumerator(EnumeratorArgs {
@@ -38,10 +70,9 @@ pub fn parse_csv(
             delimiter,
             quote_char,
             null_string,
-
-            result_type,
+            result_type: result_type,
             flexible,
-            flexible_default:
+            flexible_default: flexible_default,
             trim: match trim {
                 Trim::All => Some("all".to_string()),
                 Trim::Headers => Some("headers".to_string()),
@@ -51,60 +82,47 @@ pub fn parse_csv(
         });
     }
 
-    let
-
+    let result_type = ResultType::from_str(&result_type).ok_or_else(|| {
+        Error::new(
+            ruby.exception_runtime_error(),
+            "Invalid result type, expected 'hash' or 'array'",
+        )
+    })?;
+
+    let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type {
+        ResultType::Hash => {
             let builder = RecordReaderBuilder::<
-                HashMap<StringCacheKey, Option<
+                HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
             >::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)
-            .flexible_default(flexible_default
+            .flexible_default(flexible_default)
             .trim(trim)
             .delimiter(delimiter)
             .quote_char(quote_char)
-            .null_string(null_string)
-            .buffer(buffer_size);
+            .null_string(null_string);
 
-            Box::new(builder.
+            Box::new(builder.build()?.map(CsvRecord::Map))
         }
-
-            RecordReaderBuilder::<Vec<Option<
+        ResultType::Array => {
+            let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)
-            .flexible_default(flexible_default
+            .flexible_default(flexible_default)
             .trim(trim)
             .delimiter(delimiter)
             .quote_char(quote_char)
             .null_string(null_string)
-            .
-
-
-            ),
-        _ => {
-            return Err(Error::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
+            .build()?;
+
+            Box::new(builder.map(CsvRecord::Vec))
         }
     };
 
     Ok(Yield::Iter(iter))
 }
 
-
-    rb_self: Value,
-    to_read: Value,
-    has_headers: bool,
-    delimiter: u8,
-    quote_char: u8,
-    null_string: Option<String>,
-    buffer_size: usize,
-    result_type: String,
-    flexible: bool,
-    flexible_default: Option<&'static str>,
-    trim: Option<String>,
-}
-
+/// Creates an enumerator for lazy CSV parsing
 fn create_enumerator(
     args: EnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
@@ -119,11 +137,11 @@ fn create_enumerator(
         String::from_utf8(vec![args.quote_char]).unwrap(),
     )?;
     kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
-    kwargs.aset(Symbol::new("buffer_size"), args.buffer_size)?;
     kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
     kwargs.aset(Symbol::new("flexible"), args.flexible)?;
     kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
     kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
+
     let enumerator = args
         .rb_self
         .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
data/ext/osv/src/utils.rs
CHANGED
@@ -4,8 +4,6 @@ use magnus::{
     Error, RString, Ruby, Symbol, Value,
 };
 
-use crate::csv::BUFFER_CHANNEL_SIZE;
-
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
     if value.is_nil() {
         Ok(None)
@@ -34,7 +32,6 @@ pub struct ReadCsvArgs {
     pub delimiter: u8,
     pub quote_char: u8,
     pub null_string: Option<String>,
-    pub buffer_size: usize,
     pub result_type: String,
     pub flexible: bool,
     pub flexible_default: Option<String>,
@@ -50,15 +47,14 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         _,
         (),
         (
-            Option<bool
-            Option<String
-            Option<String>,
+            Option<Option<bool>>,
+            Option<Option<String>>,
             Option<Option<String>>,
-            Option<usize>,
-            Option<Value>,
-            Option<bool>,
             Option<Option<String>>,
-            Option<Value
+            Option<Option<Value>>,
+            Option<Option<bool>>,
+            Option<Option<Option<String>>>,
+            Option<Option<Value>>,
         ),
         (),
     >(
@@ -69,7 +65,6 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
             "col_sep",
             "quote_char",
             "nil_string",
-            "buffer_size",
             "result_type",
             "flexible",
             "flexible_default",
@@ -77,11 +72,12 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         ],
     )?;
 
-    let has_headers = kwargs.optional.0.unwrap_or(true);
+    let has_headers = kwargs.optional.0.flatten().unwrap_or(true);
 
     let delimiter = *kwargs
         .optional
         .1
+        .flatten()
         .unwrap_or_else(|| ",".to_string())
         .as_bytes()
         .first()
@@ -95,6 +91,7 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
     let quote_char = *kwargs
         .optional
         .2
+        .flatten()
         .unwrap_or_else(|| "\"".to_string())
         .as_bytes()
         .first()
@@ -107,11 +104,10 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
 
     let null_string = kwargs.optional.3.unwrap_or_default();
 
-    let buffer_size = kwargs.optional.4.unwrap_or(BUFFER_CHANNEL_SIZE);
-
     let result_type = match kwargs
         .optional
-        .
+        .4
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -133,13 +129,14 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         None => String::from("hash"),
     };
 
-    let flexible = kwargs.optional.
+    let flexible = kwargs.optional.5.flatten().unwrap_or_default();
 
-    let flexible_default = kwargs.optional.
+    let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
 
     let trim = match kwargs
         .optional
-        .
+        .7
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -172,7 +169,6 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         delimiter,
         quote_char,
         null_string,
-        buffer_size,
         result_type,
         flexible,
         flexible_default,
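The kwargs above switch from `Option<T>` to `Option<Option<T>>` with a `.flatten()` at each use site: the outer `Option` records whether the keyword was passed at all, the inner one whether it was passed as `nil`, and both fall back to the default. A tiny sketch of that resolution, using an illustrative helper name:

```rust
// Sketch only; `resolve_has_headers` is not part of the gem's API.
fn resolve_has_headers(kwarg: Option<Option<bool>>) -> bool {
    // None          => keyword omitted
    // Some(None)    => keyword passed explicitly as nil
    // Some(Some(v)) => keyword passed with a value
    kwarg.flatten().unwrap_or(true)
}

fn main() {
    assert!(resolve_has_headers(None));               // omitted -> default true
    assert!(resolve_has_headers(Some(None)));         // explicit nil -> default true
    assert!(!resolve_has_headers(Some(Some(false)))); // explicit false is respected
    println!("kwarg handling ok");
}
```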
data/lib/osv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.16
+  version: 0.3.18
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys