RubyGems - osv - Versions diffs - 0.3.15 → 0.3.17 - Mend

osv 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/Cargo.lock +11 -1
data/README.md +27 -27
data/ext/osv/Cargo.toml +1 -0
data/ext/osv/src/csv/builder.rs +92 -85
data/ext/osv/src/csv/header_cache.rs +105 -26
data/ext/osv/src/csv/mod.rs +2 -2
data/ext/osv/src/csv/parser.rs +22 -85
data/ext/osv/src/csv/record.rs +25 -8
data/ext/osv/src/csv/record_reader.rs +53 -118
data/ext/osv/src/csv/ruby_integration.rs +10 -21
data/ext/osv/src/csv/ruby_reader.rs +9 -4
data/ext/osv/src/reader.rs +64 -46
data/ext/osv/src/utils.rs +4 -12
data/lib/osv/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 872cf06d1389f45f77b4eefc178cc8462ab165b833ab2c5bf4dc7f92e1c8308e
-  data.tar.gz: 84e6c5d0e03389966b8882a5a73f1698ddee3ed0edae24f2fd5b7f257935a98e
+  metadata.gz: 4469c67b2a39d9ffa23923e36cd894eac415ca004a432e700102a334af11efd8
+  data.tar.gz: 8dee3117fe6511b9c5b6005ae37d991891e0f314508986743b659080c7885855
 SHA512:
-  metadata.gz: 445581447e8f5ec336da7843af715a5f5fbc298232a24f303a22eebb844f83f65ecc2e85d877a448119adae9e6a5529e377d87399a36e6f070562fa4ce0a11b7
-  data.tar.gz: '08f417b19b0549aa4a3db1538e4be413c5ec8faa3bd18e4c101a6fc3ea3e9496d04c30e39ea8eec9cc0cc3a38f8f83f7c2274e09c75259a26f3609620cf07a80'
+  metadata.gz: d8c94dc1c576cca0043c7501752bdd6dee0c8bf0523d9c99a0e8ab4d614a0eb4e6f087fa62be97bb5816f9998f2c414758ffcab260e90889afada8379fb03aec
+  data.tar.gz: c51ece65a713af0b351a183415816302fcdc35ad598d0e5ee9e5b693c1ef66826c5dfc3dab90f04499c90e994e590e6dd7121999b5dfe54ce20997e41df0ac02

data/Cargo.lock CHANGED Viewed

@@ -45,7 +45,7 @@ dependencies = [
  "bitflags",
  "cexpr",
  "clang-sys",
- "itertools",
+ "itertools 0.12.1",
  "lazy_static",
  "lazycell",
  "proc-macro2",
@@ -175,6 +175,15 @@ dependencies = [
  "either",
 ]
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
 [[package]]
 name = "itoa"
 version = "1.0.14"
@@ -347,6 +356,7 @@ dependencies = [
  "ahash",
  "csv",
  "flate2",
+ "itertools 0.14.0",
  "jemallocator",
  "kanal",
  "magnus 0.7.1",

data/README.md CHANGED Viewed

@@ -121,7 +121,7 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
 ### 1,000,000 records
 ```
-🏃  Running benchmarks...
+🏃 Running benchmarks...
 Benchmarking with 3000001 lines of data
 ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
@@ -142,34 +142,34 @@ OSV - Gzipped Direct     1.000 i/100ms
    FastCSV - Gzipped     1.000 i/100ms
        CSV - Gzipped     1.000 i/100ms
 Calculating -------------------------------------
-      CSV - StringIO      0.079 (± 0.0%) i/s    (12.69 s/i) -      3.000 in  38.139709s
-  FastCSV - StringIO      0.370 (± 0.0%) i/s     (2.71 s/i) -     12.000 in  32.474164s
-      OSV - StringIO      0.635 (± 0.0%) i/s     (1.58 s/i) -     19.000 in  30.772490s
-   CSV - Hash output      0.058 (± 0.0%) i/s    (17.11 s/i) -      2.000 in  34.212335s
-   OSV - Hash output      0.249 (± 0.0%) i/s     (4.01 s/i) -      8.000 in  32.124319s
-  CSV - Array output      0.066 (± 0.0%) i/s    (15.11 s/i) -      2.000 in  30.212137s
-  OSV - Array output      0.665 (± 0.0%) i/s     (1.50 s/i) -     20.000 in  30.813986s
+      CSV - StringIO      0.080 (± 0.0%) i/s    (12.43 s/i) -      3.000 in  37.301114s
+  FastCSV - StringIO      0.368 (± 0.0%) i/s     (2.72 s/i) -     12.000 in  32.619020s
+      OSV - StringIO      0.699 (± 0.0%) i/s     (1.43 s/i) -     21.000 in  30.091225s
+   CSV - Hash output      0.059 (± 0.0%) i/s    (16.95 s/i) -      2.000 in  33.908533s
+   OSV - Hash output      0.329 (± 0.0%) i/s     (3.04 s/i) -     10.000 in  30.551275s
+  CSV - Array output      0.066 (± 0.0%) i/s    (15.18 s/i) -      2.000 in  30.357327s
+  OSV - Array output      0.632 (± 0.0%) i/s     (1.58 s/i) -     19.000 in  30.150113s
 FastCSV - Array output
-                          0.351 (± 0.0%) i/s     (2.85 s/i) -     11.000 in  31.418786s
+                          0.350 (± 0.0%) i/s     (2.86 s/i) -     11.000 in  31.477268s
 OSV - Direct Open Array output
-                          0.713 (± 0.0%) i/s     (1.40 s/i) -     22.000 in  30.938525s
-       OSV - Gzipped      0.506 (± 0.0%) i/s     (1.98 s/i) -     16.000 in  31.709708s
-OSV - Gzipped Direct      0.685 (± 0.0%) i/s     (1.46 s/i) -     21.000 in  31.145435s
-   FastCSV - Gzipped      0.324 (± 0.0%) i/s     (3.09 s/i) -     10.000 in  30.983582s
-       CSV - Gzipped      0.057 (± 0.0%) i/s    (17.69 s/i) -      2.000 in  35.379009s
+                          0.641 (± 0.0%) i/s     (1.56 s/i) -     20.000 in  31.275201s
+       OSV - Gzipped      0.530 (± 0.0%) i/s     (1.89 s/i) -     16.000 in  30.183753s
+OSV - Gzipped Direct      0.727 (± 0.0%) i/s     (1.37 s/i) -     22.000 in  30.283991s
+   FastCSV - Gzipped      0.323 (± 0.0%) i/s     (3.09 s/i) -     10.000 in  30.949600s
+       CSV - Gzipped      0.056 (± 0.0%) i/s    (17.72 s/i) -      2.000 in  35.440473s
 Comparison:
-OSV - Direct Open Array output:        0.7 i/s
-OSV - Gzipped Direct:        0.7 i/s - 1.04x  slower
-  OSV - Array output:        0.7 i/s - 1.07x  slower
-      OSV - StringIO:        0.6 i/s - 1.12x  slower
-       OSV - Gzipped:        0.5 i/s - 1.41x  slower
-  FastCSV - StringIO:        0.4 i/s - 1.93x  slower
-FastCSV - Array output:        0.4 i/s - 2.03x  slower
-   FastCSV - Gzipped:        0.3 i/s - 2.20x  slower
-   OSV - Hash output:        0.2 i/s - 2.86x  slower
-      CSV - StringIO:        0.1 i/s - 9.05x  slower
-  CSV - Array output:        0.1 i/s - 10.77x  slower
-   CSV - Hash output:        0.1 i/s - 12.20x  slower
-       CSV - Gzipped:        0.1 i/s - 12.61x  slower
+OSV - Gzipped Direct:        0.7 i/s
+      OSV - StringIO:        0.7 i/s - 1.04x  slower
+OSV - Direct Open Array output:        0.6 i/s - 1.14x  slower
+  OSV - Array output:        0.6 i/s - 1.15x  slower
+       OSV - Gzipped:        0.5 i/s - 1.37x  slower
+  FastCSV - StringIO:        0.4 i/s - 1.98x  slower
+FastCSV - Array output:        0.3 i/s - 2.08x  slower
+   OSV - Hash output:        0.3 i/s - 2.21x  slower
+   FastCSV - Gzipped:        0.3 i/s - 2.25x  slower
+      CSV - StringIO:        0.1 i/s - 9.04x  slower
+  CSV - Array output:        0.1 i/s - 11.04x  slower
+   CSV - Hash output:        0.1 i/s - 12.33x  slower
+       CSV - Gzipped:        0.1 i/s - 12.89x  slower
 ```

data/ext/osv/Cargo.toml CHANGED Viewed

@@ -16,6 +16,7 @@ rb-sys = "^0.9"
 serde = { version = "1.0", features = ["derive"] }
 serde_magnus = "0.8.1"
 thiserror = "2.0"
+itertools = "^0.14"
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

data/ext/osv/src/csv/builder.rs CHANGED Viewed

@@ -6,8 +6,10 @@ use super::{
     ForgottenFileHandle,
 };
 use flate2::read::GzDecoder;
-use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, Ruby, Value};
+use magnus::{rb_sys::AsRawValue, value::ReprValue, Error as MagnusError, RString, Ruby, Value};
 use std::{
+    borrow::Cow,
+    fmt::Debug,
     fs::File,
     io::{self, BufReader, Read},
     marker::PhantomData,
@@ -17,18 +19,21 @@ use std::{
 use thiserror::Error;
-pub(crate) static BUFFER_CHANNEL_SIZE: usize = 1024;
+/// Errors that can occur when building a RecordReader
 #[derive(Error, Debug)]
 pub enum ReaderError {
     #[error("Failed to get file descriptor: {0}")]
     FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
+    #[error("Invalid file descriptor: {0}")]
+    InvalidFileDescriptor(i32),
     #[error("Failed to open file: {0}")]
     FileOpen(#[from] io::Error),
     #[error("Failed to intern headers: {0}")]
     HeaderIntern(#[from] CacheError),
+    #[error("Invalid flexible default value: {0}")]
+    InvalidFlexibleDefault(String),
+    #[error("Invalid null string value: {0}")]
+    InvalidNullString(String),
     #[error("Ruby error: {0}")]
     Ruby(String),
 }
@@ -48,63 +53,27 @@ impl From<ReaderError> for MagnusError {
     }
 }
-pub struct RecordReaderBuilder<'a, T: RecordParser<'a> + Send> {
-    ruby: &'a Ruby,
+/// Builder for configuring and creating a RecordReader instance.
+///
+/// This struct provides a fluent interface for setting up CSV parsing options
+/// and creating a RecordReader with the specified configuration.
+pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
+    ruby: Ruby,
     to_read: Value,
     has_headers: bool,
     delimiter: u8,
     quote_char: u8,
     null_string: Option<String>,
-    buffer: usize,
     flexible: bool,
-    flexible_default: Option<&'a str>,
+    flexible_default: Option<String>,
     trim: csv::Trim,
     _phantom: PhantomData<T>,
+    _phantom_a: PhantomData<&'a ()>,
 }
-impl<T: RecordParser<'static> + Send + 'static> RecordReaderBuilder<'static, T> {
-    fn build_multi_threaded(
-        self,
-        readable: Box<dyn Read + Send + 'static>,
-    ) -> Result<RecordReader<'static, T>, ReaderError> {
-        let flexible = self.flexible || self.flexible_default.is_some();
-        let mut reader = csv::ReaderBuilder::new()
-            .has_headers(self.has_headers)
-            .delimiter(self.delimiter)
-            .quote(self.quote_char)
-            .flexible(flexible)
-            .trim(self.trim)
-            .from_reader(readable);
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
-        let static_headers = StringCache::intern_many(&headers)?;
-        Ok(RecordReader::new_multi_threaded(
-            reader,
-            static_headers,
-            self.buffer,
-            self.null_string,
-            self.flexible_default,
-        ))
-    }
-    pub fn build_threaded(self) -> Result<RecordReader<'static, T>, ReaderError> {
-        if self.to_read.is_kind_of(self.ruby.class_io()) {
-            let readable = self.handle_file_descriptor()?;
-            self.build_multi_threaded(readable)
-        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
-            let readable = self.handle_file_path()?;
-            self.build_multi_threaded(readable)
-        } else {
-            let readable = build_ruby_reader(self.ruby, self.to_read)?;
-            let buffered_reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
-            self.build_single_threaded(buffered_reader)
-        }
-    }
-}
-impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
-    pub fn new(ruby: &'a Ruby, to_read: Value) -> Self {
+impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
+    /// Creates a new builder instance with default settings.
+    pub fn new(ruby: Ruby, to_read: Value) -> Self {
         Self {
             ruby,
             to_read,
@@ -112,92 +81,107 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             delimiter: b',',
             quote_char: b'"',
             null_string: None,
-            buffer: BUFFER_CHANNEL_SIZE,
             flexible: false,
             flexible_default: None,
             trim: csv::Trim::None,
             _phantom: PhantomData,
+            _phantom_a: PhantomData,
         }
     }
+    /// Sets whether the CSV file has headers.
+    #[must_use]
     pub fn has_headers(mut self, has_headers: bool) -> Self {
         self.has_headers = has_headers;
         self
     }
+    /// Sets the delimiter character for the CSV.
+    #[must_use]
     pub fn delimiter(mut self, delimiter: u8) -> Self {
         self.delimiter = delimiter;
         self
     }
+    /// Sets the quote character for the CSV.
+    #[must_use]
     pub fn quote_char(mut self, quote_char: u8) -> Self {
         self.quote_char = quote_char;
         self
     }
+    /// Sets the string that should be interpreted as null.
+    #[must_use]
     pub fn null_string(mut self, null_string: Option<String>) -> Self {
         self.null_string = null_string;
         self
     }
-    pub fn buffer(mut self, buffer: usize) -> Self {
-        self.buffer = buffer;
-        self
-    }
+    /// Sets whether the reader should be flexible with field counts.
+    #[must_use]
     pub fn flexible(mut self, flexible: bool) -> Self {
         self.flexible = flexible;
         self
     }
-    pub fn flexible_default(mut self, flexible_default: Option<&'a str>) -> Self {
+    /// Sets the default value for missing fields when in flexible mode.
+    #[must_use]
+    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
         self.flexible_default = flexible_default;
         self
     }
+    /// Sets the trimming mode for fields.
+    #[must_use]
     pub fn trim(mut self, trim: csv::Trim) -> Self {
         self.trim = trim;
         self
     }
-    fn handle_file_descriptor(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+    /// Handles reading from a file descriptor.
+    fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let raw_value = self.to_read.as_raw();
         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
+            .map_err(|e| ReaderError::FileDescriptor(format!("{:?}", e)))?;
         if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor);
+            return Err(ReaderError::InvalidFileDescriptor(fd));
         }
         let file = unsafe { File::from_raw_fd(fd) };
         let forgotten = ForgottenFileHandle(ManuallyDrop::new(file));
-        Ok(Box::new(BufReader::with_capacity(
-            READ_BUFFER_SIZE,
-            forgotten,
-        )))
+        Ok(Box::new(forgotten))
     }
-    fn handle_file_path(&self) -> Result<Box<dyn Read + Send + 'static>, ReaderError> {
+    /// Handles reading from a file path.
+    fn handle_file_path(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let path = self.to_read.to_r_string()?.to_string()?;
         let file = File::open(&path)?;
-        Ok(if path.ends_with(".gz") {
-            Box::new(GzDecoder::new(BufReader::with_capacity(
-                READ_BUFFER_SIZE,
-                file,
-            )))
+        if path.ends_with(".gz") {
+            // For gzipped files, we need to decompress them into memory first
+            // since GzDecoder doesn't support seeking
+            let mut decoder = GzDecoder::new(BufReader::with_capacity(READ_BUFFER_SIZE, file));
+            let mut contents = Vec::new();
+            decoder.read_to_end(&mut contents)?;
+            Ok(Box::new(std::io::Cursor::new(contents)))
         } else {
-            Box::new(BufReader::with_capacity(READ_BUFFER_SIZE, file))
-        })
+            Ok(Box::new(file))
+        }
     }
-    fn build_single_threaded(
-        self,
-        readable: BufReader<Box<dyn SeekableRead>>,
-    ) -> Result<RecordReader<'a, T>, ReaderError> {
+    /// Builds the RecordReader with the configured options.
+    pub fn build(self) -> Result<RecordReader<'a, T>, ReaderError> {
+        let readable = if self.to_read.is_kind_of(self.ruby.class_io()) {
+            self.handle_file_descriptor()?
+        } else if self.to_read.is_kind_of(self.ruby.class_string()) {
+            self.handle_file_path()?
+        } else {
+            build_ruby_reader(&self.ruby, self.to_read)?
+        };
         let flexible = self.flexible || self.flexible_default.is_some();
+        let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
         let mut reader = csv::ReaderBuilder::new()
             .has_headers(self.has_headers)
@@ -205,16 +189,39 @@ impl<'a, T: RecordParser<'a> + Send> RecordReaderBuilder<'a, T> {
             .quote(self.quote_char)
             .flexible(flexible)
             .trim(self.trim)
-            .from_reader(readable);
+            .from_reader(reader);
-        let headers = RecordReader::<T>::get_headers(self.ruby, &mut reader, self.has_headers)?;
+        let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
         let static_headers = StringCache::intern_many(&headers)?;
-        Ok(RecordReader::new_single_threaded(
+        // We intern both of these to get static string references we can reuse throughout the parser.
+        let flexible_default = self
+            .flexible_default
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+        let null_string = self
+            .null_string
+            .map(|s| {
+                RString::new(&s)
+                    .to_interned_str()
+                    .as_str()
+                    .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
+            })
+            .transpose()?
+            .map(|s| Cow::Borrowed(s));
+        Ok(RecordReader::new(
             reader,
             static_headers,
-            self.null_string,
-            self.flexible_default,
+            null_string,
+            flexible_default,
         ))
     }
 }

data/ext/osv/src/csv/header_cache.rs CHANGED Viewed

@@ -6,8 +6,14 @@
 /// so this optimization could be removed if any issues arise.
 use std::{
     collections::HashMap,
-    sync::{atomic::AtomicU32, LazyLock, Mutex},
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        LazyLock, Mutex, OnceLock,
+    },
 };
+use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 use thiserror::Error;
 #[derive(Debug, Error)]
@@ -16,66 +22,139 @@ pub enum CacheError {
     LockError(String),
 }
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 pub struct StringCache;
+#[derive(Copy, Clone)]
+pub struct StringCacheKey(Opaque<FString>, &'static str);
+impl StringCacheKey {
+    pub fn new(string: &str) -> Self {
+        let rstr = RString::new(string);
+        let fstr = rstr.to_interned_str();
+        Self(Opaque::from(fstr), fstr.as_str().unwrap())
+    }
+}
+impl AsRef<str> for StringCacheKey {
+    fn as_ref(&self) -> &'static str {
+        self.1
+    }
+}
+impl IntoValue for StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+impl std::fmt::Debug for StringCacheKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.1.fmt(f)
+    }
+}
+impl PartialEq for StringCacheKey {
+    fn eq(&self, other: &Self) -> bool {
+        self.1 == other.1
+    }
+}
+impl std::cmp::Eq for StringCacheKey {}
+impl std::hash::Hash for StringCacheKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.1.hash(state);
+    }
+}
 impl StringCache {
     #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-            Ok(existing)
+        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+            counter.fetch_add(1, Ordering::Relaxed);
+            Ok(*interned_string)
         } else {
+            let interned = StringCacheKey::new(string.as_str());
             let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, AtomicU32::new(1));
-            Ok(leaked)
+            cache.insert(leaked, (interned, AtomicU32::new(1)));
+            Ok(interned)
         }
     }
-    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        let mut result = Vec::with_capacity(strings.len());
+        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
         for string in strings {
-            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-                result.push(existing);
+            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+                counter.fetch_add(1, Ordering::Relaxed);
+                result.push(*interned_string);
             } else {
+                let interned = StringCacheKey::new(&string);
                 let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, AtomicU32::new(1));
-                result.push(leaked);
+                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                result.push(interned);
             }
         }
         Ok(result)
     }
-    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        for header in headers {
-            if let Some(count) = cache.get(header) {
-                // Returns the previous value of the counter
-                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
-                if was == 1 {
-                    cache.remove(header);
-                    let ptr = *header as *const str as *mut str;
-                    unsafe {
-                        let _ = Box::from_raw(ptr);
+        let to_remove: Vec<_> = headers
+            .iter()
+            .filter_map(|header| {
+                let key = header.as_ref();
+                if let Some((_, (_, counter))) = cache.get_key_value(key) {
+                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
+                    if prev_count == 1 {
+                        Some(key)
+                    } else {
+                        None
                     }
+                } else {
+                    None
                 }
-            }
+            })
+            .collect();
+        for key in to_remove {
+            cache.remove(key);
         }
         Ok(())
     }
 }
+pub struct HeaderCacheCleanupIter<I> {
+    pub inner: I,
+    pub headers: OnceLock<Vec<StringCacheKey>>,
+}
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}

data/ext/osv/src/csv/mod.rs CHANGED Viewed

@@ -7,7 +7,7 @@ mod ruby_integration;
 mod ruby_reader;
 pub use builder::RecordReaderBuilder;
-pub(crate) use builder::BUFFER_CHANNEL_SIZE;
-pub use record::CowValue;
+pub use header_cache::StringCacheKey;
+pub use record::CowStr;
 pub use record::CsvRecord;
 pub use ruby_integration::*;