RubyGems - osv - Versions diffs - 0.3.15 → 0.3.16 - Mend

osv 0.3.15 → 0.3.16

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/Cargo.lock +11 -1
data/README.md +27 -27
data/ext/osv/Cargo.toml +1 -0
data/ext/osv/src/csv/header_cache.rs +79 -26
data/ext/osv/src/csv/mod.rs +1 -0
data/ext/osv/src/csv/parser.rs +7 -6
data/ext/osv/src/csv/record.rs +22 -5
data/ext/osv/src/csv/record_reader.rs +7 -6
data/ext/osv/src/csv/ruby_reader.rs +1 -3
data/ext/osv/src/reader.rs +2 -2
data/lib/osv/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 872cf06d1389f45f77b4eefc178cc8462ab165b833ab2c5bf4dc7f92e1c8308e
-  data.tar.gz: 84e6c5d0e03389966b8882a5a73f1698ddee3ed0edae24f2fd5b7f257935a98e
+  metadata.gz: 91401989a8532162a9731fed3cb07661c0676105f77465da23f9a267773e7651
+  data.tar.gz: aeba48f1338a4160044e8c7264f80eb065d950567288bded39acf5d9bc593d7b
 SHA512:
-  metadata.gz: 445581447e8f5ec336da7843af715a5f5fbc298232a24f303a22eebb844f83f65ecc2e85d877a448119adae9e6a5529e377d87399a36e6f070562fa4ce0a11b7
-  data.tar.gz: '08f417b19b0549aa4a3db1538e4be413c5ec8faa3bd18e4c101a6fc3ea3e9496d04c30e39ea8eec9cc0cc3a38f8f83f7c2274e09c75259a26f3609620cf07a80'
+  metadata.gz: 8d2ea3f724a6f7af317bb1ae865513c15f2ef0e475b070e7f9ae2e1b4155b2d82090387beb0c6a2e5cb8664b1f6dd0cf61e6ad9545957bc3ada1a3e87758b1ee
+  data.tar.gz: 0eaa86241092c14f4c2973d74e65877b7f3f87487a2681b9a094054f98db759772bcf012ec2f4fa073bd16f2b02927212b13afec484f84daf764d3b3e0811b6b

data/Cargo.lock CHANGED Viewed

@@ -45,7 +45,7 @@ dependencies = [
  "bitflags",
  "cexpr",
  "clang-sys",
- "itertools",
+ "itertools 0.12.1",
  "lazy_static",
  "lazycell",
  "proc-macro2",
@@ -175,6 +175,15 @@ dependencies = [
  "either",
 ]
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
 [[package]]
 name = "itoa"
 version = "1.0.14"
@@ -347,6 +356,7 @@ dependencies = [
  "ahash",
  "csv",
  "flate2",
+ "itertools 0.14.0",
  "jemallocator",
  "kanal",
  "magnus 0.7.1",

data/README.md CHANGED Viewed

@@ -121,7 +121,7 @@ Here's some unscientific benchmarks. You can find the code in the [benchmark/com
 ### 1,000,000 records
 ```
-🏃  Running benchmarks...
+🏃 Running benchmarks...
 Benchmarking with 3000001 lines of data
 ruby 3.3.6 (2024-11-05 revision 75015d4c1f) +YJIT [arm64-darwin24]
@@ -142,34 +142,34 @@ OSV - Gzipped Direct     1.000 i/100ms
    FastCSV - Gzipped     1.000 i/100ms
        CSV - Gzipped     1.000 i/100ms
 Calculating -------------------------------------
-      CSV - StringIO      0.079 (± 0.0%) i/s    (12.69 s/i) -      3.000 in  38.139709s
-  FastCSV - StringIO      0.370 (± 0.0%) i/s     (2.71 s/i) -     12.000 in  32.474164s
-      OSV - StringIO      0.635 (± 0.0%) i/s     (1.58 s/i) -     19.000 in  30.772490s
-   CSV - Hash output      0.058 (± 0.0%) i/s    (17.11 s/i) -      2.000 in  34.212335s
-   OSV - Hash output      0.249 (± 0.0%) i/s     (4.01 s/i) -      8.000 in  32.124319s
-  CSV - Array output      0.066 (± 0.0%) i/s    (15.11 s/i) -      2.000 in  30.212137s
-  OSV - Array output      0.665 (± 0.0%) i/s     (1.50 s/i) -     20.000 in  30.813986s
+      CSV - StringIO      0.080 (± 0.0%) i/s    (12.43 s/i) -      3.000 in  37.301114s
+  FastCSV - StringIO      0.368 (± 0.0%) i/s     (2.72 s/i) -     12.000 in  32.619020s
+      OSV - StringIO      0.699 (± 0.0%) i/s     (1.43 s/i) -     21.000 in  30.091225s
+   CSV - Hash output      0.059 (± 0.0%) i/s    (16.95 s/i) -      2.000 in  33.908533s
+   OSV - Hash output      0.329 (± 0.0%) i/s     (3.04 s/i) -     10.000 in  30.551275s
+  CSV - Array output      0.066 (± 0.0%) i/s    (15.18 s/i) -      2.000 in  30.357327s
+  OSV - Array output      0.632 (± 0.0%) i/s     (1.58 s/i) -     19.000 in  30.150113s
 FastCSV - Array output
-                          0.351 (± 0.0%) i/s     (2.85 s/i) -     11.000 in  31.418786s
+                          0.350 (± 0.0%) i/s     (2.86 s/i) -     11.000 in  31.477268s
 OSV - Direct Open Array output
-                          0.713 (± 0.0%) i/s     (1.40 s/i) -     22.000 in  30.938525s
-       OSV - Gzipped      0.506 (± 0.0%) i/s     (1.98 s/i) -     16.000 in  31.709708s
-OSV - Gzipped Direct      0.685 (± 0.0%) i/s     (1.46 s/i) -     21.000 in  31.145435s
-   FastCSV - Gzipped      0.324 (± 0.0%) i/s     (3.09 s/i) -     10.000 in  30.983582s
-       CSV - Gzipped      0.057 (± 0.0%) i/s    (17.69 s/i) -      2.000 in  35.379009s
+                          0.641 (± 0.0%) i/s     (1.56 s/i) -     20.000 in  31.275201s
+       OSV - Gzipped      0.530 (± 0.0%) i/s     (1.89 s/i) -     16.000 in  30.183753s
+OSV - Gzipped Direct      0.727 (± 0.0%) i/s     (1.37 s/i) -     22.000 in  30.283991s
+   FastCSV - Gzipped      0.323 (± 0.0%) i/s     (3.09 s/i) -     10.000 in  30.949600s
+       CSV - Gzipped      0.056 (± 0.0%) i/s    (17.72 s/i) -      2.000 in  35.440473s
 Comparison:
-OSV - Direct Open Array output:        0.7 i/s
-OSV - Gzipped Direct:        0.7 i/s - 1.04x  slower
-  OSV - Array output:        0.7 i/s - 1.07x  slower
-      OSV - StringIO:        0.6 i/s - 1.12x  slower
-       OSV - Gzipped:        0.5 i/s - 1.41x  slower
-  FastCSV - StringIO:        0.4 i/s - 1.93x  slower
-FastCSV - Array output:        0.4 i/s - 2.03x  slower
-   FastCSV - Gzipped:        0.3 i/s - 2.20x  slower
-   OSV - Hash output:        0.2 i/s - 2.86x  slower
-      CSV - StringIO:        0.1 i/s - 9.05x  slower
-  CSV - Array output:        0.1 i/s - 10.77x  slower
-   CSV - Hash output:        0.1 i/s - 12.20x  slower
-       CSV - Gzipped:        0.1 i/s - 12.61x  slower
+OSV - Gzipped Direct:        0.7 i/s
+      OSV - StringIO:        0.7 i/s - 1.04x  slower
+OSV - Direct Open Array output:        0.6 i/s - 1.14x  slower
+  OSV - Array output:        0.6 i/s - 1.15x  slower
+       OSV - Gzipped:        0.5 i/s - 1.37x  slower
+  FastCSV - StringIO:        0.4 i/s - 1.98x  slower
+FastCSV - Array output:        0.3 i/s - 2.08x  slower
+   OSV - Hash output:        0.3 i/s - 2.21x  slower
+   FastCSV - Gzipped:        0.3 i/s - 2.25x  slower
+      CSV - StringIO:        0.1 i/s - 9.04x  slower
+  CSV - Array output:        0.1 i/s - 11.04x  slower
+   CSV - Hash output:        0.1 i/s - 12.33x  slower
+       CSV - Gzipped:        0.1 i/s - 12.89x  slower
 ```

data/ext/osv/Cargo.toml CHANGED Viewed

@@ -16,6 +16,7 @@ rb-sys = "^0.9"
 serde = { version = "1.0", features = ["derive"] }
 serde_magnus = "0.8.1"
 thiserror = "2.0"
+itertools = "^0.14"
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

data/ext/osv/src/csv/header_cache.rs CHANGED Viewed

@@ -1,3 +1,4 @@
+use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
 /// This module exists to avoid cloning header keys in returned HashMaps.
 /// Since the underlying RString creation already involves cloning,
 /// this caching layer aims to reduce redundant allocations.
@@ -6,7 +7,7 @@
 /// so this optimization could be removed if any issues arise.
 use std::{
     collections::HashMap,
-    sync::{atomic::AtomicU32, LazyLock, Mutex},
+    sync::{atomic::AtomicU32, atomic::Ordering, LazyLock, Mutex},
 };
 use thiserror::Error;
@@ -16,64 +17,116 @@ pub enum CacheError {
     LockError(String),
 }
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 pub struct StringCache;
+#[derive(Copy, Clone)]
+pub struct StringCacheKey(Opaque<FString>, &'static str);
+impl StringCacheKey {
+    pub fn new(string: &str) -> Self {
+        let rstr = RString::new(string);
+        let fstr = rstr.to_interned_str();
+        Self(Opaque::from(fstr), fstr.as_str().unwrap())
+    }
+}
+impl AsRef<str> for StringCacheKey {
+    fn as_ref(&self) -> &'static str {
+        self.1
+    }
+}
+impl IntoValue for StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+impl std::fmt::Debug for StringCacheKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.1.fmt(f)
+    }
+}
+impl PartialEq for StringCacheKey {
+    fn eq(&self, other: &Self) -> bool {
+        self.1 == other.1
+    }
+}
+impl std::cmp::Eq for StringCacheKey {}
+impl std::hash::Hash for StringCacheKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.1.hash(state);
+    }
+}
 impl StringCache {
     #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-            Ok(existing)
+        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+            counter.fetch_add(1, Ordering::Relaxed);
+            Ok(*interned_string)
         } else {
+            let interned = StringCacheKey::new(string.as_str());
             let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, AtomicU32::new(1));
-            Ok(leaked)
+            cache.insert(leaked, (interned, AtomicU32::new(1)));
+            Ok(interned)
         }
     }
-    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        let mut result = Vec::with_capacity(strings.len());
+        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
         for string in strings {
-            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
-                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-                result.push(existing);
+            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+                counter.fetch_add(1, Ordering::Relaxed);
+                result.push(*interned_string);
             } else {
+                let interned = StringCacheKey::new(&string);
                 let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, AtomicU32::new(1));
-                result.push(leaked);
+                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                result.push(interned);
             }
         }
         Ok(result)
     }
-    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        for header in headers {
-            if let Some(count) = cache.get(header) {
-                // Returns the previous value of the counter
-                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
-                if was == 1 {
-                    cache.remove(header);
-                    let ptr = *header as *const str as *mut str;
-                    unsafe {
-                        let _ = Box::from_raw(ptr);
+        let to_remove: Vec<_> = headers
+            .iter()
+            .filter_map(|header| {
+                let key = header.as_ref();
+                if let Some((_, (_, counter))) = cache.get_key_value(key) {
+                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
+                    if prev_count == 1 {
+                        Some(key)
+                    } else {
+                        None
                     }
+                } else {
+                    None
                 }
-            }
+            })
+            .collect();
+        for key in to_remove {
+            cache.remove(key);
         }
         Ok(())

data/ext/osv/src/csv/mod.rs CHANGED Viewed

@@ -8,6 +8,7 @@ mod ruby_reader;
 pub use builder::RecordReaderBuilder;
 pub(crate) use builder::BUFFER_CHANNEL_SIZE;
+pub use header_cache::StringCacheKey;
 pub use record::CowValue;
 pub use record::CsvRecord;
 pub use ruby_integration::*;

data/ext/osv/src/csv/parser.rs CHANGED Viewed

@@ -2,13 +2,14 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasher;
+use super::header_cache::StringCacheKey;
 use super::CowValue;
 pub trait RecordParser<'a> {
     type Output: 'a;
     fn parse(
-        headers: &[&'static str],
+        headers: &[StringCacheKey],
         record: &csv::StringRecord,
         null_string: Option<&str>,
         flexible_default: Option<Cow<'a, str>>,
@@ -16,13 +17,13 @@ pub trait RecordParser<'a> {
 }
 impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
-    for HashMap<&'static str, Option<CowValue<'a>>, S>
+    for HashMap<StringCacheKey, Option<CowValue<'a>>, S>
 {
     type Output = Self;
     #[inline]
     fn parse(
-        headers: &[&'static str],
+        headers: &[StringCacheKey],
         record: &csv::StringRecord,
         null_string: Option<&str>,
         flexible_default: Option<Cow<'a, str>>,
@@ -31,7 +32,7 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
         let shared_empty = Cow::Borrowed("");
         let shared_default = flexible_default.map(CowValue);
-        headers.iter().enumerate().for_each(|(i, &header)| {
+        headers.iter().enumerate().for_each(|(i, ref header)| {
             let value = record.get(i).map_or_else(
                 || shared_default.clone(),
                 |field| {
@@ -44,7 +45,7 @@ impl<'a, S: BuildHasher + Default + 'a> RecordParser<'a>
                     }
                 },
             );
-            map.insert(header, value);
+            map.insert((*header).clone(), value);
         });
         map
     }
@@ -55,7 +56,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowValue<'a>>> {
     #[inline]
     fn parse(
-        headers: &[&'static str],
+        headers: &[StringCacheKey],
         record: &csv::StringRecord,
         null_string: Option<&str>,
         flexible_default: Option<Cow<'a, str>>,

data/ext/osv/src/csv/record.rs CHANGED Viewed

@@ -1,10 +1,13 @@
-use magnus::{IntoValue, Ruby, Value};
+use itertools::Itertools;
+use magnus::{value::ReprValue, IntoValue, Ruby, Value};
 use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
+use super::StringCacheKey;
 #[derive(Debug)]
 pub enum CsvRecord<'a, S: BuildHasher + Default> {
     Vec(Vec<Option<CowValue<'a>>>),
-    Map(HashMap<&'static str, Option<CowValue<'a>>, S>),
+    Map(HashMap<StringCacheKey, Option<CowValue<'a>>, S>),
 }
 impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -19,9 +22,23 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
             CsvRecord::Map(map) => {
                 // Pre-allocate the hash with the known size
                 let hash = handle.hash_new_capa(map.len());
-                map.into_iter()
-                    .try_for_each(|(k, v)| hash.aset(k, v))
-                    .unwrap();
+                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
+                let mut i = 0;
+                for chunk in &map.into_iter().chunks(128) {
+                    for (k, v) in chunk {
+                        values[i] = handle.into_value(k);
+                        values[i + 1] = handle.into_value(v);
+                        i += 2;
+                    }
+                    hash.bulk_insert(&values[..i]).unwrap();
+                    // Zero out used values
+                    values[..i].fill(handle.qnil().as_value());
+                    i = 0;
+                }
                 hash.into_value_with(handle)
             }
         }

data/ext/osv/src/csv/record_reader.rs CHANGED Viewed

@@ -1,3 +1,4 @@
+use super::header_cache::StringCacheKey;
 use super::parser::RecordParser;
 use super::{header_cache::StringCache, ruby_reader::SeekableRead};
 use magnus::{Error, Ruby};
@@ -14,13 +15,13 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
 enum ReaderImpl<'a, T: RecordParser<'a>> {
     SingleThreaded {
         reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<&'static str>,
+        headers: Vec<StringCacheKey>,
         null_string: Option<String>,
         flexible_default: Option<Cow<'a, str>>,
         string_record: csv::StringRecord,
     },
     MultiThreaded {
-        headers: Vec<&'static str>,
+        headers: Vec<StringCacheKey>,
         receiver: kanal::Receiver<T::Output>,
         handle: Option<thread::JoinHandle<()>>,
     },
@@ -51,7 +52,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
     pub(crate) fn new_single_threaded(
         reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<&'static str>,
+        headers: Vec<StringCacheKey>,
         null_string: Option<String>,
         flexible_default: Option<&'a str>,
     ) -> Self {
@@ -71,7 +72,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
 impl<T: RecordParser<'static> + Send> RecordReader<'static, T> {
     pub(crate) fn new_multi_threaded(
         mut reader: csv::Reader<Box<dyn Read + Send + 'static>>,
-        headers: Vec<&'static str>,
+        headers: Vec<StringCacheKey>,
         buffer_size: usize,
         null_string: Option<String>,
         flexible_default: Option<&'static str>,
@@ -162,10 +163,10 @@ impl<'a, T: RecordParser<'a>> Drop for RecordReader<'a, T> {
                 if let Some(handle) = handle.take() {
                     let _ = handle.join();
                 }
-                let _ = StringCache::clear(headers);
+                let _ = StringCache::clear(&headers);
             }
             ReaderImpl::SingleThreaded { headers, .. } => {
-                let _ = StringCache::clear(headers);
+                let _ = StringCache::clear(&headers);
             }
         }
     }

data/ext/osv/src/csv/ruby_reader.rs CHANGED Viewed

@@ -74,9 +74,7 @@ impl Seek for RubyReader<RString> {
         match pos {
             io::SeekFrom::Start(offset) => self.offset = offset as usize,
             io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
-            io::SeekFrom::End(offset) => {
-                self.offset = self.inner.len() - offset as usize
-            }
+            io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
         }
         Ok(self.offset as u64)
     }

data/ext/osv/src/reader.rs CHANGED Viewed

@@ -1,4 +1,4 @@
-use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder};
+use crate::csv::{CowValue, CsvRecord, RecordReaderBuilder, StringCacheKey};
 use crate::utils::*;
 use ahash::RandomState;
 use csv::Trim;
@@ -54,7 +54,7 @@ pub fn parse_csv(
     let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type.as_str() {
         "hash" => {
             let builder = RecordReaderBuilder::<
-                HashMap<&'static str, Option<CowValue<'static>>, RandomState>,
+                HashMap<StringCacheKey, Option<CowValue<'static>>, RandomState>,
             >::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)

data/lib/osv/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OSV
-  VERSION = "0.3.15"
+  VERSION = "0.3.16"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.15
+  version: 0.3.16
 platform: ruby
 authors:
 - Nathan Jaremko