RubyGems - osv - Versions diffs - 0.3.22 → 0.4.1 - Mend

osv 0.3.22 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +2 -3
data/ext/osv/src/csv/builder.rs +12 -24
data/ext/osv/src/csv/header_cache.rs +20 -15
data/ext/osv/src/csv/parser.rs +64 -35
data/ext/osv/src/csv/record.rs +3 -3
data/ext/osv/src/csv/record_reader.rs +53 -21
data/ext/osv/src/reader.rs +8 -8
data/ext/osv/src/utils.rs +8 -8
data/lib/osv/version.rb +1 -1
data/lib/osv.rbi +2 -5
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 26bda7b8aed144013156dea4f4f68c322b0a2042d6478c225edde0c44f54452f
-  data.tar.gz: cd63b6b71c158d8a09196a4fff496c5c7e7a9ac2c9a64724bbf31c56ff9ee0c7
+  metadata.gz: 137ae556685639f7d13234e3061d9b310757ce02f75a713753d175f1bc71b628
+  data.tar.gz: 5892494ad08d783955d2b932150d65433a4d3593376fadbaf54e54780e7a350f
 SHA512:
-  metadata.gz: 947a7cc0d9f644977d157d0424893daf10c62efee5b4d544f81731ca7f04056cc75bab464560f4aea0b793b4b4e6e63a964fed8adace0c232cd388182a709a3a
-  data.tar.gz: 78536bdbba174b441792e39dc3b1c2ca08d85bafe4dce7763156160abcc611bb38c0743e6cadfd5d3466410b42fa0e598dc32e4865393109b8c88ec9673bf44b
+  metadata.gz: 6efbc2ee65a8e79379722ae977ee7dbec6131b78968d080f9feb86a3310368c387da54dd8c073e9b4008cb80d906293ea9115982d00d5ff637cf5ab51179b53c
+  data.tar.gz: 7b4ab3199f90654cd831dfbb52a9d22b70237e7120bd5308a1b7698268fa981abefd7ee47d53424d0c7bff46956256db8f1e139d17e381fd5570a16ca183e376

data/README.md CHANGED Viewed

@@ -84,11 +84,10 @@ OSV.for_each("data.csv",
   # Parsing behavior
   flexible: false,       # Allow varying number of fields (default: false)
-  flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
-                         # Implicitly enables flexible mode if set.
   trim: :all,            # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
   buffer_size: 1024,     # Number of rows to buffer in memory (default: 1024)
   ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
+  lossy: false,             # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
 )
 ```
@@ -103,9 +102,9 @@ OSV.for_each("data.csv",
 - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
 - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
 - `flexible`: Boolean specifying if the parser should be flexible (default: false)
-- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
 - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
 - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
+- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
 When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.

data/ext/osv/src/csv/builder.rs CHANGED Viewed

@@ -79,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
     quote_char: u8,
     null_string: Option<String>,
     flexible: bool,
-    flexible_default: Option<String>,
     trim: csv::Trim,
     ignore_null_bytes: bool,
+    lossy: bool,
     _phantom: PhantomData<T>,
     _phantom_a: PhantomData<&'a ()>,
 }
@@ -97,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
             quote_char: b'"',
             null_string: None,
             flexible: false,
-            flexible_default: None,
             trim: csv::Trim::None,
             ignore_null_bytes: false,
+            lossy: false,
             _phantom: PhantomData,
             _phantom_a: PhantomData,
         }
@@ -140,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
         self
     }
-    /// Sets the default value for missing fields when in flexible mode.
-    #[must_use]
-    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
-        self.flexible_default = flexible_default;
-        self
-    }
     /// Sets the trimming mode for fields.
     #[must_use]
     pub fn trim(mut self, trim: csv::Trim) -> Self {
@@ -160,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
         self
     }
+    #[must_use]
+    pub fn lossy(mut self, lossy: bool) -> Self {
+        self.lossy = lossy;
+        self
+    }
     /// Handles reading from a file descriptor.
     fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
         let raw_value = self.to_read.as_raw();
@@ -202,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
             build_ruby_reader(&self.ruby, self.to_read)?
         };
-        let flexible = self.flexible || self.flexible_default.is_some();
+        let flexible = self.flexible;
         let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
         let mut reader = csv::ReaderBuilder::new()
@@ -214,24 +213,13 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
             .from_reader(reader);
         let mut headers =
-            RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
+            RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers, self.lossy)?;
         if self.ignore_null_bytes {
             headers = headers.iter().map(|h| h.replace("\0", "")).collect();
         }
         let static_headers = StringCache::intern_many(&headers)?;
-        // We intern both of these to get static string references we can reuse throughout the parser.
-        let flexible_default = self
-            .flexible_default
-            .map(|s| {
-                RString::new(&s)
-                    .to_interned_str()
-                    .as_str()
-                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
-            })
-            .transpose()?
-            .map(Cow::Borrowed);
         let null_string = self
             .null_string
             .map(|s| {
@@ -247,8 +235,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
             reader,
             static_headers,
             null_string,
-            flexible_default,
             self.ignore_null_bytes,
+            self.lossy,
         ))
     }
 }

data/ext/osv/src/csv/header_cache.rs CHANGED Viewed

@@ -8,7 +8,7 @@ use std::{
     collections::HashMap,
     sync::{
         atomic::{AtomicU32, Ordering},
-        LazyLock, Mutex, OnceLock,
+        Arc, LazyLock, Mutex, OnceLock,
     },
 };
@@ -22,12 +22,11 @@ pub enum CacheError {
     LockError(String),
 }
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (Arc<StringCacheKey>, AtomicU32)>>> =
     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 pub struct StringCache;
-#[derive(Copy, Clone)]
 pub struct StringCacheKey(Opaque<FString>, &'static str);
 impl StringCacheKey {
@@ -50,6 +49,12 @@ impl IntoValue for StringCacheKey {
     }
 }
+impl IntoValue for &StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
 impl std::fmt::Debug for StringCacheKey {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         self.1.fmt(f)
@@ -72,43 +77,43 @@ impl std::hash::Hash for StringCacheKey {
 impl StringCache {
     #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
+    pub fn intern(string: String) -> Result<Arc<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
         if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
             counter.fetch_add(1, Ordering::Relaxed);
-            Ok(*interned_string)
+            Ok(interned_string.clone())
         } else {
-            let interned = StringCacheKey::new(string.as_str());
+            let interned = Arc::new(StringCacheKey::new(string.as_str()));
             let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, (interned, AtomicU32::new(1)));
+            cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
             Ok(interned)
         }
     }
-    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
+    pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
-        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
+        let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
         for string in strings {
             if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
                 counter.fetch_add(1, Ordering::Relaxed);
-                result.push(*interned_string);
+                result.push(interned_string.clone());
             } else {
-                let interned = StringCacheKey::new(string);
+                let interned = Arc::new(StringCacheKey::new(string));
                 let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
                 result.push(interned);
             }
         }
         Ok(result)
     }
-    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
+    pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
@@ -116,7 +121,7 @@ impl StringCache {
         let to_remove: Vec<_> = headers
             .iter()
             .filter_map(|header| {
-                let key = header.as_ref();
+                let key = header.as_ref().as_ref();
                 if let Some((_, (_, counter))) = cache.get_key_value(key) {
                     let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
                     if prev_count == 1 {
@@ -140,7 +145,7 @@ impl StringCache {
 pub struct HeaderCacheCleanupIter<I> {
     pub inner: I,
-    pub headers: OnceLock<Vec<StringCacheKey>>,
+    pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
 }
 impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {

data/ext/osv/src/csv/parser.rs CHANGED Viewed

@@ -1,44 +1,47 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasher;
+use std::sync::Arc;
 use super::header_cache::StringCacheKey;
 use super::CowStr;
+pub enum CsvRecordType {
+    String(csv::StringRecord),
+    Byte(csv::ByteRecord),
+}
 pub trait RecordParser<'a> {
     type Output;
     fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
         null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
         ignore_null_bytes: bool,
     ) -> Self::Output;
 }
 impl<'a, S: BuildHasher + Default> RecordParser<'a>
-    for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
+    for HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>
 {
     type Output = Self;
     #[inline]
     fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
         null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
         ignore_null_bytes: bool,
     ) -> Self::Output {
         let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(CowStr);
         headers.iter().enumerate().for_each(|(i, header)| {
-            let value = record.get(i).map_or_else(
-                || shared_default.clone(),
-                |field| {
-                    if null_string.as_deref() == Some(field) {
+            let value = match record {
+                CsvRecordType::String(s) => s.get(i).and_then(|field| {
+                    if null_string.as_deref() == Some(field.as_ref()) {
                         None
                     } else if field.is_empty() {
                         Some(CowStr(shared_empty.clone()))
@@ -47,9 +50,23 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
                     } else {
                         Some(CowStr(Cow::Owned(field.to_string())))
                     }
-                },
-            );
-            map.insert(*header, value);
+                }),
+                CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
+                    let field = String::from_utf8_lossy(field);
+                    if null_string.as_deref() == Some(field.as_ref()) {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    }
+                }),
+            };
+            map.insert(header.clone(), value);
         });
         map
     }
@@ -60,36 +77,48 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
     #[inline]
     fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
         null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
         ignore_null_bytes: bool,
     ) -> Self::Output {
         let target_len = headers.len();
         let mut vec = Vec::with_capacity(target_len);
         let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(CowStr);
-        for field in record.iter() {
-            let value = if Some(field) == null_string.as_deref() {
-                None
-            } else if field.is_empty() {
-                Some(CowStr(shared_empty.clone()))
-            } else if ignore_null_bytes {
-                Some(CowStr(Cow::Owned(field.replace("\0", ""))))
-            } else {
-                Some(CowStr(Cow::Owned(field.to_string())))
-            };
-            vec.push(value);
-        }
-        if vec.len() < target_len {
-            if let Some(default) = shared_default {
-                vec.resize_with(target_len, || Some(default.clone()));
+        match record {
+            CsvRecordType::String(record) => {
+                for field in record.iter() {
+                    let value = if Some(field.as_ref()) == null_string.as_deref() {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    };
+                    vec.push(value);
+                }
+            }
+            CsvRecordType::Byte(record) => {
+                for field in record.iter() {
+                    let field = String::from_utf8_lossy(field);
+                    let value = if Some(field.as_ref()) == null_string.as_deref() {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    };
+                    vec.push(value);
+                }
             }
         }
         vec
     }
 }

data/ext/osv/src/csv/record.rs CHANGED Viewed

@@ -1,13 +1,13 @@
 use itertools::Itertools;
 use magnus::{value::ReprValue, IntoValue, Ruby, Value};
-use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
+use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
 use super::StringCacheKey;
 #[derive(Debug)]
 pub enum CsvRecord<'a, S: BuildHasher + Default> {
     Vec(Vec<Option<CowStr<'a>>>),
-    Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
+    Map(HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>),
 }
 impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -28,7 +28,7 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
                 for chunk in &map.into_iter().chunks(128) {
                     for (k, v) in chunk {
-                        values[i] = handle.into_value(k);
+                        values[i] = handle.into_value(k.as_ref());
                         values[i + 1] = handle.into_value(v);
                         i += 2;
                     }

data/ext/osv/src/csv/record_reader.rs CHANGED Viewed

@@ -1,10 +1,11 @@
 use super::builder::ReaderError;
 use super::header_cache::StringCacheKey;
-use super::parser::RecordParser;
+use super::parser::{CsvRecordType, RecordParser};
 use super::{header_cache::StringCache, ruby_reader::SeekableRead};
 use magnus::{Error, Ruby};
 use std::borrow::Cow;
 use std::io::{BufReader, Read};
+use std::sync::Arc;
 /// Size of the internal buffer used for reading CSV records
 pub(crate) const READ_BUFFER_SIZE: usize = 16384;
@@ -14,10 +15,9 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
 /// This struct implements Iterator to provide a streaming interface for CSV records.
 pub struct RecordReader<'a, T: RecordParser<'a>> {
     reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-    headers: Vec<StringCacheKey>,
+    headers: Vec<Arc<StringCacheKey>>,
     null_string: Option<Cow<'a, str>>,
-    flexible_default: Option<Cow<'a, str>>,
-    string_record: csv::StringRecord,
+    string_record: CsvRecordType,
     parser: std::marker::PhantomData<T>,
     ignore_null_bytes: bool,
 }
@@ -37,36 +37,65 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
         ruby: &Ruby,
         reader: &mut csv::Reader<impl Read>,
         has_headers: bool,
+        lossy: bool,
     ) -> Result<Vec<String>, Error> {
-        let first_row = reader.headers().map_err(|e| {
-            Error::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to read headers: {e}"),
-            )
-        })?;
-        Ok(if has_headers {
-            first_row.iter().map(String::from).collect()
+        let headers = if lossy {
+            let first_row = reader.byte_headers().map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?;
+            if has_headers {
+                first_row
+                    .iter()
+                    .map(String::from_utf8_lossy)
+                    .map(|x| x.to_string())
+                    .collect()
+            } else {
+                (0..first_row.len()).map(|i| format!("c{i}")).collect()
+            }
         } else {
-            (0..first_row.len()).map(|i| format!("c{i}")).collect()
-        })
+            let first_row = reader.headers().map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?;
+            if has_headers {
+                first_row.iter().map(String::from).collect()
+            } else {
+                (0..first_row.len()).map(|i| format!("c{i}")).collect()
+            }
+        };
+        Ok(headers)
     }
     /// Creates a new RecordReader instance.
     pub(crate) fn new(
         reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<StringCacheKey>,
+        headers: Vec<Arc<StringCacheKey>>,
         null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
         ignore_null_bytes: bool,
+        lossy: bool,
     ) -> Self {
         let headers_len = headers.len();
         Self {
             reader,
             headers,
             null_string,
-            flexible_default,
-            string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
+            string_record: if lossy {
+                CsvRecordType::Byte(csv::ByteRecord::with_capacity(
+                    READ_BUFFER_SIZE,
+                    headers_len,
+                ))
+            } else {
+                CsvRecordType::String(csv::StringRecord::with_capacity(
+                    READ_BUFFER_SIZE,
+                    headers_len,
+                ))
+            },
             parser: std::marker::PhantomData,
             ignore_null_bytes,
         }
@@ -74,12 +103,15 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
     /// Attempts to read the next record, returning any errors encountered.
     fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
-        if self.reader.read_record(&mut self.string_record)? {
+        let record = match self.string_record {
+            CsvRecordType::String(ref mut record) => self.reader.read_record(record),
+            CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
+        }?;
+        if record {
             Ok(Some(T::parse(
                 &self.headers,
                 &self.string_record,
                 self.null_string.clone(),
-                self.flexible_default.clone(),
                 self.ignore_null_bytes,
             )))
         } else {

data/ext/osv/src/reader.rs CHANGED Viewed

@@ -5,6 +5,7 @@ use csv::Trim;
 use magnus::value::ReprValue;
 use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
 use std::collections::HashMap;
+use std::sync::Arc;
 /// Valid result types for CSV parsing
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -34,9 +35,9 @@ struct EnumeratorArgs {
     null_string: Option<String>,
     result_type: String,
     flexible: bool,
-    flexible_default: Option<String>,
     trim: Option<String>,
     ignore_null_bytes: bool,
+    lossy: bool,
 }
 /// Parses a CSV file with the given configuration.
@@ -56,9 +57,9 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
         null_string,
         result_type,
         flexible,
-        flexible_default,
         trim,
         ignore_null_bytes,
+        lossy,
     } = parse_read_csv_args(&ruby, args)?;
     if !ruby.block_given() {
@@ -71,7 +72,6 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
             null_string,
             result_type,
             flexible,
-            flexible_default,
             trim: match trim {
                 Trim::All => Some("all".to_string()),
                 Trim::Headers => Some("headers".to_string()),
@@ -79,6 +79,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
                 _ => None,
             },
             ignore_null_bytes,
+            lossy,
         })
         .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
@@ -93,16 +94,16 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
     match result_type {
         ResultType::Hash => {
             let builder = RecordReaderBuilder::<
-                HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
+                HashMap<Arc<StringCacheKey>, Option<CowStr<'static>>, RandomState>,
             >::new(ruby, to_read)
             .has_headers(has_headers)
             .flexible(flexible)
-            .flexible_default(flexible_default)
             .trim(trim)
             .delimiter(delimiter)
             .quote_char(quote_char)
             .null_string(null_string)
             .ignore_null_bytes(ignore_null_bytes)
+            .lossy(lossy)
             .build()?;
             let ruby = unsafe { Ruby::get_unchecked() };
@@ -115,12 +116,12 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
             let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
                 .has_headers(has_headers)
                 .flexible(flexible)
-                .flexible_default(flexible_default)
                 .trim(trim)
                 .delimiter(delimiter)
                 .quote_char(quote_char)
                 .null_string(null_string)
                 .ignore_null_bytes(ignore_null_bytes)
+                .lossy(lossy)
                 .build()?;
             let ruby = unsafe { Ruby::get_unchecked() };
@@ -150,10 +151,9 @@ fn create_enumerator(args: EnumeratorArgs) -> Result<magnus::Enumerator, Error>
     kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
     kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
     kwargs.aset(Symbol::new("flexible"), args.flexible)?;
-    kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
     kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
     kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?;
+    kwargs.aset(Symbol::new("lossy"), args.lossy)?;
     Ok(args
         .rb_self
         .enumeratorize("for_each", (args.to_read, KwArgs(kwargs))))

data/ext/osv/src/utils.rs CHANGED Viewed

@@ -34,9 +34,9 @@ pub struct ReadCsvArgs {
     pub null_string: Option<String>,
     pub result_type: String,
     pub flexible: bool,
-    pub flexible_default: Option<String>,
     pub trim: csv::Trim,
     pub ignore_null_bytes: bool,
+    pub lossy: bool,
 }
 /// Parse common arguments for CSV parsing
@@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
             Option<Option<String>>,
             Option<Option<Value>>,
             Option<Option<bool>>,
-            Option<Option<Option<String>>>,
             Option<Option<Value>>,
             Option<Option<bool>>,
+            Option<Option<bool>>,
         ),
         (),
     >(
@@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
             "nil_string",
             "result_type",
             "flexible",
-            "flexible_default",
             "trim",
             "ignore_null_bytes",
+            "lossy",
         ],
     )?;
@@ -134,11 +134,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
     let flexible = kwargs.optional.5.flatten().unwrap_or_default();
-    let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
     let trim = match kwargs
         .optional
-        .7
+        .6
         .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
@@ -166,7 +164,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         None => csv::Trim::None,
     };
-    let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default();
+    let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default();
+    let lossy = kwargs.optional.8.flatten().unwrap_or_default();
     Ok(ReadCsvArgs {
         to_read,
@@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
         null_string,
         result_type,
         flexible,
-        flexible_default,
         trim,
         ignore_null_bytes,
+        lossy,
     })
 }

data/lib/osv/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OSV
-  VERSION = "0.3.22"
+  VERSION = "0.4.1"
 end

data/lib/osv.rbi CHANGED Viewed

@@ -17,14 +17,12 @@ module OSV
   #                    ("hash" or "array" or :hash or :array)
   #   - `flexible`: Boolean specifying if the parser should be flexible
   #                 (default: false)
-  #   - `flexible_default`: String specifying the default value for missing fields.
-  #                         Implicitly enables flexible mode if set.
-  #                         (default: `nil`)
   #   - `trim`: String specifying the trim mode
   #             ("all" or "headers" or "fields" or :all or :headers or :fields)
   #             (default: `nil`)
   #   - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
   #                         (default: false)
+  #   - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character
   sig do
     params(
       input: T.any(String, StringIO, IO),
@@ -35,7 +33,6 @@ module OSV
       buffer_size: T.nilable(Integer),
       result_type: T.nilable(T.any(String, Symbol)),
       flexible: T.nilable(T::Boolean),
-      flexible_default: T.nilable(String),
       ignore_null_bytes: T.nilable(T::Boolean),
       trim: T.nilable(T.any(String, Symbol)),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
@@ -50,9 +47,9 @@ module OSV
     buffer_size: nil,
     result_type: nil,
     flexible: nil,
-    flexible_default: nil,
     ignore_null_bytes: nil,
     trim: nil,
+    lossy: nil,
     &blk
   )
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.22
+  version: 0.4.1
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-30 00:00:00.000000000 Z
+date: 2025-01-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys