osv 0.3.22 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 26bda7b8aed144013156dea4f4f68c322b0a2042d6478c225edde0c44f54452f
-  data.tar.gz: cd63b6b71c158d8a09196a4fff496c5c7e7a9ac2c9a64724bbf31c56ff9ee0c7
+  metadata.gz: 137ae556685639f7d13234e3061d9b310757ce02f75a713753d175f1bc71b628
+  data.tar.gz: 5892494ad08d783955d2b932150d65433a4d3593376fadbaf54e54780e7a350f
 SHA512:
-  metadata.gz: 947a7cc0d9f644977d157d0424893daf10c62efee5b4d544f81731ca7f04056cc75bab464560f4aea0b793b4b4e6e63a964fed8adace0c232cd388182a709a3a
-  data.tar.gz: 78536bdbba174b441792e39dc3b1c2ca08d85bafe4dce7763156160abcc611bb38c0743e6cadfd5d3466410b42fa0e598dc32e4865393109b8c88ec9673bf44b
+  metadata.gz: 6efbc2ee65a8e79379722ae977ee7dbec6131b78968d080f9feb86a3310368c387da54dd8c073e9b4008cb80d906293ea9115982d00d5ff637cf5ab51179b53c
+  data.tar.gz: 7b4ab3199f90654cd831dfbb52a9d22b70237e7120bd5308a1b7698268fa981abefd7ee47d53424d0c7bff46956256db8f1e139d17e381fd5570a16ca183e376
data/README.md CHANGED
@@ -84,11 +84,10 @@ OSV.for_each("data.csv",
 
   # Parsing behavior
   flexible: false, # Allow varying number of fields (default: false)
-  flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
-                         # Implicitly enables flexible mode if set.
   trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
   buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
   ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
+  lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
 )
 ```
 
@@ -103,9 +102,9 @@ OSV.for_each("data.csv",
 - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
 - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
 - `flexible`: Boolean specifying if the parser should be flexible (default: false)
-- `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
 - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
 - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
+- `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
 
 When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
 
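The new `lossy` option corresponds to Rust's lossy UTF-8 conversion in the native extension (the `String::from_utf8_lossy` calls in the Rust changes below): invalid byte sequences are replaced with U+FFFD REPLACEMENT CHARACTER instead of raising an encoding error. A minimal sketch of that standard-library behavior, for illustration only (not the gem's code):

```rust
fn main() {
    // 0xFF can never occur in well-formed UTF-8, so the lossy
    // conversion substitutes U+FFFD for it.
    let raw: &[u8] = b"caf\xFF,42";
    let decoded = String::from_utf8_lossy(raw);
    assert_eq!(decoded, "caf\u{FFFD},42");
}
```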
@@ -79,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
     quote_char: u8,
     null_string: Option<String>,
     flexible: bool,
-    flexible_default: Option<String>,
     trim: csv::Trim,
     ignore_null_bytes: bool,
+    lossy: bool,
     _phantom: PhantomData<T>,
     _phantom_a: PhantomData<&'a ()>,
 }
@@ -97,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
            quote_char: b'"',
            null_string: None,
            flexible: false,
-            flexible_default: None,
            trim: csv::Trim::None,
            ignore_null_bytes: false,
+            lossy: false,
            _phantom: PhantomData,
            _phantom_a: PhantomData,
        }
@@ -140,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
        self
    }
 
-    /// Sets the default value for missing fields when in flexible mode.
-    #[must_use]
-    pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
-        self.flexible_default = flexible_default;
-        self
-    }
-
    /// Sets the trimming mode for fields.
    #[must_use]
    pub fn trim(mut self, trim: csv::Trim) -> Self {
@@ -160,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
        self
    }
 
+    #[must_use]
+    pub fn lossy(mut self, lossy: bool) -> Self {
+        self.lossy = lossy;
+        self
+    }
+
    /// Handles reading from a file descriptor.
    fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
        let raw_value = self.to_read.as_raw();
@@ -202,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
            build_ruby_reader(&self.ruby, self.to_read)?
        };
 
-        let flexible = self.flexible || self.flexible_default.is_some();
+        let flexible = self.flexible;
        let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
 
        let mut reader = csv::ReaderBuilder::new()
@@ -214,24 +213,13 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
            .from_reader(reader);
 
        let mut headers =
-            RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
+            RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers, self.lossy)?;
+
        if self.ignore_null_bytes {
            headers = headers.iter().map(|h| h.replace("\0", "")).collect();
        }
        let static_headers = StringCache::intern_many(&headers)?;
 
-        // We intern both of these to get static string references we can reuse throughout the parser.
-        let flexible_default = self
-            .flexible_default
-            .map(|s| {
-                RString::new(&s)
-                    .to_interned_str()
-                    .as_str()
-                    .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
-            })
-            .transpose()?
-            .map(Cow::Borrowed);
-
        let null_string = self
            .null_string
            .map(|s| {
@@ -247,8 +235,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
            reader,
            static_headers,
            null_string,
-            flexible_default,
            self.ignore_null_bytes,
+            self.lossy,
        ))
    }
 }
@@ -8,7 +8,7 @@ use std::{
    collections::HashMap,
    sync::{
        atomic::{AtomicU32, Ordering},
-        LazyLock, Mutex, OnceLock,
+        Arc, LazyLock, Mutex, OnceLock,
    },
 };
 
@@ -22,12 +22,11 @@ pub enum CacheError {
    LockError(String),
 }
 
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (Arc<StringCacheKey>, AtomicU32)>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
 
 pub struct StringCache;
 
-#[derive(Copy, Clone)]
 pub struct StringCacheKey(Opaque<FString>, &'static str);
 
 impl StringCacheKey {
@@ -50,6 +49,12 @@ impl IntoValue for StringCacheKey {
    }
 }
 
+impl IntoValue for &StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+
 impl std::fmt::Debug for StringCacheKey {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.1.fmt(f)
@@ -72,43 +77,43 @@ impl std::hash::Hash for StringCacheKey {
 
 impl StringCache {
    #[allow(dead_code)]
-    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
+    pub fn intern(string: String) -> Result<Arc<StringCacheKey>, CacheError> {
        let mut cache = STRING_CACHE
            .lock()
            .map_err(|e| CacheError::LockError(e.to_string()))?;
 
        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
            counter.fetch_add(1, Ordering::Relaxed);
-            Ok(*interned_string)
+            Ok(interned_string.clone())
        } else {
-            let interned = StringCacheKey::new(string.as_str());
+            let interned = Arc::new(StringCacheKey::new(string.as_str()));
            let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, (interned, AtomicU32::new(1)));
+            cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
            Ok(interned)
        }
    }
 
-    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
+    pub fn intern_many(strings: &[String]) -> Result<Vec<Arc<StringCacheKey>>, CacheError> {
        let mut cache = STRING_CACHE
            .lock()
            .map_err(|e| CacheError::LockError(e.to_string()))?;
 
-        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
+        let mut result: Vec<Arc<StringCacheKey>> = Vec::with_capacity(strings.len());
        for string in strings {
            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
                counter.fetch_add(1, Ordering::Relaxed);
-                result.push(*interned_string);
+                result.push(interned_string.clone());
            } else {
-                let interned = StringCacheKey::new(string);
+                let interned = Arc::new(StringCacheKey::new(string));
                let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                cache.insert(leaked, (interned.clone(), AtomicU32::new(1)));
                result.push(interned);
            }
        }
        Ok(result)
    }
 
-    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
+    pub fn clear(headers: &[Arc<StringCacheKey>]) -> Result<(), CacheError> {
        let mut cache = STRING_CACHE
            .lock()
            .map_err(|e| CacheError::LockError(e.to_string()))?;
@@ -116,7 +121,7 @@ impl StringCache {
        let to_remove: Vec<_> = headers
            .iter()
            .filter_map(|header| {
-                let key = header.as_ref();
+                let key = header.as_ref().as_ref();
                if let Some((_, (_, counter))) = cache.get_key_value(key) {
                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
                    if prev_count == 1 {
@@ -140,7 +145,7 @@ impl StringCache {
 
 pub struct HeaderCacheCleanupIter<I> {
    pub inner: I,
-    pub headers: OnceLock<Vec<StringCacheKey>>,
+    pub headers: OnceLock<Vec<Arc<StringCacheKey>>>,
 }
 
 impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
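Dropping `#[derive(Copy, Clone)]` and storing `Arc<StringCacheKey>` in the cache changes header keys from copy-by-value to shared, reference-counted handles, so the reader, the parsed rows, and the cleanup iterator all point at one allocation. A hypothetical miniature of this interning pattern (names invented here, not the gem's code):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex, OnceLock};

// Global intern table: each distinct string is stored once, and every
// caller receives an Arc handle sharing that single allocation.
fn cache() -> &'static Mutex<HashMap<String, Arc<String>>> {
    static CACHE: OnceLock<Mutex<HashMap<String, Arc<String>>>> = OnceLock::new();
    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
}

fn intern(s: &str) -> Arc<String> {
    let mut map = cache().lock().unwrap();
    map.entry(s.to_string())
        .or_insert_with(|| Arc::new(s.to_string()))
        .clone() // cloning an Arc bumps the refcount; no `T: Copy` needed
}

fn main() {
    let a = intern("header");
    let b = intern("header");
    assert!(Arc::ptr_eq(&a, &b)); // both handles share one allocation
    assert_eq!(Arc::strong_count(&a), 3); // a, b, and the cache entry
}
```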
@@ -1,44 +1,47 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasher;
+use std::sync::Arc;
 
 use super::header_cache::StringCacheKey;
 use super::CowStr;
 
+pub enum CsvRecordType {
+    String(csv::StringRecord),
+    Byte(csv::ByteRecord),
+}
+
 pub trait RecordParser<'a> {
    type Output;
 
    fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
        null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
        ignore_null_bytes: bool,
    ) -> Self::Output;
 }
 
 impl<'a, S: BuildHasher + Default> RecordParser<'a>
-    for HashMap<StringCacheKey, Option<CowStr<'a>>, S>
+    for HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>
 {
    type Output = Self;
 
    #[inline]
    fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
        null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
        ignore_null_bytes: bool,
    ) -> Self::Output {
        let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
 
        let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(CowStr);
+
        headers.iter().enumerate().for_each(|(i, header)| {
-            let value = record.get(i).map_or_else(
-                || shared_default.clone(),
-                |field| {
-                    if null_string.as_deref() == Some(field) {
+            let value = match record {
+                CsvRecordType::String(s) => s.get(i).and_then(|field| {
+                    if null_string.as_deref() == Some(field.as_ref()) {
                        None
                    } else if field.is_empty() {
                        Some(CowStr(shared_empty.clone()))
@@ -47,9 +50,23 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
                    } else {
                        Some(CowStr(Cow::Owned(field.to_string())))
                    }
-                },
-            );
-            map.insert(*header, value);
+                }),
+
+                CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
+                    let field = String::from_utf8_lossy(field);
+                    if null_string.as_deref() == Some(field.as_ref()) {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    }
+                }),
+            };
+
+            map.insert(header.clone(), value);
        });
        map
    }
@@ -60,36 +77,48 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
 
    #[inline]
    fn parse(
-        headers: &[StringCacheKey],
-        record: &csv::StringRecord,
+        headers: &[Arc<StringCacheKey>],
+        record: &CsvRecordType,
        null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
        ignore_null_bytes: bool,
    ) -> Self::Output {
        let target_len = headers.len();
        let mut vec = Vec::with_capacity(target_len);
 
        let shared_empty = Cow::Borrowed("");
-        let shared_default = flexible_default.map(CowStr);
-
-        for field in record.iter() {
-            let value = if Some(field) == null_string.as_deref() {
-                None
-            } else if field.is_empty() {
-                Some(CowStr(shared_empty.clone()))
-            } else if ignore_null_bytes {
-                Some(CowStr(Cow::Owned(field.replace("\0", ""))))
-            } else {
-                Some(CowStr(Cow::Owned(field.to_string())))
-            };
-            vec.push(value);
-        }
 
-        if vec.len() < target_len {
-            if let Some(default) = shared_default {
-                vec.resize_with(target_len, || Some(default.clone()));
+        match record {
+            CsvRecordType::String(record) => {
+                for field in record.iter() {
+                    let value = if Some(field.as_ref()) == null_string.as_deref() {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    };
+                    vec.push(value);
+                }
+            }
+            CsvRecordType::Byte(record) => {
+                for field in record.iter() {
+                    let field = String::from_utf8_lossy(field);
+                    let value = if Some(field.as_ref()) == null_string.as_deref() {
+                        None
+                    } else if field.is_empty() {
+                        Some(CowStr(shared_empty.clone()))
+                    } else if ignore_null_bytes {
+                        Some(CowStr(Cow::Owned(field.replace("\0", ""))))
+                    } else {
+                        Some(CowStr(Cow::Owned(field.to_string())))
+                    };
+                    vec.push(value);
+                }
            }
        }
+
        vec
    }
 }
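Both `parse` implementations above take the same lossy path for byte records: each raw field passes through `String::from_utf8_lossy` before the null-string, empty-field, and null-byte checks run. A standalone sketch of that decode step against the `csv` crate (illustrative setup; mine, not the gem's):

```rust
use std::borrow::Cow;

fn main() -> Result<(), csv::Error> {
    // The second data row contains a byte (0xFF) that is not valid UTF-8.
    let data: &[u8] = b"name,note\nalice,ok\nbob,caf\xFF\n";
    let mut reader = csv::ReaderBuilder::new().from_reader(data);

    // Read raw bytes, then decode each field lossily.
    let mut record = csv::ByteRecord::new();
    while reader.read_byte_record(&mut record)? {
        let fields: Vec<Cow<'_, str>> =
            record.iter().map(String::from_utf8_lossy).collect();
        println!("{fields:?}"); // second row prints ["bob", "caf\u{fffd}"]
    }
    Ok(())
}
```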
@@ -1,13 +1,13 @@
 use itertools::Itertools;
 use magnus::{value::ReprValue, IntoValue, Ruby, Value};
-use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
+use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
 
 use super::StringCacheKey;
 
 #[derive(Debug)]
 pub enum CsvRecord<'a, S: BuildHasher + Default> {
    Vec(Vec<Option<CowStr<'a>>>),
-    Map(HashMap<StringCacheKey, Option<CowStr<'a>>, S>),
+    Map(HashMap<Arc<StringCacheKey>, Option<CowStr<'a>>, S>),
 }
 
 impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
@@ -28,7 +28,7 @@ impl<S: BuildHasher + Default> IntoValue for CsvRecord<'_, S> {
 
            for chunk in &map.into_iter().chunks(128) {
                for (k, v) in chunk {
-                    values[i] = handle.into_value(k);
+                    values[i] = handle.into_value(k.as_ref());
                    values[i + 1] = handle.into_value(v);
                    i += 2;
                }
@@ -1,10 +1,11 @@
 use super::builder::ReaderError;
 use super::header_cache::StringCacheKey;
-use super::parser::RecordParser;
+use super::parser::{CsvRecordType, RecordParser};
 use super::{header_cache::StringCache, ruby_reader::SeekableRead};
 use magnus::{Error, Ruby};
 use std::borrow::Cow;
 use std::io::{BufReader, Read};
+use std::sync::Arc;
 
 /// Size of the internal buffer used for reading CSV records
 pub(crate) const READ_BUFFER_SIZE: usize = 16384;
@@ -14,10 +15,9 @@ pub(crate) const READ_BUFFER_SIZE: usize = 16384;
 /// This struct implements Iterator to provide a streaming interface for CSV records.
 pub struct RecordReader<'a, T: RecordParser<'a>> {
    reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-    headers: Vec<StringCacheKey>,
+    headers: Vec<Arc<StringCacheKey>>,
    null_string: Option<Cow<'a, str>>,
-    flexible_default: Option<Cow<'a, str>>,
-    string_record: csv::StringRecord,
+    string_record: CsvRecordType,
    parser: std::marker::PhantomData<T>,
    ignore_null_bytes: bool,
 }
@@ -37,36 +37,65 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
        ruby: &Ruby,
        reader: &mut csv::Reader<impl Read>,
        has_headers: bool,
+        lossy: bool,
    ) -> Result<Vec<String>, Error> {
-        let first_row = reader.headers().map_err(|e| {
-            Error::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to read headers: {e}"),
-            )
-        })?;
-
-        Ok(if has_headers {
-            first_row.iter().map(String::from).collect()
+        let headers = if lossy {
+            let first_row = reader.byte_headers().map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?;
+            if has_headers {
+                first_row
+                    .iter()
+                    .map(String::from_utf8_lossy)
+                    .map(|x| x.to_string())
+                    .collect()
+            } else {
+                (0..first_row.len()).map(|i| format!("c{i}")).collect()
+            }
        } else {
-            (0..first_row.len()).map(|i| format!("c{i}")).collect()
-        })
+            let first_row = reader.headers().map_err(|e| {
+                Error::new(
+                    ruby.exception_runtime_error(),
+                    format!("Failed to read headers: {e}"),
+                )
+            })?;
+            if has_headers {
+                first_row.iter().map(String::from).collect()
+            } else {
+                (0..first_row.len()).map(|i| format!("c{i}")).collect()
+            }
+        };
+
+        Ok(headers)
    }
 
    /// Creates a new RecordReader instance.
    pub(crate) fn new(
        reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
-        headers: Vec<StringCacheKey>,
+        headers: Vec<Arc<StringCacheKey>>,
        null_string: Option<Cow<'a, str>>,
-        flexible_default: Option<Cow<'a, str>>,
        ignore_null_bytes: bool,
+        lossy: bool,
    ) -> Self {
        let headers_len = headers.len();
        Self {
            reader,
            headers,
            null_string,
-            flexible_default,
-            string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
+            string_record: if lossy {
+                CsvRecordType::Byte(csv::ByteRecord::with_capacity(
+                    READ_BUFFER_SIZE,
+                    headers_len,
+                ))
+            } else {
+                CsvRecordType::String(csv::StringRecord::with_capacity(
+                    READ_BUFFER_SIZE,
+                    headers_len,
+                ))
+            },
            parser: std::marker::PhantomData,
            ignore_null_bytes,
        }
@@ -74,12 +103,15 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
 
    /// Attempts to read the next record, returning any errors encountered.
    fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
-        if self.reader.read_record(&mut self.string_record)? {
+        let record = match self.string_record {
+            CsvRecordType::String(ref mut record) => self.reader.read_record(record),
+            CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
+        }?;
+        if record {
            Ok(Some(T::parse(
                &self.headers,
                &self.string_record,
                self.null_string.clone(),
-                self.flexible_default.clone(),
                self.ignore_null_bytes,
            )))
        } else {
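The reworked `try_next` keeps one preallocated record buffer and picks the matching `csv` read method by enum variant, so the strict and lossy paths share a single hot loop. A hypothetical reduction of that dispatch (own names, not the gem's code):

```rust
// One reusable buffer, typed by an enum, so the read loop never
// allocates a fresh record per row.
enum RecordBuf {
    String(csv::StringRecord),
    Byte(csv::ByteRecord),
}

fn read_next<R: std::io::Read>(
    reader: &mut csv::Reader<R>,
    buf: &mut RecordBuf,
) -> csv::Result<bool> {
    match buf {
        RecordBuf::String(rec) => reader.read_record(rec),
        RecordBuf::Byte(rec) => reader.read_byte_record(rec),
    }
}

fn main() -> Result<(), csv::Error> {
    let mut reader = csv::ReaderBuilder::new().from_reader("a,b\n1,2\n".as_bytes());
    let mut buf = RecordBuf::String(csv::StringRecord::new());
    while read_next(&mut reader, &mut buf)? {
        if let RecordBuf::String(rec) = &buf {
            println!("{rec:?}"); // e.g. StringRecord(["1", "2"])
        }
    }
    Ok(())
}
```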
@@ -5,6 +5,7 @@ use csv::Trim;
 use magnus::value::ReprValue;
 use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
 use std::collections::HashMap;
+use std::sync::Arc;
 
 /// Valid result types for CSV parsing
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -34,9 +35,9 @@ struct EnumeratorArgs {
    null_string: Option<String>,
    result_type: String,
    flexible: bool,
-    flexible_default: Option<String>,
    trim: Option<String>,
    ignore_null_bytes: bool,
+    lossy: bool,
 }
 
 /// Parses a CSV file with the given configuration.
@@ -56,9 +57,9 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
        null_string,
        result_type,
        flexible,
-        flexible_default,
        trim,
        ignore_null_bytes,
+        lossy,
    } = parse_read_csv_args(&ruby, args)?;
 
    if !ruby.block_given() {
@@ -71,7 +72,6 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
            null_string,
            result_type,
            flexible,
-            flexible_default,
            trim: match trim {
                Trim::All => Some("all".to_string()),
                Trim::Headers => Some("headers".to_string()),
@@ -79,6 +79,7 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
                _ => None,
            },
            ignore_null_bytes,
+            lossy,
        })
        .map(|yield_enum| yield_enum.into_value_with(&ruby));
    }
@@ -93,16 +94,16 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
    match result_type {
        ResultType::Hash => {
            let builder = RecordReaderBuilder::<
-                HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
+                HashMap<Arc<StringCacheKey>, Option<CowStr<'static>>, RandomState>,
            >::new(ruby, to_read)
            .has_headers(has_headers)
            .flexible(flexible)
-            .flexible_default(flexible_default)
            .trim(trim)
            .delimiter(delimiter)
            .quote_char(quote_char)
            .null_string(null_string)
            .ignore_null_bytes(ignore_null_bytes)
+            .lossy(lossy)
            .build()?;
 
            let ruby = unsafe { Ruby::get_unchecked() };
@@ -115,12 +116,12 @@ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
            let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
                .has_headers(has_headers)
                .flexible(flexible)
-                .flexible_default(flexible_default)
                .trim(trim)
                .delimiter(delimiter)
                .quote_char(quote_char)
                .null_string(null_string)
                .ignore_null_bytes(ignore_null_bytes)
+                .lossy(lossy)
                .build()?;
 
            let ruby = unsafe { Ruby::get_unchecked() };
@@ -150,10 +151,9 @@ fn create_enumerator(args: EnumeratorArgs) -> Result<magnus::Enumerator, Error>
    kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
    kwargs.aset(Symbol::new("flexible"), args.flexible)?;
-    kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
    kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
    kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?;
-
+    kwargs.aset(Symbol::new("lossy"), args.lossy)?;
    Ok(args
        .rb_self
        .enumeratorize("for_each", (args.to_read, KwArgs(kwargs))))
data/ext/osv/src/utils.rs CHANGED
@@ -34,9 +34,9 @@ pub struct ReadCsvArgs {
    pub null_string: Option<String>,
    pub result_type: String,
    pub flexible: bool,
-    pub flexible_default: Option<String>,
    pub trim: csv::Trim,
    pub ignore_null_bytes: bool,
+    pub lossy: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
            Option<Option<String>>,
            Option<Option<Value>>,
            Option<Option<bool>>,
-            Option<Option<Option<String>>>,
            Option<Option<Value>>,
            Option<Option<bool>>,
+            Option<Option<bool>>,
        ),
        (),
    >(
@@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
            "nil_string",
            "result_type",
            "flexible",
-            "flexible_default",
            "trim",
            "ignore_null_bytes",
+            "lossy",
        ],
    )?;
 
@@ -134,11 +134,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
 
    let flexible = kwargs.optional.5.flatten().unwrap_or_default();
 
-    let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
-
    let trim = match kwargs
        .optional
-        .7
+        .6
        .flatten()
        .map(|value| parse_string_or_symbol(ruby, value))
    {
@@ -166,7 +164,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
        None => csv::Trim::None,
    };
 
-    let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default();
+    let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default();
+
+    let lossy = kwargs.optional.8.flatten().unwrap_or_default();
 
    Ok(ReadCsvArgs {
        to_read,
@@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
        null_string,
        result_type,
        flexible,
-        flexible_default,
        trim,
        ignore_null_bytes,
+        lossy,
    })
 }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OSV
-  VERSION = "0.3.22"
+  VERSION = "0.4.1"
 end
data/lib/osv.rbi CHANGED
@@ -17,14 +17,12 @@ module OSV
  #   ("hash" or "array" or :hash or :array)
  # - `flexible`: Boolean specifying if the parser should be flexible
  #   (default: false)
-  # - `flexible_default`: String specifying the default value for missing fields.
-  #   Implicitly enables flexible mode if set.
-  #   (default: `nil`)
  # - `trim`: String specifying the trim mode
  #   ("all" or "headers" or "fields" or :all or :headers or :fields)
  #   (default: `nil`)
  # - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
  #   (default: false)
+  # - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character
  sig do
    params(
      input: T.any(String, StringIO, IO),
@@ -35,7 +33,6 @@ module OSV
      buffer_size: T.nilable(Integer),
      result_type: T.nilable(T.any(String, Symbol)),
      flexible: T.nilable(T::Boolean),
-      flexible_default: T.nilable(String),
      ignore_null_bytes: T.nilable(T::Boolean),
      trim: T.nilable(T.any(String, Symbol)),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
@@ -50,9 +47,9 @@ module OSV
    buffer_size: nil,
    result_type: nil,
    flexible: nil,
-    flexible_default: nil,
    ignore_null_bytes: nil,
    trim: nil,
+    lossy: nil,
    &blk
  )
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: osv
 version: !ruby/object:Gem::Version
-  version: 0.3.22
+  version: 0.4.1
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-30 00:00:00.000000000 Z
+date: 2025-01-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys