osv 0.3.21 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dba77f8e6d2a8d19d969871594f1c2b18557125d3eee4b5c2759a0398d21da01
4
- data.tar.gz: da35996064fd954c99e6241c045444ec5dc1dc52204db956291401d0c3dfe805
3
+ metadata.gz: 51e4a387f1ed43bddc9f1f7a118637953d04239b5324ef131b9c860577ed4d41
4
+ data.tar.gz: e42928a09656216bbadcc2458953a8c5f28401ddf27095fc05038e0960471854
5
5
  SHA512:
6
- metadata.gz: 26398c526e829d7a6c3f943e175d10f8f626a42cec81da9bb65b4fc30dd29078629d29602f4fe3461fbb2cc86ca6d1b91f7f5f6ba2e7de362c2d44f5a79fcf5a
7
- data.tar.gz: 39cd4dd78ed38559302c9735b3514648fbc668b415be65870b4deecdea8b6fb41dc7053d347ea5c8fe8004cc653b933ec201eed98582d12cbb26d84302396372
6
+ metadata.gz: 4100c50a629ba5803db883532cfbe547eb3091e421b0876595d91791d8952a7b0169477c9c6f31063eafa5b91d0a9b1a9f0a5ae016d70cdd101e284beebfaf22
7
+ data.tar.gz: 90a822c644fcb37dc1892ede85a54395bc9e62a4b0b0a1af838182d390702d0ee4253151faafcedbf734b0a381fe2acf5c1ab23b842059fbdd4d51570fe33e58
data/README.md CHANGED
@@ -84,11 +84,10 @@ OSV.for_each("data.csv",
84
84
 
85
85
  # Parsing behavior
86
86
  flexible: false, # Allow varying number of fields (default: false)
87
- flexible_default: nil, # Default value for missing fields. If unset, we ignore missing fields.
88
- # Implicitly enables flexible mode if set.
89
87
  trim: :all, # Whether to trim whitespace. Options are :all, :headers, or :fields (default: nil)
90
88
  buffer_size: 1024, # Number of rows to buffer in memory (default: 1024)
91
89
  ignore_null_bytes: false, # Boolean specifying if null bytes should be ignored (default: false)
90
+ lossy: false, # Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
92
91
  )
93
92
  ```
94
93
 
@@ -103,9 +102,9 @@ OSV.for_each("data.csv",
103
102
  - `buffer_size`: Integer specifying the number of rows to buffer in memory (default: 1024)
104
103
  - `result_type`: String specifying the output format ("hash" or "array" or :hash or :array)
105
104
  - `flexible`: Boolean specifying if the parser should be flexible (default: false)
106
- - `flexible_default`: String specifying the default value for missing fields. Implicitly enables flexible mode if set. (default: `nil`)
107
105
  - `trim`: String specifying the trim mode ("all" or "headers" or "fields" or :all or :headers or :fields)
108
106
  - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored (default: false)
107
+ - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character (default: false)
109
108
 
110
109
  When `has_headers` is false, hash keys will be generated as `"c0"`, `"c1"`, etc.
111
110
 
@@ -34,6 +34,10 @@ pub enum ReaderError {
34
34
  InvalidFlexibleDefault(String),
35
35
  #[error("Invalid null string value: {0}")]
36
36
  InvalidNullString(String),
37
+ #[error("Failed to parse CSV record: {0}")]
38
+ CsvParse(#[from] csv::Error),
39
+ #[error("Invalid UTF-8: {0}")]
40
+ InvalidUtf8(String),
37
41
  #[error("Ruby error: {0}")]
38
42
  Ruby(String),
39
43
  }
@@ -46,10 +50,20 @@ impl From<MagnusError> for ReaderError {
46
50
 
47
51
  impl From<ReaderError> for MagnusError {
48
52
  fn from(err: ReaderError) -> Self {
49
- MagnusError::new(
50
- Ruby::get().unwrap().exception_runtime_error(),
51
- err.to_string(),
52
- )
53
+ let ruby = Ruby::get().unwrap();
54
+ match err {
55
+ ReaderError::CsvParse(csv_err) => {
56
+ if csv_err.to_string().contains("invalid utf-8") {
57
+ MagnusError::new(ruby.exception_encoding_error(), csv_err.to_string())
58
+ } else {
59
+ MagnusError::new(ruby.exception_runtime_error(), csv_err.to_string())
60
+ }
61
+ }
62
+ ReaderError::InvalidUtf8(utf8_err) => {
63
+ MagnusError::new(ruby.exception_encoding_error(), utf8_err.to_string())
64
+ }
65
+ _ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()),
66
+ }
53
67
  }
54
68
  }
55
69
 
@@ -65,9 +79,9 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
65
79
  quote_char: u8,
66
80
  null_string: Option<String>,
67
81
  flexible: bool,
68
- flexible_default: Option<String>,
69
82
  trim: csv::Trim,
70
83
  ignore_null_bytes: bool,
84
+ lossy: bool,
71
85
  _phantom: PhantomData<T>,
72
86
  _phantom_a: PhantomData<&'a ()>,
73
87
  }
@@ -83,9 +97,9 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
83
97
  quote_char: b'"',
84
98
  null_string: None,
85
99
  flexible: false,
86
- flexible_default: None,
87
100
  trim: csv::Trim::None,
88
101
  ignore_null_bytes: false,
102
+ lossy: false,
89
103
  _phantom: PhantomData,
90
104
  _phantom_a: PhantomData,
91
105
  }
@@ -126,13 +140,6 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
126
140
  self
127
141
  }
128
142
 
129
- /// Sets the default value for missing fields when in flexible mode.
130
- #[must_use]
131
- pub fn flexible_default(mut self, flexible_default: Option<String>) -> Self {
132
- self.flexible_default = flexible_default;
133
- self
134
- }
135
-
136
143
  /// Sets the trimming mode for fields.
137
144
  #[must_use]
138
145
  pub fn trim(mut self, trim: csv::Trim) -> Self {
@@ -146,6 +153,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
146
153
  self
147
154
  }
148
155
 
156
+ #[must_use]
157
+ pub fn lossy(mut self, lossy: bool) -> Self {
158
+ self.lossy = lossy;
159
+ self
160
+ }
161
+
149
162
  /// Handles reading from a file descriptor.
150
163
  fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
151
164
  let raw_value = self.to_read.as_raw();
@@ -188,7 +201,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
188
201
  build_ruby_reader(&self.ruby, self.to_read)?
189
202
  };
190
203
 
191
- let flexible = self.flexible || self.flexible_default.is_some();
204
+ let flexible = self.flexible;
192
205
  let reader = BufReader::with_capacity(READ_BUFFER_SIZE, readable);
193
206
 
194
207
  let mut reader = csv::ReaderBuilder::new()
@@ -199,24 +212,13 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
199
212
  .trim(self.trim)
200
213
  .from_reader(reader);
201
214
 
202
- let mut headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
215
+ let mut headers =
216
+ RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
203
217
  if self.ignore_null_bytes {
204
218
  headers = headers.iter().map(|h| h.replace("\0", "")).collect();
205
219
  }
206
220
  let static_headers = StringCache::intern_many(&headers)?;
207
221
 
208
- // We intern both of these to get static string references we can reuse throughout the parser.
209
- let flexible_default = self
210
- .flexible_default
211
- .map(|s| {
212
- RString::new(&s)
213
- .to_interned_str()
214
- .as_str()
215
- .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
216
- })
217
- .transpose()?
218
- .map(Cow::Borrowed);
219
-
220
222
  let null_string = self
221
223
  .null_string
222
224
  .map(|s| {
@@ -232,8 +234,8 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
232
234
  reader,
233
235
  static_headers,
234
236
  null_string,
235
- flexible_default,
236
237
  self.ignore_null_bytes,
238
+ self.lossy,
237
239
  ))
238
240
  }
239
241
  }
@@ -5,14 +5,18 @@ use std::hash::BuildHasher;
5
5
  use super::header_cache::StringCacheKey;
6
6
  use super::CowStr;
7
7
 
8
+ pub enum CsvRecordType {
9
+ String(csv::StringRecord),
10
+ Byte(csv::ByteRecord),
11
+ }
12
+
8
13
  pub trait RecordParser<'a> {
9
14
  type Output;
10
15
 
11
16
  fn parse(
12
17
  headers: &[StringCacheKey],
13
- record: &csv::StringRecord,
18
+ record: &CsvRecordType,
14
19
  null_string: Option<Cow<'a, str>>,
15
- flexible_default: Option<Cow<'a, str>>,
16
20
  ignore_null_bytes: bool,
17
21
  ) -> Self::Output;
18
22
  }
@@ -25,31 +29,42 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
25
29
  #[inline]
26
30
  fn parse(
27
31
  headers: &[StringCacheKey],
28
- record: &csv::StringRecord,
32
+ record: &CsvRecordType,
29
33
  null_string: Option<Cow<'a, str>>,
30
- flexible_default: Option<Cow<'a, str>>,
31
34
  ignore_null_bytes: bool,
32
35
  ) -> Self::Output {
33
36
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
34
37
 
35
38
  let shared_empty = Cow::Borrowed("");
36
- let shared_default = flexible_default.map(CowStr);
39
+
37
40
  headers.iter().enumerate().for_each(|(i, header)| {
38
- let value = record.get(i).map_or_else(
39
- || shared_default.clone(),
40
- |field| {
41
- if null_string.as_deref() == Some(field) {
41
+ let value = match record {
42
+ CsvRecordType::String(s) => s.get(i).and_then(|field| {
43
+ if null_string.as_deref() == Some(field.as_ref()) {
42
44
  None
43
45
  } else if field.is_empty() {
44
46
  Some(CowStr(shared_empty.clone()))
45
- } else if ignore_null_bytes {
47
+ } else if ignore_null_bytes {
46
48
  Some(CowStr(Cow::Owned(field.replace("\0", ""))))
49
+ } else {
50
+ Some(CowStr(Cow::Owned(field.to_string())))
47
51
  }
48
- else {
52
+ }),
53
+
54
+ CsvRecordType::Byte(b) => b.get(i).and_then(|field| {
55
+ let field = String::from_utf8_lossy(field);
56
+ if null_string.as_deref() == Some(field.as_ref()) {
57
+ None
58
+ } else if field.is_empty() {
59
+ Some(CowStr(shared_empty.clone()))
60
+ } else if ignore_null_bytes {
61
+ Some(CowStr(Cow::Owned(field.replace("\0", ""))))
62
+ } else {
49
63
  Some(CowStr(Cow::Owned(field.to_string())))
50
64
  }
51
- },
52
- );
65
+ }),
66
+ };
67
+
53
68
  map.insert(*header, value);
54
69
  });
55
70
  map
@@ -62,36 +77,47 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
62
77
  #[inline]
63
78
  fn parse(
64
79
  headers: &[StringCacheKey],
65
- record: &csv::StringRecord,
80
+ record: &CsvRecordType,
66
81
  null_string: Option<Cow<'a, str>>,
67
- flexible_default: Option<Cow<'a, str>>,
68
82
  ignore_null_bytes: bool,
69
83
  ) -> Self::Output {
70
84
  let target_len = headers.len();
71
85
  let mut vec = Vec::with_capacity(target_len);
72
86
 
73
87
  let shared_empty = Cow::Borrowed("");
74
- let shared_default = flexible_default.map(CowStr);
75
88
 
76
- for field in record.iter() {
77
- let value = if Some(field) == null_string.as_deref() {
78
- None
79
- } else if field.is_empty() {
80
- Some(CowStr(shared_empty.clone()))
81
- } else if ignore_null_bytes {
82
- Some(CowStr(Cow::Owned(field.replace("\0", ""))))
89
+ match record {
90
+ CsvRecordType::String(record) => {
91
+ for field in record.iter() {
92
+ let value = if Some(field.as_ref()) == null_string.as_deref() {
93
+ None
94
+ } else if field.is_empty() {
95
+ Some(CowStr(shared_empty.clone()))
96
+ } else if ignore_null_bytes {
97
+ Some(CowStr(Cow::Owned(field.replace("\0", ""))))
98
+ } else {
99
+ Some(CowStr(Cow::Owned(field.to_string())))
100
+ };
101
+ vec.push(value);
102
+ }
83
103
  }
84
- else {
85
- Some(CowStr(Cow::Owned(field.to_string())))
86
- };
87
- vec.push(value);
88
- }
89
-
90
- if vec.len() < target_len {
91
- if let Some(default) = shared_default {
92
- vec.resize_with(target_len, || Some(default.clone()));
104
+ CsvRecordType::Byte(record) => {
105
+ for field in record.iter() {
106
+ let field = String::from_utf8_lossy(field);
107
+ let value = if Some(field.as_ref()) == null_string.as_deref() {
108
+ None
109
+ } else if field.is_empty() {
110
+ Some(CowStr(shared_empty.clone()))
111
+ } else if ignore_null_bytes {
112
+ Some(CowStr(Cow::Owned(field.replace("\0", ""))))
113
+ } else {
114
+ Some(CowStr(Cow::Owned(field.to_string())))
115
+ };
116
+ vec.push(value);
117
+ }
93
118
  }
94
119
  }
120
+
95
121
  vec
96
122
  }
97
123
  }
@@ -1,5 +1,6 @@
1
+ use super::builder::ReaderError;
1
2
  use super::header_cache::StringCacheKey;
2
- use super::parser::RecordParser;
3
+ use super::parser::{CsvRecordType, RecordParser};
3
4
  use super::{header_cache::StringCache, ruby_reader::SeekableRead};
4
5
  use magnus::{Error, Ruby};
5
6
  use std::borrow::Cow;
@@ -15,8 +16,7 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
15
16
  reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
16
17
  headers: Vec<StringCacheKey>,
17
18
  null_string: Option<Cow<'a, str>>,
18
- flexible_default: Option<Cow<'a, str>>,
19
- string_record: csv::StringRecord,
19
+ string_record: CsvRecordType,
20
20
  parser: std::marker::PhantomData<T>,
21
21
  ignore_null_bytes: bool,
22
22
  }
@@ -56,44 +56,59 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
56
56
  reader: csv::Reader<BufReader<Box<dyn SeekableRead>>>,
57
57
  headers: Vec<StringCacheKey>,
58
58
  null_string: Option<Cow<'a, str>>,
59
- flexible_default: Option<Cow<'a, str>>,
60
59
  ignore_null_bytes: bool,
60
+ lossy: bool,
61
61
  ) -> Self {
62
62
  let headers_len = headers.len();
63
63
  Self {
64
64
  reader,
65
65
  headers,
66
66
  null_string,
67
- flexible_default,
68
- string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
67
+ string_record: if lossy {
68
+ CsvRecordType::Byte(csv::ByteRecord::with_capacity(
69
+ READ_BUFFER_SIZE,
70
+ headers_len,
71
+ ))
72
+ } else {
73
+ CsvRecordType::String(csv::StringRecord::with_capacity(
74
+ READ_BUFFER_SIZE,
75
+ headers_len,
76
+ ))
77
+ },
69
78
  parser: std::marker::PhantomData,
70
79
  ignore_null_bytes,
71
80
  }
72
81
  }
73
82
 
74
83
  /// Attempts to read the next record, returning any errors encountered.
75
- fn try_next(&mut self) -> csv::Result<Option<T::Output>> {
76
- match self.reader.read_record(&mut self.string_record)? {
77
- true => Ok(Some(T::parse(
84
+ fn try_next(&mut self) -> Result<Option<T::Output>, ReaderError> {
85
+ let record = match self.string_record {
86
+ CsvRecordType::String(ref mut record) => self.reader.read_record(record),
87
+ CsvRecordType::Byte(ref mut record) => self.reader.read_byte_record(record),
88
+ }?;
89
+ if record {
90
+ Ok(Some(T::parse(
78
91
  &self.headers,
79
92
  &self.string_record,
80
93
  self.null_string.clone(),
81
- self.flexible_default.clone(),
82
- self.ignore_null_bytes
83
- ))),
84
- false => Ok(None),
94
+ self.ignore_null_bytes,
95
+ )))
96
+ } else {
97
+ Ok(None)
85
98
  }
86
99
  }
87
100
  }
88
101
 
89
102
  impl<'a, T: RecordParser<'a>> Iterator for RecordReader<'a, T> {
90
- type Item = T::Output;
103
+ type Item = Result<T::Output, ReaderError>;
91
104
 
92
105
  #[inline]
93
106
  fn next(&mut self) -> Option<Self::Item> {
94
- // Note: We intentionally swallow errors here to maintain Iterator contract.
95
- // Errors can be handled by using try_next() directly if needed.
96
- self.try_next().ok().flatten()
107
+ match self.try_next() {
108
+ Ok(Some(record)) => Some(Ok(record)),
109
+ Ok(None) => None,
110
+ Err(e) => Some(Err(e)),
111
+ }
97
112
  }
98
113
 
99
114
  #[inline]
@@ -1,4 +1,5 @@
1
1
  use magnus::{
2
+ error::Error as MagnusError,
2
3
  value::{Opaque, ReprValue},
3
4
  RClass, RString, Ruby, Value,
4
5
  };
@@ -6,7 +7,7 @@ use std::fs::File;
6
7
  use std::io::{self, BufReader, Read, Seek, SeekFrom, Write};
7
8
  use std::sync::OnceLock;
8
9
 
9
- use super::ForgottenFileHandle;
10
+ use super::{builder::ReaderError, ForgottenFileHandle};
10
11
 
11
12
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
12
13
 
@@ -25,10 +26,7 @@ impl<T: Read + Seek> SeekableRead for BufReader<T> {}
25
26
  impl SeekableRead for std::io::Cursor<Vec<u8>> {}
26
27
  impl SeekableRead for ForgottenFileHandle {}
27
28
 
28
- pub fn build_ruby_reader(
29
- ruby: &Ruby,
30
- input: Value,
31
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
29
+ pub fn build_ruby_reader(ruby: &Ruby, input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
32
30
  if RubyReader::is_string_io(ruby, &input) {
33
31
  RubyReader::from_string_io(ruby, input)
34
32
  } else if RubyReader::is_io_like(&input) {
@@ -88,14 +86,14 @@ impl Seek for RubyReader<RString> {
88
86
  }
89
87
 
90
88
  impl RubyReader<Value> {
91
- fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
89
+ fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
92
90
  if Self::is_io_like(&input) {
93
91
  Ok(Box::new(Self::from_io_like(input)))
94
92
  } else {
95
- Err(magnus::Error::new(
93
+ Err(MagnusError::new(
96
94
  magnus::exception::type_error(),
97
95
  "Input is not an IO-like object",
98
- ))
96
+ ))?
99
97
  }
100
98
  }
101
99
 
@@ -112,15 +110,12 @@ impl RubyReader<Value> {
112
110
  }
113
111
 
114
112
  impl RubyReader<RString> {
115
- pub fn from_string_io(
116
- ruby: &Ruby,
117
- input: Value,
118
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
113
+ pub fn from_string_io(ruby: &Ruby, input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
119
114
  if !Self::is_string_io(ruby, &input) {
120
- return Err(magnus::Error::new(
115
+ return Err(MagnusError::new(
121
116
  magnus::exception::type_error(),
122
117
  "Input is not a StringIO",
123
- ));
118
+ ))?;
124
119
  }
125
120
 
126
121
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
@@ -138,11 +133,11 @@ impl RubyReader<RString> {
138
133
  input.is_kind_of(ruby.get_inner(*string_io_class))
139
134
  }
140
135
 
141
- fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
142
- // Try calling `to_str`, and if that fails, try `to_s`
136
+ fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, ReaderError> {
143
137
  let string_content = input
144
138
  .funcall::<_, _, RString>("to_str", ())
145
139
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
140
+
146
141
  Ok(Box::new(Self {
147
142
  inner: string_content,
148
143
  offset: 0,
@@ -154,12 +149,16 @@ impl Read for RubyReader<Value> {
154
149
  fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
155
150
  let bytes = self
156
151
  .inner
157
- .funcall::<_, _, RString>("read", (buf.len(),))
152
+ .funcall::<_, _, Option<RString>>("read", (buf.len(),))
158
153
  .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
159
154
 
160
- buf.write_all(unsafe { bytes.as_slice() })?;
161
-
162
- Ok(bytes.len())
155
+ match bytes {
156
+ Some(bytes) => {
157
+ buf.write_all(unsafe { bytes.as_slice() })?;
158
+ Ok(bytes.len())
159
+ }
160
+ None => Ok(0), // EOF
161
+ }
163
162
  }
164
163
  }
165
164
 
@@ -3,7 +3,7 @@ use crate::utils::*;
3
3
  use ahash::RandomState;
4
4
  use csv::Trim;
5
5
  use magnus::value::ReprValue;
6
- use magnus::{block::Yield, Error, KwArgs, RHash, Ruby, Symbol, Value};
6
+ use magnus::{Error, IntoValue, KwArgs, RHash, Ruby, Symbol, Value};
7
7
  use std::collections::HashMap;
8
8
 
9
9
  /// Valid result types for CSV parsing
@@ -34,9 +34,9 @@ struct EnumeratorArgs {
34
34
  null_string: Option<String>,
35
35
  result_type: String,
36
36
  flexible: bool,
37
- flexible_default: Option<String>,
38
37
  trim: Option<String>,
39
38
  ignore_null_bytes: bool,
39
+ lossy: bool,
40
40
  }
41
41
 
42
42
  /// Parses a CSV file with the given configuration.
@@ -44,10 +44,7 @@ struct EnumeratorArgs {
44
44
  /// # Safety
45
45
  /// This function uses unsafe code to get the Ruby runtime and leak memory for static references.
46
46
  /// This is necessary for Ruby integration but should be used with caution.
47
- pub fn parse_csv(
48
- rb_self: Value,
49
- args: &[Value],
50
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
47
+ pub fn parse_csv(rb_self: Value, args: &[Value]) -> Result<Value, Error> {
51
48
  // SAFETY: We're in a Ruby callback, so Ruby runtime is guaranteed to be initialized
52
49
  let ruby = unsafe { Ruby::get_unchecked() };
53
50
 
@@ -59,9 +56,9 @@ pub fn parse_csv(
59
56
  null_string,
60
57
  result_type,
61
58
  flexible,
62
- flexible_default,
63
59
  trim,
64
60
  ignore_null_bytes,
61
+ lossy,
65
62
  } = parse_read_csv_args(&ruby, args)?;
66
63
 
67
64
  if !ruby.block_given() {
@@ -74,7 +71,6 @@ pub fn parse_csv(
74
71
  null_string,
75
72
  result_type,
76
73
  flexible,
77
- flexible_default,
78
74
  trim: match trim {
79
75
  Trim::All => Some("all".to_string()),
80
76
  Trim::Headers => Some("headers".to_string()),
@@ -82,7 +78,9 @@ pub fn parse_csv(
82
78
  _ => None,
83
79
  },
84
80
  ignore_null_bytes,
85
- });
81
+ lossy,
82
+ })
83
+ .map(|yield_enum| yield_enum.into_value_with(&ruby));
86
84
  }
87
85
 
88
86
  let result_type = ResultType::from_str(&result_type).ok_or_else(|| {
@@ -92,46 +90,53 @@ pub fn parse_csv(
92
90
  )
93
91
  })?;
94
92
 
95
- let iter: Box<dyn Iterator<Item = CsvRecord<RandomState>>> = match result_type {
93
+ match result_type {
96
94
  ResultType::Hash => {
97
95
  let builder = RecordReaderBuilder::<
98
96
  HashMap<StringCacheKey, Option<CowStr<'static>>, RandomState>,
99
97
  >::new(ruby, to_read)
100
98
  .has_headers(has_headers)
101
99
  .flexible(flexible)
102
- .flexible_default(flexible_default)
103
100
  .trim(trim)
104
101
  .delimiter(delimiter)
105
102
  .quote_char(quote_char)
106
103
  .null_string(null_string)
107
104
  .ignore_null_bytes(ignore_null_bytes)
105
+ .lossy(lossy)
108
106
  .build()?;
109
107
 
110
- Box::new(builder.map(CsvRecord::Map))
108
+ let ruby = unsafe { Ruby::get_unchecked() };
109
+ for result in builder {
110
+ let record = result?;
111
+ let _: Value = ruby.yield_value(CsvRecord::Map(record))?;
112
+ }
111
113
  }
112
114
  ResultType::Array => {
113
115
  let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
114
116
  .has_headers(has_headers)
115
117
  .flexible(flexible)
116
- .flexible_default(flexible_default)
117
118
  .trim(trim)
118
119
  .delimiter(delimiter)
119
120
  .quote_char(quote_char)
120
121
  .null_string(null_string)
121
122
  .ignore_null_bytes(ignore_null_bytes)
123
+ .lossy(lossy)
122
124
  .build()?;
123
125
 
124
- Box::new(builder.map(CsvRecord::Vec))
126
+ let ruby = unsafe { Ruby::get_unchecked() };
127
+ for result in builder {
128
+ let record = result?;
129
+ let _: Value = ruby.yield_value(CsvRecord::<ahash::RandomState>::Vec(record))?;
130
+ }
125
131
  }
126
- };
132
+ }
127
133
 
128
- Ok(Yield::Iter(iter))
134
+ let ruby = unsafe { Ruby::get_unchecked() };
135
+ Ok(ruby.qnil().into_value_with(&ruby))
129
136
  }
130
137
 
131
138
  /// Creates an enumerator for lazy CSV parsing
132
- fn create_enumerator(
133
- args: EnumeratorArgs,
134
- ) -> Result<Yield<Box<dyn Iterator<Item = CsvRecord<'static, RandomState>>>>, Error> {
139
+ fn create_enumerator(args: EnumeratorArgs) -> Result<magnus::Enumerator, Error> {
135
140
  let kwargs = RHash::new();
136
141
  kwargs.aset(Symbol::new("has_headers"), args.has_headers)?;
137
142
  kwargs.aset(
@@ -145,14 +150,10 @@ fn create_enumerator(
145
150
  kwargs.aset(Symbol::new("nil_string"), args.null_string)?;
146
151
  kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
147
152
  kwargs.aset(Symbol::new("flexible"), args.flexible)?;
148
- kwargs.aset(Symbol::new("flexible_default"), args.flexible_default)?;
149
153
  kwargs.aset(Symbol::new("trim"), args.trim.map(Symbol::new))?;
150
-
151
154
  kwargs.aset(Symbol::new("ignore_null_bytes"), args.ignore_null_bytes)?;
152
-
153
-
154
- let enumerator = args
155
+ kwargs.aset(Symbol::new("lossy"), args.lossy)?;
156
+ Ok(args
155
157
  .rb_self
156
- .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
157
- Ok(Yield::Enumerator(enumerator))
158
+ .enumeratorize("for_each", (args.to_read, KwArgs(kwargs))))
158
159
  }
data/ext/osv/src/utils.rs CHANGED
@@ -34,9 +34,9 @@ pub struct ReadCsvArgs {
34
34
  pub null_string: Option<String>,
35
35
  pub result_type: String,
36
36
  pub flexible: bool,
37
- pub flexible_default: Option<String>,
38
37
  pub trim: csv::Trim,
39
38
  pub ignore_null_bytes: bool,
39
+ pub lossy: bool,
40
40
  }
41
41
 
42
42
  /// Parse common arguments for CSV parsing
@@ -54,9 +54,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
54
54
  Option<Option<String>>,
55
55
  Option<Option<Value>>,
56
56
  Option<Option<bool>>,
57
- Option<Option<Option<String>>>,
58
57
  Option<Option<Value>>,
59
58
  Option<Option<bool>>,
59
+ Option<Option<bool>>,
60
60
  ),
61
61
  (),
62
62
  >(
@@ -69,9 +69,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
69
69
  "nil_string",
70
70
  "result_type",
71
71
  "flexible",
72
- "flexible_default",
73
72
  "trim",
74
73
  "ignore_null_bytes",
74
+ "lossy",
75
75
  ],
76
76
  )?;
77
77
 
@@ -134,11 +134,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
134
134
 
135
135
  let flexible = kwargs.optional.5.flatten().unwrap_or_default();
136
136
 
137
- let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
138
-
139
137
  let trim = match kwargs
140
138
  .optional
141
- .7
139
+ .6
142
140
  .flatten()
143
141
  .map(|value| parse_string_or_symbol(ruby, value))
144
142
  {
@@ -166,7 +164,9 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
166
164
  None => csv::Trim::None,
167
165
  };
168
166
 
169
- let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default();
167
+ let ignore_null_bytes = kwargs.optional.7.flatten().unwrap_or_default();
168
+
169
+ let lossy = kwargs.optional.8.flatten().unwrap_or_default();
170
170
 
171
171
  Ok(ReadCsvArgs {
172
172
  to_read,
@@ -176,8 +176,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
176
176
  null_string,
177
177
  result_type,
178
178
  flexible,
179
- flexible_default,
180
179
  trim,
181
180
  ignore_null_bytes,
181
+ lossy,
182
182
  })
183
183
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.21"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -17,14 +17,12 @@ module OSV
17
17
  # ("hash" or "array" or :hash or :array)
18
18
  # - `flexible`: Boolean specifying if the parser should be flexible
19
19
  # (default: false)
20
- # - `flexible_default`: String specifying the default value for missing fields.
21
- # Implicitly enables flexible mode if set.
22
- # (default: `nil`)
23
20
  # - `trim`: String specifying the trim mode
24
21
  # ("all" or "headers" or "fields" or :all or :headers or :fields)
25
22
  # (default: `nil`)
26
23
  # - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
27
24
  # (default: false)
25
+ # - `lossy`: Boolean specifying if invalid UTF-8 characters should be replaced with a replacement character
28
26
  sig do
29
27
  params(
30
28
  input: T.any(String, StringIO, IO),
@@ -35,7 +33,6 @@ module OSV
35
33
  buffer_size: T.nilable(Integer),
36
34
  result_type: T.nilable(T.any(String, Symbol)),
37
35
  flexible: T.nilable(T::Boolean),
38
- flexible_default: T.nilable(String),
39
36
  ignore_null_bytes: T.nilable(T::Boolean),
40
37
  trim: T.nilable(T.any(String, Symbol)),
41
38
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
@@ -50,9 +47,9 @@ module OSV
50
47
  buffer_size: nil,
51
48
  result_type: nil,
52
49
  flexible: nil,
53
- flexible_default: nil,
54
50
  ignore_null_bytes: nil,
55
51
  trim: nil,
52
+ lossy: nil,
56
53
  &blk
57
54
  )
58
55
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.21
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-24 00:00:00.000000000 Z
11
+ date: 2025-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys