osv 0.3.17 → 0.3.19

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4469c67b2a39d9ffa23923e36cd894eac415ca004a432e700102a334af11efd8
4
- data.tar.gz: 8dee3117fe6511b9c5b6005ae37d991891e0f314508986743b659080c7885855
3
+ metadata.gz: 87b1080bdb84d6db09a077d9eacc6c4792cfe93851671e4ba0438fe27c3d3218
4
+ data.tar.gz: f1dc704d251d906e9d6011c288c37844af8ccc976eb90611c629b7bc9cc9353e
5
5
  SHA512:
6
- metadata.gz: d8c94dc1c576cca0043c7501752bdd6dee0c8bf0523d9c99a0e8ab4d614a0eb4e6f087fa62be97bb5816f9998f2c414758ffcab260e90889afada8379fb03aec
7
- data.tar.gz: c51ece65a713af0b351a183415816302fcdc35ad598d0e5ee9e5b693c1ef66826c5dfc3dab90f04499c90e994e590e6dd7121999b5dfe54ce20997e41df0ac02
6
+ metadata.gz: aa7a083aebc9f480528de32fea2890189857c2381811f1cd90a8115724c75640a6bd11424e186492ace41a5fed28ee74dd17172f983801b48b31b81a750e24e9
7
+ data.tar.gz: 214c20e5011660e2250364ca79bd91a6c812053b401e9a1266abd500cf1e9e7c045f02054144f46190e45b459480aae70bbfab4d19ab246d1e1f5a8556f616a1
data/README.md CHANGED
@@ -142,34 +142,34 @@ OSV - Gzipped Direct 1.000 i/100ms
142
142
  FastCSV - Gzipped 1.000 i/100ms
143
143
  CSV - Gzipped 1.000 i/100ms
144
144
  Calculating -------------------------------------
145
- CSV - StringIO 0.080 (± 0.0%) i/s (12.43 s/i) - 3.000 in 37.301114s
146
- FastCSV - StringIO 0.368 (± 0.0%) i/s (2.72 s/i) - 12.000 in 32.619020s
147
- OSV - StringIO 0.699 (± 0.0%) i/s (1.43 s/i) - 21.000 in 30.091225s
148
- CSV - Hash output 0.059 (± 0.0%) i/s (16.95 s/i) - 2.000 in 33.908533s
149
- OSV - Hash output 0.329 (± 0.0%) i/s (3.04 s/i) - 10.000 in 30.551275s
150
- CSV - Array output 0.066 (± 0.0%) i/s (15.18 s/i) - 2.000 in 30.357327s
151
- OSV - Array output 0.632 (± 0.0%) i/s (1.58 s/i) - 19.000 in 30.150113s
145
+ CSV - StringIO 0.083 (± 0.0%) i/s (12.06 s/i) - 3.000 in 36.304469s
146
+ FastCSV - StringIO 0.335 (± 0.0%) i/s (2.98 s/i) - 10.000 in 31.019521s
147
+ OSV - StringIO 0.705 (± 0.0%) i/s (1.42 s/i) - 21.000 in 30.629511s
148
+ CSV - Hash output 0.060 (± 0.0%) i/s (16.74 s/i) - 2.000 in 33.475977s
149
+ OSV - Hash output 0.434 (± 0.0%) i/s (2.30 s/i) - 13.000 in 30.071679s
150
+ CSV - Array output 0.063 (± 0.0%) i/s (15.88 s/i) - 2.000 in 32.229906s
151
+ OSV - Array output 0.406 (± 0.0%) i/s (2.47 s/i) - 12.000 in 31.072600s
152
152
  FastCSV - Array output
153
- 0.350 (± 0.0%) i/s (2.86 s/i) - 11.000 in 31.477268s
153
+ 0.321 (± 0.0%) i/s (3.11 s/i) - 10.000 in 31.458966s
154
154
  OSV - Direct Open Array output
155
- 0.641 (± 0.0%) i/s (1.56 s/i) - 20.000 in 31.275201s
156
- OSV - Gzipped 0.530 (± 0.0%) i/s (1.89 s/i) - 16.000 in 30.183753s
157
- OSV - Gzipped Direct 0.727 (± 0.0%) i/s (1.37 s/i) - 22.000 in 30.283991s
158
- FastCSV - Gzipped 0.323 (± 0.0%) i/s (3.09 s/i) - 10.000 in 30.949600s
159
- CSV - Gzipped 0.056 (± 0.0%) i/s (17.72 s/i) - 2.000 in 35.440473s
155
+ 0.686 (± 0.0%) i/s (1.46 s/i) - 21.000 in 30.639715s
156
+ OSV - Gzipped 0.524 (± 0.0%) i/s (1.91 s/i) - 16.000 in 30.695259s
157
+ OSV - Gzipped Direct 0.519 (± 0.0%) i/s (1.93 s/i) - 16.000 in 30.830005s
158
+ FastCSV - Gzipped 0.313 (± 0.0%) i/s (3.20 s/i) - 10.000 in 32.031002s
159
+ CSV - Gzipped 0.057 (± 0.0%) i/s (17.55 s/i) - 2.000 in 35.107808s
160
160
 
161
161
  Comparison:
162
- OSV - Gzipped Direct: 0.7 i/s
163
- OSV - StringIO: 0.7 i/s - 1.04x slower
164
- OSV - Direct Open Array output: 0.6 i/s - 1.14x slower
165
- OSV - Array output: 0.6 i/s - 1.15x slower
166
- OSV - Gzipped: 0.5 i/s - 1.37x slower
167
- FastCSV - StringIO: 0.4 i/s - 1.98x slower
168
- FastCSV - Array output: 0.3 i/s - 2.08x slower
169
- OSV - Hash output: 0.3 i/s - 2.21x slower
170
- FastCSV - Gzipped: 0.3 i/s - 2.25x slower
171
- CSV - StringIO: 0.1 i/s - 9.04x slower
172
- CSV - Array output: 0.1 i/s - 11.04x slower
173
- CSV - Hash output: 0.1 i/s - 12.33x slower
174
- CSV - Gzipped: 0.1 i/s - 12.89x slower
162
+ OSV - StringIO : 0.7 i/s
163
+ OSV - Direct Open Array output: 0.7 i/s - 1.03x slower
164
+ OSV - Gzipped : 0.5 i/s - 1.34x slower
165
+ OSV - Gzipped Direct : 0.5 i/s - 1.36x slower
166
+ OSV - Hash output : 0.4 i/s - 1.62x slower
167
+ OSV - Array output : 0.4 i/s - 1.74x slower
168
+ FastCSV - StringIO : 0.3 i/s - 2.10x slower
169
+ FastCSV - Array output : 0.3 i/s - 2.20x slower
170
+ FastCSV - Gzipped : 0.3 i/s - 2.26x slower
171
+ CSV - StringIO : 0.1 i/s - 8.50x slower
172
+ CSV - Array output : 0.1 i/s - 11.20x slower
173
+ CSV - Hash output : 0.1 i/s - 11.80x slower
174
+ CSV - Gzipped : 0.1 i/s - 12.37x slower
175
175
  ```
@@ -67,6 +67,7 @@ pub struct RecordReaderBuilder<'a, T: RecordParser<'a>> {
67
67
  flexible: bool,
68
68
  flexible_default: Option<String>,
69
69
  trim: csv::Trim,
70
+ ignore_null_bytes: bool,
70
71
  _phantom: PhantomData<T>,
71
72
  _phantom_a: PhantomData<&'a ()>,
72
73
  }
@@ -84,6 +85,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
84
85
  flexible: false,
85
86
  flexible_default: None,
86
87
  trim: csv::Trim::None,
88
+ ignore_null_bytes: false,
87
89
  _phantom: PhantomData,
88
90
  _phantom_a: PhantomData,
89
91
  }
@@ -138,6 +140,12 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
138
140
  self
139
141
  }
140
142
 
143
+ #[must_use]
144
+ pub fn ignore_null_bytes(mut self, ignore_null_bytes: bool) -> Self {
145
+ self.ignore_null_bytes = ignore_null_bytes;
146
+ self
147
+ }
148
+
141
149
  /// Handles reading from a file descriptor.
142
150
  fn handle_file_descriptor(&self) -> Result<Box<dyn SeekableRead>, ReaderError> {
143
151
  let raw_value = self.to_read.as_raw();
@@ -191,7 +199,10 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
191
199
  .trim(self.trim)
192
200
  .from_reader(reader);
193
201
 
194
- let headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
202
+ let mut headers = RecordReader::<T>::get_headers(&self.ruby, &mut reader, self.has_headers)?;
203
+ if self.ignore_null_bytes {
204
+ headers = headers.iter().map(|h| h.replace("\0", "")).collect();
205
+ }
195
206
  let static_headers = StringCache::intern_many(&headers)?;
196
207
 
197
208
  // We intern both of these to get static string references we can reuse throughout the parser.
@@ -204,7 +215,7 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
204
215
  .map_err(|e| ReaderError::InvalidFlexibleDefault(format!("{:?}", e)))
205
216
  })
206
217
  .transpose()?
207
- .map(|s| Cow::Borrowed(s));
218
+ .map(Cow::Borrowed);
208
219
 
209
220
  let null_string = self
210
221
  .null_string
@@ -215,13 +226,14 @@ impl<'a, T: RecordParser<'a>> RecordReaderBuilder<'a, T> {
215
226
  .map_err(|e| ReaderError::InvalidNullString(format!("{:?}", e)))
216
227
  })
217
228
  .transpose()?
218
- .map(|s| Cow::Borrowed(s));
229
+ .map(Cow::Borrowed);
219
230
 
220
231
  Ok(RecordReader::new(
221
232
  reader,
222
233
  static_headers,
223
234
  null_string,
224
235
  flexible_default,
236
+ self.ignore_null_bytes,
225
237
  ))
226
238
  }
227
239
  }
@@ -99,7 +99,7 @@ impl StringCache {
99
99
  counter.fetch_add(1, Ordering::Relaxed);
100
100
  result.push(*interned_string);
101
101
  } else {
102
- let interned = StringCacheKey::new(&string);
102
+ let interned = StringCacheKey::new(string);
103
103
  let leaked = Box::leak(string.clone().into_boxed_str());
104
104
  cache.insert(leaked, (interned, AtomicU32::new(1)));
105
105
  result.push(interned);
@@ -154,7 +154,7 @@ impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
154
154
  impl<I> Drop for HeaderCacheCleanupIter<I> {
155
155
  fn drop(&mut self) {
156
156
  if let Some(headers) = self.headers.get() {
157
- StringCache::clear(&headers).unwrap();
157
+ StringCache::clear(headers).unwrap();
158
158
  }
159
159
  }
160
160
  }
@@ -13,6 +13,7 @@ pub trait RecordParser<'a> {
13
13
  record: &csv::StringRecord,
14
14
  null_string: Option<Cow<'a, str>>,
15
15
  flexible_default: Option<Cow<'a, str>>,
16
+ ignore_null_bytes: bool,
16
17
  ) -> Self::Output;
17
18
  }
18
19
 
@@ -27,12 +28,13 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
27
28
  record: &csv::StringRecord,
28
29
  null_string: Option<Cow<'a, str>>,
29
30
  flexible_default: Option<Cow<'a, str>>,
31
+ ignore_null_bytes: bool,
30
32
  ) -> Self::Output {
31
33
  let mut map = HashMap::with_capacity_and_hasher(headers.len(), S::default());
32
34
 
33
35
  let shared_empty = Cow::Borrowed("");
34
36
  let shared_default = flexible_default.map(CowStr);
35
- headers.iter().enumerate().for_each(|(i, ref header)| {
37
+ headers.iter().enumerate().for_each(|(i, header)| {
36
38
  let value = record.get(i).map_or_else(
37
39
  || shared_default.clone(),
38
40
  |field| {
@@ -40,12 +42,15 @@ impl<'a, S: BuildHasher + Default> RecordParser<'a>
40
42
  None
41
43
  } else if field.is_empty() {
42
44
  Some(CowStr(shared_empty.clone()))
43
- } else {
45
+ } else if ignore_null_bytes {
46
+ Some(CowStr(Cow::Owned(field.replace("\0", "").to_string())))
47
+ }
48
+ else {
44
49
  Some(CowStr(Cow::Owned(field.to_string())))
45
50
  }
46
51
  },
47
52
  );
48
- map.insert((*header).clone(), value);
53
+ map.insert(*header, value);
49
54
  });
50
55
  map
51
56
  }
@@ -60,6 +65,7 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
60
65
  record: &csv::StringRecord,
61
66
  null_string: Option<Cow<'a, str>>,
62
67
  flexible_default: Option<Cow<'a, str>>,
68
+ ignore_null_bytes: bool,
63
69
  ) -> Self::Output {
64
70
  let target_len = headers.len();
65
71
  let mut vec = Vec::with_capacity(target_len);
@@ -72,7 +78,10 @@ impl<'a> RecordParser<'a> for Vec<Option<CowStr<'a>>> {
72
78
  None
73
79
  } else if field.is_empty() {
74
80
  Some(CowStr(shared_empty.clone()))
75
- } else {
81
+ } else if ignore_null_bytes {
82
+ Some(CowStr(Cow::Owned(field.replace("\0", "").to_string())))
83
+ }
84
+ else {
76
85
  Some(CowStr(Cow::Owned(field.to_string())))
77
86
  };
78
87
  vec.push(value);
@@ -18,6 +18,7 @@ pub struct RecordReader<'a, T: RecordParser<'a>> {
18
18
  flexible_default: Option<Cow<'a, str>>,
19
19
  string_record: csv::StringRecord,
20
20
  parser: std::marker::PhantomData<T>,
21
+ ignore_null_bytes: bool,
21
22
  }
22
23
 
23
24
  impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
@@ -56,6 +57,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
56
57
  headers: Vec<StringCacheKey>,
57
58
  null_string: Option<Cow<'a, str>>,
58
59
  flexible_default: Option<Cow<'a, str>>,
60
+ ignore_null_bytes: bool,
59
61
  ) -> Self {
60
62
  let headers_len = headers.len();
61
63
  Self {
@@ -65,6 +67,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
65
67
  flexible_default,
66
68
  string_record: csv::StringRecord::with_capacity(READ_BUFFER_SIZE, headers_len),
67
69
  parser: std::marker::PhantomData,
70
+ ignore_null_bytes,
68
71
  }
69
72
  }
70
73
 
@@ -76,6 +79,7 @@ impl<'a, T: RecordParser<'a>> RecordReader<'a, T> {
76
79
  &self.string_record,
77
80
  self.null_string.clone(),
78
81
  self.flexible_default.clone(),
82
+ self.ignore_null_bytes
79
83
  ))),
80
84
  false => Ok(None),
81
85
  }
@@ -60,6 +60,7 @@ pub fn parse_csv(
60
60
  flexible,
61
61
  flexible_default,
62
62
  trim,
63
+ ignore_null_bytes,
63
64
  } = parse_read_csv_args(&ruby, args)?;
64
65
 
65
66
  if !ruby.block_given() {
@@ -70,9 +71,9 @@ pub fn parse_csv(
70
71
  delimiter,
71
72
  quote_char,
72
73
  null_string,
73
- result_type: result_type,
74
+ result_type,
74
75
  flexible,
75
- flexible_default: flexible_default,
76
+ flexible_default,
76
77
  trim: match trim {
77
78
  Trim::All => Some("all".to_string()),
78
79
  Trim::Headers => Some("headers".to_string()),
@@ -100,9 +101,11 @@ pub fn parse_csv(
100
101
  .trim(trim)
101
102
  .delimiter(delimiter)
102
103
  .quote_char(quote_char)
103
- .null_string(null_string);
104
+ .null_string(null_string)
105
+ .ignore_null_bytes(ignore_null_bytes)
106
+ .build()?;
104
107
 
105
- Box::new(builder.build()?.map(CsvRecord::Map))
108
+ Box::new(builder.map(CsvRecord::Map))
106
109
  }
107
110
  ResultType::Array => {
108
111
  let builder = RecordReaderBuilder::<Vec<Option<CowStr<'static>>>>::new(ruby, to_read)
@@ -113,6 +116,7 @@ pub fn parse_csv(
113
116
  .delimiter(delimiter)
114
117
  .quote_char(quote_char)
115
118
  .null_string(null_string)
119
+ .ignore_null_bytes(ignore_null_bytes)
116
120
  .build()?;
117
121
 
118
122
  Box::new(builder.map(CsvRecord::Vec))
data/ext/osv/src/utils.rs CHANGED
@@ -36,6 +36,7 @@ pub struct ReadCsvArgs {
36
36
  pub flexible: bool,
37
37
  pub flexible_default: Option<String>,
38
38
  pub trim: csv::Trim,
39
+ pub ignore_null_bytes: bool,
39
40
  }
40
41
 
41
42
  /// Parse common arguments for CSV parsing
@@ -47,14 +48,15 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
47
48
  _,
48
49
  (),
49
50
  (
50
- Option<bool>,
51
- Option<String>,
52
- Option<String>,
51
+ Option<Option<bool>>,
53
52
  Option<Option<String>>,
54
- Option<Value>,
55
- Option<bool>,
56
53
  Option<Option<String>>,
57
- Option<Value>,
54
+ Option<Option<String>>,
55
+ Option<Option<Value>>,
56
+ Option<Option<bool>>,
57
+ Option<Option<Option<String>>>,
58
+ Option<Option<Value>>,
59
+ Option<Option<bool>>,
58
60
  ),
59
61
  (),
60
62
  >(
@@ -69,14 +71,16 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
69
71
  "flexible",
70
72
  "flexible_default",
71
73
  "trim",
74
+ "ignore_null_bytes",
72
75
  ],
73
76
  )?;
74
77
 
75
- let has_headers = kwargs.optional.0.unwrap_or(true);
78
+ let has_headers = kwargs.optional.0.flatten().unwrap_or(true);
76
79
 
77
80
  let delimiter = *kwargs
78
81
  .optional
79
82
  .1
83
+ .flatten()
80
84
  .unwrap_or_else(|| ",".to_string())
81
85
  .as_bytes()
82
86
  .first()
@@ -90,6 +94,7 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
90
94
  let quote_char = *kwargs
91
95
  .optional
92
96
  .2
97
+ .flatten()
93
98
  .unwrap_or_else(|| "\"".to_string())
94
99
  .as_bytes()
95
100
  .first()
@@ -105,6 +110,7 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
105
110
  let result_type = match kwargs
106
111
  .optional
107
112
  .4
113
+ .flatten()
108
114
  .map(|value| parse_string_or_symbol(ruby, value))
109
115
  {
110
116
  Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -126,13 +132,14 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
126
132
  None => String::from("hash"),
127
133
  };
128
134
 
129
- let flexible = kwargs.optional.5.unwrap_or_default();
135
+ let flexible = kwargs.optional.5.flatten().unwrap_or_default();
130
136
 
131
- let flexible_default = kwargs.optional.6.unwrap_or_default();
137
+ let flexible_default = kwargs.optional.6.flatten().unwrap_or_default();
132
138
 
133
139
  let trim = match kwargs
134
140
  .optional
135
141
  .7
142
+ .flatten()
136
143
  .map(|value| parse_string_or_symbol(ruby, value))
137
144
  {
138
145
  Some(Ok(Some(parsed))) => match parsed.as_str() {
@@ -159,6 +166,8 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
159
166
  None => csv::Trim::None,
160
167
  };
161
168
 
169
+ let ignore_null_bytes = kwargs.optional.8.flatten().unwrap_or_default();
170
+
162
171
  Ok(ReadCsvArgs {
163
172
  to_read,
164
173
  has_headers,
@@ -169,5 +178,6 @@ pub fn parse_read_csv_args(ruby: &Ruby, args: &[Value]) -> Result<ReadCsvArgs, E
169
178
  flexible,
170
179
  flexible_default,
171
180
  trim,
181
+ ignore_null_bytes,
172
182
  })
173
183
  }
data/lib/osv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OSV
2
- VERSION = "0.3.17"
2
+ VERSION = "0.3.19"
3
3
  end
data/lib/osv.rbi CHANGED
@@ -23,6 +23,8 @@ module OSV
23
23
  # - `trim`: String specifying the trim mode
24
24
  # ("all" or "headers" or "fields" or :all or :headers or :fields)
25
25
  # (default: `nil`)
26
+ # - `ignore_null_bytes`: Boolean specifying if null bytes should be ignored
27
+ # (default: false)
26
28
  sig do
27
29
  params(
28
30
  input: T.any(String, StringIO, IO),
@@ -34,6 +36,7 @@ module OSV
34
36
  result_type: T.nilable(T.any(String, Symbol)),
35
37
  flexible: T.nilable(T::Boolean),
36
38
  flexible_default: T.nilable(String),
39
+ ignore_null_bytes: T.nilable(T::Boolean),
37
40
  trim: T.nilable(T.any(String, Symbol)),
38
41
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.nilable(String)], T::Array[T.nilable(String)])).void)
39
42
  ).returns(T.any(Enumerator, T.untyped))
@@ -48,6 +51,7 @@ module OSV
48
51
  result_type: nil,
49
52
  flexible: nil,
50
53
  flexible_default: nil,
54
+ ignore_null_bytes: nil,
51
55
  trim: nil,
52
56
  &blk
53
57
  )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: osv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.17
4
+ version: 0.3.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-06 00:00:00.000000000 Z
11
+ date: 2025-01-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys