parquet 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
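The headline change in 0.2.13 is a new optional `strict` keyword on both readers: when set, string values are re-validated as UTF-8 (via simdutf8) during conversion, and conversion failures (invalid UTF-8, out-of-range timestamps) surface as errors instead of panics inside the native extension. A minimal usage sketch, assuming the gem's `Parquet.each_row` / `Parquet.each_column` entry points that the `enumeratorize` calls below target, and a hypothetical `data.parquet` input:

  require "parquet"

  # Row-wise read; strict: true raises on invalid UTF-8 rather than
  # passing bytes through unchecked.
  Parquet.each_row("data.parquet", result_type: :hash, strict: true) do |row|
    puts row
  end

  # Column-wise read; strict is accepted alongside columns and batch_size.
  Parquet.each_column("data.parquet", columns: ["id"], batch_size: 1024, strict: true) do |batch|
    puts batch
  end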
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7baa8799961bd4698da7c59c93cf8c36418553c29e4a56106a9338e1e00796d9
-  data.tar.gz: 84e6e87d4ea74a0be77e7cefa9ba21fd8c410b6a873108965294f41ac7443b04
+  metadata.gz: 0cf24938c23cee5bc8ed4049e2b3fee7794cb619755e26cf83d4bb8826ebccd7
+  data.tar.gz: 85f55738e3503729535de7854d7438bca69f0b82e648471c285a3eefdb51a69b
 SHA512:
-  metadata.gz: ff0aa33661944a72a69a31c287143b45d0c376fcba27ea4b5e416409702bb1acf896edeb5c2fb2bf485dd0083b5de21ca2ba9ee0cf619479b0f01f99b33a7c11
-  data.tar.gz: ed88efcc1e55a3c8b685f16e52dcdb9a378d64d2cf161ba27b6a613684bbbf13a60b532de556e21096558a0ca86a65ba201d11e793644214b2e015203531968f
+  metadata.gz: 17eaa053e7c05605d63c84786958f2980817509a6ba165654bfe50459cc30a37553671cc8b57a70831c255e463b26fc2768afcee3621664b443f9e0e67dc4460
+  data.tar.gz: e1a90f2683fce4a10b489eba3b0d98754ebeed9c418bd274e1af38c8fd9b5ad50f1e4ce72568f9c98001db408367c9610723f5c24796c801a1eaed4c23377d42
@@ -6,6 +6,7 @@ pub struct RowEnumeratorArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Creates an enumerator for lazy Parquet row parsing
@@ -18,6 +19,9 @@ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerat
     if let Some(columns) = args.columns {
        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
@@ -29,6 +33,7 @@ pub struct ColumnEnumeratorArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 #[inline]
@@ -46,6 +51,9 @@ pub fn create_column_enumerator(
     if let Some(batch_size) = args.batch_size {
         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
     }
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
         .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
@@ -16,7 +16,7 @@ use magnus::{IntoValue, RString, Ruby, Value};
 
 use thiserror::Error;
 
-#[derive(Debug, Error)]
+#[derive(Debug, Clone, Error)]
 pub enum CacheError {
     #[error("Failed to acquire lock: {0}")]
     LockError(String),
@@ -26,6 +26,10 @@ pub enum ReaderError {
     Parquet(#[from] parquet::errors::ParquetError),
     #[error("Arrow error: {0}")]
     Arrow(#[from] arrow_schema::ArrowError),
+    #[error("UTF-8 error: {0}")]
+    Utf8Error(#[from] simdutf8::basic::Utf8Error),
+    #[error("Jiff error: {0}")]
+    Jiff(#[from] jiff::Error),
 }
 
 impl From<MagnusError> for ReaderError {
@@ -1,4 +1,5 @@
 use crate::header_cache::StringCache;
+use crate::types::{ArrayWrapper, TryIntoValue};
 use crate::{
     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
     ParquetValueVec, ParserResultType, SeekableRubyValue,
@@ -27,6 +28,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
         result_type,
         columns,
         batch_size,
+        strict,
     } = parse_parquet_columns_args(&ruby, args)?;
 
     if !ruby.block_given() {
@@ -36,6 +38,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
             result_type,
             columns,
             batch_size,
+            strict,
         })
         .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
@@ -150,7 +153,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                     map.insert(*field, vec![]);
                 }
                 let record = ColumnRecord::Map(map);
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
                 return Ok(ruby.qnil().into_value_with(&ruby));
             }
 
@@ -160,24 +163,37 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
            let headers_clone = headers.clone();
            let iter = batch_reader.map(move |batch| {
                batch.map_err(ReaderError::Arrow).and_then(|batch| {
-                    let headers = headers_clone.get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string).unwrap()
-                    });
-
-                    let mut map =
-                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-
-                    batch.columns().iter().enumerate().for_each(|(i, column)| {
-                        let header = headers[i];
-                        let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                        map.insert(header, values.into_inner());
-                    });
+                    let local_headers = headers_clone
+                        .get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string)
+                        })
+                        .as_ref()
+                        .map_err(|e| ReaderError::HeaderIntern(e.clone()))?;
+
+                    let mut map = HashMap::with_capacity_and_hasher(
+                        local_headers.len(),
+                        RandomState::default(),
+                    );
+
+                    batch
+                        .columns()
+                        .iter()
+                        .enumerate()
+                        .try_for_each(|(i, column)| {
+                            let header = local_headers[i];
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            map.insert(header, values.into_inner());
+                            Ok::<_, ReaderError>(())
+                        })?;
 
                    Ok(ColumnRecord::Map::<RandomState>(map))
                })
@@ -185,7 +201,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
 
            for result in iter {
                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
            }
        }
        ParserResultType::Array => {
@@ -195,17 +211,20 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                        .columns()
                        .into_iter()
                        .map(|column| {
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            values.into_inner()
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            Ok::<_, ReaderError>(values.into_inner())
                        })
-                        .collect();
+                        .collect::<Result<Vec<_>, _>>()?;
                    Ok(ColumnRecord::Vec::<RandomState>(vec))
                })
            });
 
            for result in iter {
                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
            }
        }
    }
@@ -1,4 +1,5 @@
 use crate::header_cache::StringCache;
+use crate::types::TryIntoValue;
 use crate::{
     create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
     ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
@@ -25,6 +26,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
         to_read,
         result_type,
         columns,
+        strict,
     } = parse_parquet_rows_args(&ruby, args)?;
 
     if !ruby.block_given() {
@@ -33,6 +35,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
             to_read,
             result_type,
             columns,
+            strict,
         })
         .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
@@ -102,7 +105,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
                let mut map =
                    HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
                row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                    map.insert(headers[i], ParquetField(v.clone()));
+                    map.insert(headers[i], ParquetField(v.clone(), strict));
                });
                Ok(map)
            })
@@ -112,7 +115,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
 
            for result in iter {
                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
            }
        }
        ParserResultType::Array => {
@@ -121,7 +124,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
                let column_count = row.get_column_iter().count();
                let mut vec = Vec::with_capacity(column_count);
                row.get_column_iter()
-                    .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    .for_each(|(_, v)| vec.push(ParquetField(v.clone(), strict)));
                Ok(vec)
            })
            .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
@@ -130,7 +133,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
 
            for result in iter {
                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
            }
        }
    }
@@ -1,4 +1,7 @@
-use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
+use crate::{
+    impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion,
+    reader::ReaderError,
+};
 
 use super::*;
 
@@ -103,23 +106,23 @@ impl std::hash::Hash for ParquetValue {
     }
 }
 
-impl IntoValue for ParquetValue {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl TryIntoValue for ParquetValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
-            ParquetValue::Int8(i) => i.into_value_with(handle),
-            ParquetValue::Int16(i) => i.into_value_with(handle),
-            ParquetValue::Int32(i) => i.into_value_with(handle),
-            ParquetValue::Int64(i) => i.into_value_with(handle),
-            ParquetValue::UInt8(i) => i.into_value_with(handle),
-            ParquetValue::UInt16(i) => i.into_value_with(handle),
-            ParquetValue::UInt32(i) => i.into_value_with(handle),
-            ParquetValue::UInt64(i) => i.into_value_with(handle),
-            ParquetValue::Float16(f) => f.into_value_with(handle),
-            ParquetValue::Float32(f) => f.into_value_with(handle),
-            ParquetValue::Float64(f) => f.into_value_with(handle),
-            ParquetValue::Boolean(b) => b.into_value_with(handle),
-            ParquetValue::String(s) => s.into_value_with(handle),
-            ParquetValue::Bytes(b) => handle.str_from_slice(&b).as_value(),
+            ParquetValue::Int8(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int16(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int32(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int64(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt8(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt16(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt32(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt64(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Float16(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Float32(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Float64(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
+            ParquetValue::String(s) => Ok(s.into_value_with(handle)),
+            ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
             ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
             ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
             timestamp @ ParquetValue::TimestampSecond(_, _) => {
@@ -136,21 +139,23 @@ impl IntoValue for ParquetValue {
             }
             ParquetValue::List(l) => {
                 let ary = handle.ary_new_capa(l.len());
-                l.into_iter()
-                    .try_for_each(|v| ary.push(v.into_value_with(handle)))
-                    .unwrap();
-                ary.into_value_with(handle)
+                l.into_iter().try_for_each(|v| {
+                    ary.push(v.try_into_value_with(handle)?)?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ParquetValue::Map(m) => {
                 let hash = handle.hash_new_capa(m.len());
-                m.into_iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(k.into_value_with(handle), v.into_value_with(handle))
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
-            }
-            ParquetValue::Null => handle.qnil().as_value(),
+                m.into_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.try_into_value_with(handle)?,
+                        v.try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
+            }
+            ParquetValue::Null => Ok(handle.qnil().as_value()),
         }
     }
 }
@@ -260,18 +265,10 @@ impl std::cmp::PartialEq for ParquetValueVec {
260
265
 
261
266
  impl std::cmp::Eq for ParquetValueVec {}
262
267
 
263
- impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
264
- type Error = String;
265
-
266
- fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
267
- ParquetValueVec::try_from(&*column)
268
- }
269
- }
270
-
271
268
  macro_rules! impl_numeric_array_conversion {
272
269
  ($column:expr, $array_type:ty, $variant:ident) => {{
273
270
  let array = downcast_array::<$array_type>($column);
274
- if array.is_nullable() {
271
+ Ok(ParquetValueVec(if array.is_nullable() {
275
272
  array
276
273
  .values()
277
274
  .iter()
@@ -290,13 +287,13 @@ macro_rules! impl_numeric_array_conversion {
290
287
  .iter()
291
288
  .map(|x| ParquetValue::$variant(*x))
292
289
  .collect()
293
- }
290
+ }))
294
291
  }};
295
292
  }
296
293
  macro_rules! impl_boolean_array_conversion {
297
294
  ($column:expr, $array_type:ty, $variant:ident) => {{
298
295
  let array = downcast_array::<$array_type>($column);
299
- if array.is_nullable() {
296
+ Ok(ParquetValueVec(if array.is_nullable() {
300
297
  array
301
298
  .values()
302
299
  .iter()
@@ -315,34 +312,50 @@ macro_rules! impl_boolean_array_conversion {
315
312
  .iter()
316
313
  .map(|x| ParquetValue::$variant(x))
317
314
  .collect()
318
- }
315
+ }))
319
316
  }};
320
317
  }
321
318
 
322
- impl TryFrom<&dyn Array> for ParquetValueVec {
323
- type Error = String;
319
+ pub struct ArrayWrapper<'a> {
320
+ pub array: &'a dyn Array,
321
+ pub strict: bool,
322
+ }
323
+
324
+ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
325
+ type Error = ReaderError;
324
326
 
325
- fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
326
- let tmp_vec = match column.data_type() {
327
- DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
328
- DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
329
- DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
330
- DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
331
- DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
332
- DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
333
- DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
334
- DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
335
- DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
336
- DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
337
- DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
338
- DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
339
- DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
327
+ fn try_from(column: ArrayWrapper<'a>) -> Result<Self, Self::Error> {
328
+ match column.array.data_type() {
329
+ DataType::Boolean => {
330
+ impl_boolean_array_conversion!(column.array, BooleanArray, Boolean)
331
+ }
332
+ DataType::Int8 => impl_numeric_array_conversion!(column.array, Int8Array, Int8),
333
+ DataType::Int16 => impl_numeric_array_conversion!(column.array, Int16Array, Int16),
334
+ DataType::Int32 => impl_numeric_array_conversion!(column.array, Int32Array, Int32),
335
+ DataType::Int64 => impl_numeric_array_conversion!(column.array, Int64Array, Int64),
336
+ DataType::UInt8 => impl_numeric_array_conversion!(column.array, UInt8Array, UInt8),
337
+ DataType::UInt16 => impl_numeric_array_conversion!(column.array, UInt16Array, UInt16),
338
+ DataType::UInt32 => impl_numeric_array_conversion!(column.array, UInt32Array, UInt32),
339
+ DataType::UInt64 => impl_numeric_array_conversion!(column.array, UInt64Array, UInt64),
340
+ DataType::Float32 => {
341
+ impl_numeric_array_conversion!(column.array, Float32Array, Float32)
342
+ }
343
+ DataType::Float64 => {
344
+ impl_numeric_array_conversion!(column.array, Float64Array, Float64)
345
+ }
346
+ DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
347
+ DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
340
348
  DataType::Timestamp(TimeUnit::Second, tz) => {
341
- impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
349
+ impl_timestamp_array_conversion!(
350
+ column.array,
351
+ TimestampSecondArray,
352
+ TimestampSecond,
353
+ tz
354
+ )
342
355
  }
343
356
  DataType::Timestamp(TimeUnit::Millisecond, tz) => {
344
357
  impl_timestamp_array_conversion!(
345
- column,
358
+ column.array,
346
359
  TimestampMillisecondArray,
347
360
  TimestampMillis,
348
361
  tz
@@ -350,7 +363,7 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
             }
             DataType::Timestamp(TimeUnit::Microsecond, tz) => {
                 impl_timestamp_array_conversion!(
-                    column,
+                    column.array,
                     TimestampMicrosecondArray,
                     TimestampMicros,
                     tz
@@ -358,72 +371,93 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
             }
             DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
                 impl_timestamp_array_conversion!(
-                    column,
+                    column.array,
                     TimestampNanosecondArray,
                     TimestampNanos,
                     tz
                 )
             }
             DataType::Float16 => {
-                let array = downcast_array::<Float16Array>(column);
+                let array = downcast_array::<Float16Array>(column.array);
                 if array.is_nullable() {
-                    array
-                        .values()
-                        .iter()
-                        .enumerate()
-                        .map(|(i, x)| {
-                            if array.is_null(i) {
-                                ParquetValue::Null
-                            } else {
-                                ParquetValue::Float16(f32::from(*x))
-                            }
-                        })
-                        .collect()
+                    Ok(ParquetValueVec(
+                        array
+                            .values()
+                            .iter()
+                            .enumerate()
+                            .map(|(i, x)| {
+                                if array.is_null(i) {
+                                    ParquetValue::Null
+                                } else {
+                                    ParquetValue::Float16(f32::from(*x))
+                                }
+                            })
+                            .collect(),
+                    ))
                 } else {
-                    array
-                        .values()
-                        .iter()
-                        .map(|x| ParquetValue::Float16(f32::from(*x)))
-                        .collect()
+                    Ok(ParquetValueVec(
+                        array
+                            .values()
+                            .iter()
+                            .map(|x| ParquetValue::Float16(f32::from(*x)))
+                            .collect(),
+                    ))
                 }
             }
             DataType::Utf8 => {
-                let array = downcast_array::<StringArray>(column);
-                array
-                    .iter()
-                    .map(|opt_x| match opt_x {
-                        Some(x) => ParquetValue::String(x.to_string()),
-                        None => ParquetValue::Null,
-                    })
-                    .collect()
+                let array = downcast_array::<StringArray>(column.array);
+                let mut tmp_vec = Vec::with_capacity(array.len());
+                let iter = array.iter().map(|opt_x| match opt_x {
+                    Some(x) => {
+                        if column.strict {
+                            Ok::<_, ReaderError>(ParquetValue::String(
+                                simdutf8::basic::from_utf8(x.as_bytes())?.to_string(),
+                            ))
+                        } else {
+                            Ok::<_, ReaderError>(ParquetValue::String(x.to_string()))
+                        }
+                    }
+                    None => Ok(ParquetValue::Null),
+                });
+                for x in iter {
+                    tmp_vec.push(x?);
+                }
+                Ok(ParquetValueVec(tmp_vec))
             }
             DataType::Binary => {
-                let array = downcast_array::<BinaryArray>(column);
-                array
-                    .iter()
-                    .map(|opt_x| match opt_x {
-                        Some(x) => ParquetValue::Bytes(x.to_vec()),
-                        None => ParquetValue::Null,
-                    })
-                    .collect()
+                let array = downcast_array::<BinaryArray>(column.array);
+                Ok(ParquetValueVec(
+                    array
+                        .iter()
+                        .map(|opt_x| match opt_x {
+                            Some(x) => ParquetValue::Bytes(x.to_vec()),
+                            None => ParquetValue::Null,
+                        })
+                        .collect(),
+                ))
             }
             DataType::List(_field) => {
-                let list_array = downcast_array::<ListArray>(column);
-                list_array
-                    .iter()
-                    .map(|x| match x {
-                        Some(values) => match ParquetValueVec::try_from(values) {
-                            Ok(vec) => ParquetValue::List(vec.into_inner()),
-                            Err(e) => {
-                                panic!("Error converting list array to ParquetValueVec: {}", e)
-                            }
-                        },
-                        None => ParquetValue::Null,
-                    })
-                    .collect()
+                let list_array = downcast_array::<ListArray>(column.array);
+                Ok(ParquetValueVec(
+                    list_array
+                        .iter()
+                        .map(|x| match x {
+                            Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*values,
+                                strict: column.strict,
+                            }) {
+                                Ok(vec) => ParquetValue::List(vec.into_inner()),
+                                Err(e) => {
+                                    panic!("Error converting list array to ParquetValueVec: {}", e)
+                                }
+                            },
+                            None => ParquetValue::Null,
+                        })
+                        .collect(),
+                ))
             }
             DataType::Struct(_) => {
-                let struct_array = downcast_array::<StructArray>(column);
+                let struct_array = downcast_array::<StructArray>(column.array);
                 let mut values = Vec::with_capacity(struct_array.len());
                 for i in 0..struct_array.len() {
                     if struct_array.is_null(i) {
@@ -433,8 +467,11 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
 
                     let mut map = std::collections::HashMap::new();
                     for (field_idx, field) in struct_array.fields().iter().enumerate() {
-                        let column = struct_array.column(field_idx);
-                        let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
+                        let c = struct_array.column(field_idx);
+                        let field_values = match ParquetValueVec::try_from(ArrayWrapper {
+                            array: &*c.slice(i, 1),
+                            strict: column.strict,
+                        }) {
                             Ok(vec) => vec.into_inner(),
                             Err(e) => {
                                 panic!("Error converting struct field to ParquetValueVec: {}", e)
@@ -447,16 +484,18 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
                     }
                     values.push(ParquetValue::Map(map));
                 }
-                values
+                Ok(ParquetValueVec(values))
             }
             DataType::Null => {
-                let x = downcast_array::<NullArray>(column);
-                vec![ParquetValue::Null; x.len()]
+                let x = downcast_array::<NullArray>(column.array);
+                Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
             }
             _ => {
-                return Err(format!("Unsupported data type: {:?}", column.data_type()));
+                return Err(ReaderError::Ruby(format!(
+                    "Unsupported data type: {:?}",
+                    column.array.data_type()
+                )));
             }
-        };
-        Ok(ParquetValueVec(tmp_vec))
+        }
     }
 }
@@ -1,5 +1,7 @@
 use itertools::Itertools;
 
+use crate::reader::ReaderError;
+
 use super::*;
 
 #[derive(Debug)]
@@ -15,15 +17,16 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
 }
 
 #[derive(Debug)]
-pub struct ParquetField(pub Field);
+pub struct ParquetField(pub Field, pub bool);
 
-impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             RowRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
-                handle.into_value(ary)
+                vec.into_iter()
+                    .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                Ok(handle.into_value(ary))
             }
             RowRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -36,41 +39,41 @@ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
-                        values[i + 1] = handle.into_value(v);
+                        values[i + 1] = v.try_into_value_with(handle)?;
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             ColumnRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .try_for_each(|v| {
-                        let nested_ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
-                        ary.push(nested_ary.into_value_with(handle))
-                    })
-                    .unwrap();
-                ary.into_value_with(handle)
+                vec.into_iter().try_for_each(|v| {
+                    let nested_ary = handle.ary_new_capa(v.len());
+                    v.into_iter()
+                        .try_for_each(|v| nested_ary.push(v.try_into_value_with(handle)?))?;
+                    ary.push(nested_ary.into_value_with(handle))?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ColumnRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -83,91 +86,98 @@ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
                         let ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                        v.into_iter()
+                            .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
                         values[i + 1] = handle.into_value(ary);
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-impl IntoValue for ParquetField {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+pub trait TryIntoValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError>;
+}
+
+impl TryIntoValue for ParquetField {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self.0 {
-            Field::Null => handle.qnil().as_value(),
-            Field::Bool(b) => b.into_value_with(handle),
-            Field::Short(s) => s.into_value_with(handle),
-            Field::Int(i) => i.into_value_with(handle),
-            Field::Long(l) => l.into_value_with(handle),
-            Field::UByte(ub) => ub.into_value_with(handle),
-            Field::UShort(us) => us.into_value_with(handle),
-            Field::UInt(ui) => ui.into_value_with(handle),
-            Field::ULong(ul) => ul.into_value_with(handle),
-            Field::Float16(f) => f32::from(f).into_value_with(handle),
-            Field::Float(f) => f.into_value_with(handle),
-            Field::Double(d) => d.into_value_with(handle),
-            Field::Str(s) => s.into_value_with(handle),
-            Field::Byte(b) => b.into_value_with(handle),
-            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+            Field::Null => Ok(handle.qnil().as_value()),
+            Field::Bool(b) => Ok(b.into_value_with(handle)),
+            Field::Short(s) => Ok(s.into_value_with(handle)),
+            Field::Int(i) => Ok(i.into_value_with(handle)),
+            Field::Long(l) => Ok(l.into_value_with(handle)),
+            Field::UByte(ub) => Ok(ub.into_value_with(handle)),
+            Field::UShort(us) => Ok(us.into_value_with(handle)),
+            Field::UInt(ui) => Ok(ui.into_value_with(handle)),
+            Field::ULong(ul) => Ok(ul.into_value_with(handle)),
+            Field::Float16(f) => Ok(f32::from(f).into_value_with(handle)),
+            Field::Float(f) => Ok(f.into_value_with(handle)),
+            Field::Double(d) => Ok(d.into_value_with(handle)),
+            Field::Str(s) => {
+                if self.1 {
+                    Ok(simdutf8::basic::from_utf8(s.as_bytes())
+                        .map_err(|e| ReaderError::Utf8Error(e))
+                        .and_then(|s| Ok(s.into_value_with(handle)))?)
+                } else {
+                    Ok(handle.str_from_slice(s.as_bytes()).as_value())
+                }
+            }
+            Field::Byte(b) => Ok(b.into_value_with(handle)),
+            Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
             Field::Date(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
+                let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
                 let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
+                Ok(formatted.into_value_with(handle))
             }
             Field::TimestampMillis(ts) => {
-                let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
+                let ts = jiff::Timestamp::from_millisecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::TimestampMicros(ts) => {
-                let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
+                let ts = jiff::Timestamp::from_microsecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::ListInternal(list) => {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
-                elements
-                    .iter()
-                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
-                    .unwrap();
-                ary.into_value_with(handle)
+                elements.iter().try_for_each(|e| {
+                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             Field::MapInternal(map) => {
                 let entries = map.entries();
                 let hash = handle.hash_new_capa(entries.len());
-                entries
-                    .iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            ParquetField(k.clone()).into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
+                entries.iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
            }
            Field::Decimal(d) => {
                let value = match d {
@@ -185,20 +195,24 @@ impl IntoValue for ParquetField {
                        format!("{}e-{}", unscaled, scale)
                    }
                };
-                handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
+                Ok(handle.eval(&format!("BigDecimal(\"{value}\")"))?)
            }
            Field::Group(row) => {
                let hash = handle.hash_new();
-                row.get_column_iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            k.clone().into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
+                row.get_column_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.clone().into_value_with(handle),
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
            }
        }
    }
 }
+
+// impl IntoValue for ParquetField {
+//     fn into_value_with(self, handle: &Ruby) -> Value {
+//         self.try_into_value_with(handle).unwrap()
+//     }
+// }
@@ -64,10 +64,9 @@ macro_rules! impl_timestamp_conversion {
         ParquetValue::$unit(ts, tz) => {
             let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
             let time_class = $handle.class_time();
-            time_class
-                .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                .unwrap()
-                .into_value_with($handle)
+            Ok(time_class
+                .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                .into_value_with($handle))
         }
         _ => panic!("Invalid timestamp type"),
     }
@@ -80,6 +79,6 @@ macro_rules! impl_date_conversion {
     ($value:expr, $handle:expr) => {{
         let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
         let formatted = ts.strftime("%Y-%m-%d").to_string();
-        formatted.into_value_with($handle)
+        Ok(formatted.into_value_with($handle))
     }};
 }
@@ -419,7 +419,7 @@ macro_rules! impl_timestamp_to_arrow_conversion {
 macro_rules! impl_timestamp_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -438,7 +438,7 @@ macro_rules! impl_timestamp_array_conversion {
                 .iter()
                 .map(|x| ParquetValue::$variant(*x, $tz.clone().map(|s| s.into())))
                 .collect()
-        }
+        }))
     }};
 }
 
@@ -32,6 +32,7 @@ pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -39,10 +40,19 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<bool>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns"],
+        &["result_type", "columns", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -73,10 +83,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         None => ParserResultType::Hash,
     };
 
+    let strict = kwargs.optional.2.flatten().unwrap_or(false);
+
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
         columns: kwargs.optional.1.flatten(),
+        strict,
     })
 }
 
@@ -86,6 +99,7 @@ pub struct ParquetColumnsArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -103,12 +117,13 @@ pub fn parse_parquet_columns_args(
             Option<Option<Value>>,
             Option<Option<Vec<String>>>,
             Option<Option<usize>>,
+            Option<Option<bool>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns", "batch_size"],
+        &["result_type", "columns", "batch_size", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -144,5 +159,6 @@ pub fn parse_parquet_columns_args(
         result_type,
         columns: kwargs.optional.1.flatten(),
         batch_size: kwargs.optional.2.flatten(),
+        strict: kwargs.optional.3.flatten().unwrap_or(false),
     })
 }
Binary file
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.12"
+  VERSION = "0.2.13"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.12
+  version: 0.2.13
 platform: ruby
 authors:
 - Nathan Jaremko
@@ -76,6 +76,7 @@ files:
 - ext/parquet/src/writer/mod.rs
 - lib/parquet.rb
 - lib/parquet.rbi
+- lib/parquet/parquet.so
 - lib/parquet/version.rb
 homepage: https://github.com/njaremko/parquet-ruby
 licenses: