parquet 0.2.10 → 0.2.13

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
- use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
1
+ use crate::header_cache::StringCache;
2
+ use crate::types::TryIntoValue;
2
3
  use crate::{
3
4
  create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
4
5
  ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
@@ -6,7 +7,8 @@ use crate::{
6
7
  use ahash::RandomState;
7
8
  use magnus::rb_sys::AsRawValue;
8
9
  use magnus::value::{Opaque, ReprValue};
9
- use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
10
+ use magnus::IntoValue;
11
+ use magnus::{Error as MagnusError, Ruby, Value};
10
12
  use parquet::file::reader::{FileReader, SerializedFileReader};
11
13
  use parquet::record::reader::RowIter as ParquetRowIter;
12
14
  use parquet::schema::types::{Type as SchemaType, TypePtr};
@@ -17,16 +19,14 @@ use std::os::fd::FromRawFd;
17
19
  use std::sync::OnceLock;
18
20
 
19
21
  #[inline]
20
- pub fn parse_parquet_rows<'a>(
21
- rb_self: Value,
22
- args: &[Value],
23
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
22
+ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
24
23
  let ruby = unsafe { Ruby::get_unchecked() };
25
24
 
26
25
  let ParquetRowsArgs {
27
26
  to_read,
28
27
  result_type,
29
28
  columns,
29
+ strict,
30
30
  } = parse_parquet_rows_args(&ruby, args)?;
31
31
 
32
32
  if !ruby.block_given() {
@@ -35,7 +35,9 @@ pub fn parse_parquet_rows<'a>(
35
35
  to_read,
36
36
  result_type,
37
37
  columns,
38
- });
38
+ strict,
39
+ })
40
+ .map(|yield_enum| yield_enum.into_value_with(&ruby));
39
41
  }
40
42
 
41
43
  let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
@@ -81,56 +83,62 @@ pub fn parse_parquet_rows<'a>(
81
83
  })?;
82
84
  }
83
85
 
84
- let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
86
+ match result_type {
85
87
  ParserResultType::Hash => {
86
88
  let headers = OnceLock::new();
87
89
  let headers_clone = headers.clone();
88
- let iter = iter
89
- .filter_map(move |row| {
90
- row.ok().map(|row| {
91
- let headers = headers_clone.get_or_init(|| {
92
- let column_count = row.get_column_iter().count();
93
-
94
- let mut header_string = Vec::with_capacity(column_count);
95
- for (k, _) in row.get_column_iter() {
96
- header_string.push(k.to_owned());
97
- }
98
-
99
- let headers = StringCache::intern_many(&header_string).unwrap();
100
-
101
- headers
102
- });
103
-
104
- let mut map =
105
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
106
- row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
107
- map.insert(headers[i], ParquetField(v.clone()));
108
- });
109
- map
110
- })
90
+ let iter = iter.map(move |row| {
91
+ row.and_then(|row| {
92
+ let headers = headers_clone.get_or_init(|| {
93
+ let column_count = row.get_column_iter().count();
94
+
95
+ let mut header_string = Vec::with_capacity(column_count);
96
+ for (k, _) in row.get_column_iter() {
97
+ header_string.push(k.to_owned());
98
+ }
99
+
100
+ let headers = StringCache::intern_many(&header_string).unwrap();
101
+
102
+ headers
103
+ });
104
+
105
+ let mut map =
106
+ HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
107
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
108
+ map.insert(headers[i], ParquetField(v.clone(), strict));
109
+ });
110
+ Ok(map)
111
111
  })
112
- .map(RowRecord::Map);
113
-
114
- Box::new(HeaderCacheCleanupIter {
115
- inner: iter,
116
- headers,
117
- })
112
+ .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
113
+ .map_err(|e| ReaderError::Parquet(e))
114
+ });
115
+
116
+ for result in iter {
117
+ let record = result?;
118
+ let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
119
+ }
118
120
  }
119
- ParserResultType::Array => Box::new(
120
- iter.filter_map(|row| {
121
- row.ok().map(|row| {
121
+ ParserResultType::Array => {
122
+ let iter = iter.map(|row| {
123
+ row.and_then(|row| {
122
124
  let column_count = row.get_column_iter().count();
123
125
  let mut vec = Vec::with_capacity(column_count);
124
126
  row.get_column_iter()
125
- .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
126
- vec
127
+ .for_each(|(_, v)| vec.push(ParquetField(v.clone(), strict)));
128
+ Ok(vec)
127
129
  })
128
- })
129
- .map(RowRecord::Vec),
130
- ),
131
- };
130
+ .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
131
+ .map_err(|e| ReaderError::Parquet(e))
132
+ });
133
+
134
+ for result in iter {
135
+ let record = result?;
136
+ let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
137
+ }
138
+ }
139
+ }
132
140
 
133
- Ok(Yield::Iter(iter))
141
+ Ok(ruby.qnil().into_value_with(&ruby))
134
142
  }
135
143
 
136
144
  fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
@@ -1,4 +1,7 @@
1
- use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
1
+ use crate::{
2
+ impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion,
3
+ reader::ReaderError,
4
+ };
2
5
 
3
6
  use super::*;
4
7
 
@@ -103,23 +106,23 @@ impl std::hash::Hash for ParquetValue {
103
106
  }
104
107
  }
105
108
 
106
- impl IntoValue for ParquetValue {
107
- fn into_value_with(self, handle: &Ruby) -> Value {
109
+ impl TryIntoValue for ParquetValue {
110
+ fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
108
111
  match self {
109
- ParquetValue::Int8(i) => i.into_value_with(handle),
110
- ParquetValue::Int16(i) => i.into_value_with(handle),
111
- ParquetValue::Int32(i) => i.into_value_with(handle),
112
- ParquetValue::Int64(i) => i.into_value_with(handle),
113
- ParquetValue::UInt8(i) => i.into_value_with(handle),
114
- ParquetValue::UInt16(i) => i.into_value_with(handle),
115
- ParquetValue::UInt32(i) => i.into_value_with(handle),
116
- ParquetValue::UInt64(i) => i.into_value_with(handle),
117
- ParquetValue::Float16(f) => f.into_value_with(handle),
118
- ParquetValue::Float32(f) => f.into_value_with(handle),
119
- ParquetValue::Float64(f) => f.into_value_with(handle),
120
- ParquetValue::Boolean(b) => b.into_value_with(handle),
121
- ParquetValue::String(s) => s.into_value_with(handle),
122
- ParquetValue::Bytes(b) => handle.str_from_slice(&b).as_value(),
112
+ ParquetValue::Int8(i) => Ok(i.into_value_with(handle)),
113
+ ParquetValue::Int16(i) => Ok(i.into_value_with(handle)),
114
+ ParquetValue::Int32(i) => Ok(i.into_value_with(handle)),
115
+ ParquetValue::Int64(i) => Ok(i.into_value_with(handle)),
116
+ ParquetValue::UInt8(i) => Ok(i.into_value_with(handle)),
117
+ ParquetValue::UInt16(i) => Ok(i.into_value_with(handle)),
118
+ ParquetValue::UInt32(i) => Ok(i.into_value_with(handle)),
119
+ ParquetValue::UInt64(i) => Ok(i.into_value_with(handle)),
120
+ ParquetValue::Float16(f) => Ok(f.into_value_with(handle)),
121
+ ParquetValue::Float32(f) => Ok(f.into_value_with(handle)),
122
+ ParquetValue::Float64(f) => Ok(f.into_value_with(handle)),
123
+ ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
124
+ ParquetValue::String(s) => Ok(s.into_value_with(handle)),
125
+ ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
123
126
  ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
124
127
  ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
125
128
  timestamp @ ParquetValue::TimestampSecond(_, _) => {
@@ -136,21 +139,23 @@ impl IntoValue for ParquetValue {
136
139
  }
137
140
  ParquetValue::List(l) => {
138
141
  let ary = handle.ary_new_capa(l.len());
139
- l.into_iter()
140
- .try_for_each(|v| ary.push(v.into_value_with(handle)))
141
- .unwrap();
142
- ary.into_value_with(handle)
142
+ l.into_iter().try_for_each(|v| {
143
+ ary.push(v.try_into_value_with(handle)?)?;
144
+ Ok::<_, ReaderError>(())
145
+ })?;
146
+ Ok(ary.into_value_with(handle))
143
147
  }
144
148
  ParquetValue::Map(m) => {
145
149
  let hash = handle.hash_new_capa(m.len());
146
- m.into_iter()
147
- .try_for_each(|(k, v)| {
148
- hash.aset(k.into_value_with(handle), v.into_value_with(handle))
149
- })
150
- .unwrap();
151
- hash.into_value_with(handle)
152
- }
153
- ParquetValue::Null => handle.qnil().as_value(),
150
+ m.into_iter().try_for_each(|(k, v)| {
151
+ hash.aset(
152
+ k.try_into_value_with(handle)?,
153
+ v.try_into_value_with(handle)?,
154
+ )
155
+ })?;
156
+ Ok(hash.into_value_with(handle))
157
+ }
158
+ ParquetValue::Null => Ok(handle.qnil().as_value()),
154
159
  }
155
160
  }
156
161
  }
@@ -260,18 +265,10 @@ impl std::cmp::PartialEq for ParquetValueVec {
260
265
 
261
266
  impl std::cmp::Eq for ParquetValueVec {}
262
267
 
263
- impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
264
- type Error = String;
265
-
266
- fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
267
- ParquetValueVec::try_from(&*column)
268
- }
269
- }
270
-
271
268
  macro_rules! impl_numeric_array_conversion {
272
269
  ($column:expr, $array_type:ty, $variant:ident) => {{
273
270
  let array = downcast_array::<$array_type>($column);
274
- if array.is_nullable() {
271
+ Ok(ParquetValueVec(if array.is_nullable() {
275
272
  array
276
273
  .values()
277
274
  .iter()
@@ -290,13 +287,13 @@ macro_rules! impl_numeric_array_conversion {
290
287
  .iter()
291
288
  .map(|x| ParquetValue::$variant(*x))
292
289
  .collect()
293
- }
290
+ }))
294
291
  }};
295
292
  }
296
293
  macro_rules! impl_boolean_array_conversion {
297
294
  ($column:expr, $array_type:ty, $variant:ident) => {{
298
295
  let array = downcast_array::<$array_type>($column);
299
- if array.is_nullable() {
296
+ Ok(ParquetValueVec(if array.is_nullable() {
300
297
  array
301
298
  .values()
302
299
  .iter()
@@ -315,34 +312,50 @@ macro_rules! impl_boolean_array_conversion {
315
312
  .iter()
316
313
  .map(|x| ParquetValue::$variant(x))
317
314
  .collect()
318
- }
315
+ }))
319
316
  }};
320
317
  }
321
318
 
322
- impl TryFrom<&dyn Array> for ParquetValueVec {
323
- type Error = String;
319
+ pub struct ArrayWrapper<'a> {
320
+ pub array: &'a dyn Array,
321
+ pub strict: bool,
322
+ }
323
+
324
+ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
325
+ type Error = ReaderError;
324
326
 
325
- fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
326
- let tmp_vec = match column.data_type() {
327
- DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
328
- DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
329
- DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
330
- DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
331
- DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
332
- DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
333
- DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
334
- DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
335
- DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
336
- DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
337
- DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
338
- DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
339
- DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
327
+ fn try_from(column: ArrayWrapper<'a>) -> Result<Self, Self::Error> {
328
+ match column.array.data_type() {
329
+ DataType::Boolean => {
330
+ impl_boolean_array_conversion!(column.array, BooleanArray, Boolean)
331
+ }
332
+ DataType::Int8 => impl_numeric_array_conversion!(column.array, Int8Array, Int8),
333
+ DataType::Int16 => impl_numeric_array_conversion!(column.array, Int16Array, Int16),
334
+ DataType::Int32 => impl_numeric_array_conversion!(column.array, Int32Array, Int32),
335
+ DataType::Int64 => impl_numeric_array_conversion!(column.array, Int64Array, Int64),
336
+ DataType::UInt8 => impl_numeric_array_conversion!(column.array, UInt8Array, UInt8),
337
+ DataType::UInt16 => impl_numeric_array_conversion!(column.array, UInt16Array, UInt16),
338
+ DataType::UInt32 => impl_numeric_array_conversion!(column.array, UInt32Array, UInt32),
339
+ DataType::UInt64 => impl_numeric_array_conversion!(column.array, UInt64Array, UInt64),
340
+ DataType::Float32 => {
341
+ impl_numeric_array_conversion!(column.array, Float32Array, Float32)
342
+ }
343
+ DataType::Float64 => {
344
+ impl_numeric_array_conversion!(column.array, Float64Array, Float64)
345
+ }
346
+ DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
347
+ DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
340
348
  DataType::Timestamp(TimeUnit::Second, tz) => {
341
- impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
349
+ impl_timestamp_array_conversion!(
350
+ column.array,
351
+ TimestampSecondArray,
352
+ TimestampSecond,
353
+ tz
354
+ )
342
355
  }
343
356
  DataType::Timestamp(TimeUnit::Millisecond, tz) => {
344
357
  impl_timestamp_array_conversion!(
345
- column,
358
+ column.array,
346
359
  TimestampMillisecondArray,
347
360
  TimestampMillis,
348
361
  tz
@@ -350,7 +363,7 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
350
363
  }
351
364
  DataType::Timestamp(TimeUnit::Microsecond, tz) => {
352
365
  impl_timestamp_array_conversion!(
353
- column,
366
+ column.array,
354
367
  TimestampMicrosecondArray,
355
368
  TimestampMicros,
356
369
  tz
@@ -358,72 +371,93 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
358
371
  }
359
372
  DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
360
373
  impl_timestamp_array_conversion!(
361
- column,
374
+ column.array,
362
375
  TimestampNanosecondArray,
363
376
  TimestampNanos,
364
377
  tz
365
378
  )
366
379
  }
367
380
  DataType::Float16 => {
368
- let array = downcast_array::<Float16Array>(column);
381
+ let array = downcast_array::<Float16Array>(column.array);
369
382
  if array.is_nullable() {
370
- array
371
- .values()
372
- .iter()
373
- .enumerate()
374
- .map(|(i, x)| {
375
- if array.is_null(i) {
376
- ParquetValue::Null
377
- } else {
378
- ParquetValue::Float16(f32::from(*x))
379
- }
380
- })
381
- .collect()
383
+ Ok(ParquetValueVec(
384
+ array
385
+ .values()
386
+ .iter()
387
+ .enumerate()
388
+ .map(|(i, x)| {
389
+ if array.is_null(i) {
390
+ ParquetValue::Null
391
+ } else {
392
+ ParquetValue::Float16(f32::from(*x))
393
+ }
394
+ })
395
+ .collect(),
396
+ ))
382
397
  } else {
383
- array
384
- .values()
385
- .iter()
386
- .map(|x| ParquetValue::Float16(f32::from(*x)))
387
- .collect()
398
+ Ok(ParquetValueVec(
399
+ array
400
+ .values()
401
+ .iter()
402
+ .map(|x| ParquetValue::Float16(f32::from(*x)))
403
+ .collect(),
404
+ ))
388
405
  }
389
406
  }
390
407
  DataType::Utf8 => {
391
- let array = downcast_array::<StringArray>(column);
392
- array
393
- .iter()
394
- .map(|opt_x| match opt_x {
395
- Some(x) => ParquetValue::String(x.to_string()),
396
- None => ParquetValue::Null,
397
- })
398
- .collect()
408
+ let array = downcast_array::<StringArray>(column.array);
409
+ let mut tmp_vec = Vec::with_capacity(array.len());
410
+ let iter = array.iter().map(|opt_x| match opt_x {
411
+ Some(x) => {
412
+ if column.strict {
413
+ Ok::<_, ReaderError>(ParquetValue::String(
414
+ simdutf8::basic::from_utf8(x.as_bytes())?.to_string(),
415
+ ))
416
+ } else {
417
+ Ok::<_, ReaderError>(ParquetValue::String(x.to_string()))
418
+ }
419
+ }
420
+ None => Ok(ParquetValue::Null),
421
+ });
422
+ for x in iter {
423
+ tmp_vec.push(x?);
424
+ }
425
+ Ok(ParquetValueVec(tmp_vec))
399
426
  }
400
427
  DataType::Binary => {
401
- let array = downcast_array::<BinaryArray>(column);
402
- array
403
- .iter()
404
- .map(|opt_x| match opt_x {
405
- Some(x) => ParquetValue::Bytes(x.to_vec()),
406
- None => ParquetValue::Null,
407
- })
408
- .collect()
428
+ let array = downcast_array::<BinaryArray>(column.array);
429
+ Ok(ParquetValueVec(
430
+ array
431
+ .iter()
432
+ .map(|opt_x| match opt_x {
433
+ Some(x) => ParquetValue::Bytes(x.to_vec()),
434
+ None => ParquetValue::Null,
435
+ })
436
+ .collect(),
437
+ ))
409
438
  }
410
439
  DataType::List(_field) => {
411
- let list_array = downcast_array::<ListArray>(column);
412
- list_array
413
- .iter()
414
- .map(|x| match x {
415
- Some(values) => match ParquetValueVec::try_from(values) {
416
- Ok(vec) => ParquetValue::List(vec.into_inner()),
417
- Err(e) => {
418
- panic!("Error converting list array to ParquetValueVec: {}", e)
419
- }
420
- },
421
- None => ParquetValue::Null,
422
- })
423
- .collect()
440
+ let list_array = downcast_array::<ListArray>(column.array);
441
+ Ok(ParquetValueVec(
442
+ list_array
443
+ .iter()
444
+ .map(|x| match x {
445
+ Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
446
+ array: &*values,
447
+ strict: column.strict,
448
+ }) {
449
+ Ok(vec) => ParquetValue::List(vec.into_inner()),
450
+ Err(e) => {
451
+ panic!("Error converting list array to ParquetValueVec: {}", e)
452
+ }
453
+ },
454
+ None => ParquetValue::Null,
455
+ })
456
+ .collect(),
457
+ ))
424
458
  }
425
459
  DataType::Struct(_) => {
426
- let struct_array = downcast_array::<StructArray>(column);
460
+ let struct_array = downcast_array::<StructArray>(column.array);
427
461
  let mut values = Vec::with_capacity(struct_array.len());
428
462
  for i in 0..struct_array.len() {
429
463
  if struct_array.is_null(i) {
@@ -433,8 +467,11 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
433
467
 
434
468
  let mut map = std::collections::HashMap::new();
435
469
  for (field_idx, field) in struct_array.fields().iter().enumerate() {
436
- let column = struct_array.column(field_idx);
437
- let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
470
+ let c = struct_array.column(field_idx);
471
+ let field_values = match ParquetValueVec::try_from(ArrayWrapper {
472
+ array: &*c.slice(i, 1),
473
+ strict: column.strict,
474
+ }) {
438
475
  Ok(vec) => vec.into_inner(),
439
476
  Err(e) => {
440
477
  panic!("Error converting struct field to ParquetValueVec: {}", e)
@@ -447,16 +484,18 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
447
484
  }
448
485
  values.push(ParquetValue::Map(map));
449
486
  }
450
- values
487
+ Ok(ParquetValueVec(values))
451
488
  }
452
489
  DataType::Null => {
453
- let x = downcast_array::<NullArray>(column);
454
- vec![ParquetValue::Null; x.len()]
490
+ let x = downcast_array::<NullArray>(column.array);
491
+ Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
455
492
  }
456
493
  _ => {
457
- return Err(format!("Unsupported data type: {:?}", column.data_type()));
494
+ return Err(ReaderError::Ruby(format!(
495
+ "Unsupported data type: {:?}",
496
+ column.array.data_type()
497
+ )));
458
498
  }
459
- };
460
- Ok(ParquetValueVec(tmp_vec))
499
+ }
461
500
  }
462
501
  }