parquet 0.4.1 → 0.5.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -43,16 +43,24 @@ impl std::fmt::Display for ParserResultType {
 pub struct ListField<'a> {
     pub item_type: ParquetSchemaType<'a>,
     pub format: Option<&'a str>,
+    pub nullable: bool,
 }

 #[derive(Debug, Clone)]
 pub struct MapField<'a> {
     pub key_type: ParquetSchemaType<'a>,
     pub value_type: ParquetSchemaType<'a>,
-    pub format: Option<&'a str>,
+    pub key_format: Option<&'a str>,
+    pub value_format: Option<&'a str>,
+    pub value_nullable: bool,
 }

 #[derive(Debug, Clone)]
+pub struct StructField<'a> {
+    pub fields: Vec<super::writer_types::SchemaField<'a>>,
+}
+
+#[derive(Clone, Debug)]
 pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
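
The list and map schema structs gain explicit nullability, and the map's single `format` hint is split into independent key and value formats. A minimal sketch of the new shape, assuming the structs above are in scope (the format string is a hypothetical strftime-style hint, not a value from the diff):

    fn sample_map_schema() -> ParquetSchemaType<'static> {
        // Keys are plain strings; values are millisecond timestamps parsed
        // with their own format, and may now be declared nullable.
        ParquetSchemaType::Map(Box::new(MapField {
            key_type: ParquetSchemaType::String,
            value_type: ParquetSchemaType::TimestampMillis,
            key_format: None,
            value_format: Some("%Y-%m-%dT%H:%M:%S%z"),
            value_nullable: true,
        }))
    }
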
@@ -72,4 +80,52 @@ pub enum ParquetSchemaType<'a> {
     TimestampMicros,
     List(Box<ListField<'a>>),
     Map(Box<MapField<'a>>),
+    Struct(Box<StructField<'a>>),
+}
+
+// New schema representation for the DSL-based approach
+#[derive(Debug, Clone)]
+pub enum SchemaNode {
+    Struct {
+        name: String,
+        nullable: bool,
+        fields: Vec<SchemaNode>,
+    },
+    List {
+        name: String,
+        nullable: bool,
+        item: Box<SchemaNode>,
+    },
+    Map {
+        name: String,
+        nullable: bool,
+        key: Box<SchemaNode>,
+        value: Box<SchemaNode>,
+    },
+    Primitive {
+        name: String,
+        parquet_type: PrimitiveType,
+        nullable: bool,
+        format: Option<String>,
+    },
+}
+
+#[derive(Debug, Clone)]
+pub enum PrimitiveType {
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Float32,
+    Float64,
+    Boolean,
+    String,
+    Binary,
+    Date32,
+    TimestampMillis,
+    TimestampMicros,
 }
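
The new `SchemaNode` enum is a self-describing tree: every node carries its own name and nullability, so nested types no longer need side-channel metadata. A sketch of composing one, assuming the enums above are in scope (the field names and format are illustrative, not taken from the diff):

    fn sample_schema() -> SchemaNode {
        SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    parquet_type: PrimitiveType::Int64,
                    nullable: false,
                    format: None,
                },
                SchemaNode::List {
                    name: "dates".to_string(),
                    nullable: true,
                    item: Box::new(SchemaNode::Primitive {
                        name: "item".to_string(),
                        parquet_type: PrimitiveType::Date32,
                        nullable: true,
                        format: Some("%Y-%m-%d".to_string()),
                    }),
                },
            ],
        }
    }
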
@@ -2,13 +2,20 @@
 mod core_types;
 mod parquet_value;
 mod record_types;
+pub mod schema_converter;
+pub mod schema_node;
 mod timestamp;
-mod type_conversion;
+pub mod type_conversion;
 mod writer_types;

 pub use core_types::*;
 pub use parquet_value::*;
 pub use record_types::*;
+// Explicitly export schema-related items
+pub use schema_converter::{
+    infer_schema_from_first_row, legacy_schema_to_dsl, parse_legacy_schema,
+};
+pub use schema_node::parse_schema_node;
 pub use timestamp::*;
 pub use type_conversion::*;
 pub use writer_types::*;
@@ -22,7 +29,7 @@ use arrow_array::{
     TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, TimeUnit};
-use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
+use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
 use parquet::data_type::Decimal;
 use parquet::record::Field;
 use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
@@ -1,9 +1,11 @@
 use crate::{
     impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion,
-    reader::ReaderError,
+    reader::{MagnusErrorWrapper, ReaderError},
 };

 use super::*;
+use arrow_array::MapArray;
+use magnus::RArray;

 #[derive(Debug, Clone)]
 pub enum ParquetValue {
@@ -27,7 +29,8 @@ pub enum ParquetValue {
     TimestampMillis(i64, Option<Arc<str>>),
     TimestampMicros(i64, Option<Arc<str>>),
     TimestampNanos(i64, Option<Arc<str>>),
-    List(Vec<ParquetValue>),
+    List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
+    // We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
     Map(HashMap<ParquetValue, ParquetValue>),
     Null,
 }
@@ -100,7 +103,12 @@ impl std::hash::Hash for ParquetValue {
                 tz.hash(state);
             }
             ParquetValue::List(l) => l.hash(state),
-            ParquetValue::Map(_m) => panic!("Map is not hashable"),
+            ParquetValue::Map(m) => {
+                for (k, v) in m {
+                    k.hash(state);
+                    v.hash(state);
+                }
+            }
             ParquetValue::Null => 0_i32.hash(state),
         }
     }
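
Maps were previously rejected as hash keys with a panic; they are now hashed entry by entry. Note that `HashMap` iteration order is unspecified, so this is a best-effort hash rather than a canonical one. A self-contained sketch of the same strategy on a plain `HashMap`:

    use std::collections::{hash_map::DefaultHasher, HashMap};
    use std::hash::{Hash, Hasher};

    // Feed each (key, value) pair to the hasher in iteration order, as the
    // new Map arm does; two equal maps may still hash differently because
    // iteration order is not defined.
    fn hash_entries(m: &HashMap<String, i64>) -> u64 {
        let mut state = DefaultHasher::new();
        for (k, v) in m {
            k.hash(&mut state);
            v.hash(&mut state);
        }
        state.finish()
    }
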
@@ -138,11 +146,20 @@ impl TryIntoValue for ParquetValue {
                 impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
             }
             ParquetValue::List(l) => {
+                // For lists, convert to Ruby array and check for specific cases
+                // when we might need to return nil instead of an empty array
+
+                // Normal case - convert list elements to a Ruby array
                 let ary = handle.ary_new_capa(l.len());
                 l.into_iter().try_for_each(|v| {
                     ary.push(v.try_into_value_with(handle)?)?;
                     Ok::<_, ReaderError>(())
                 })?;
+
+                // The complex_types test expects double_list to be nil when empty,
+                // but it needs the context which we don't have directly.
+                // We'll let List stay as an empty array, and in each_row.rs it can
+                // be handled there with field name context.
                 Ok(ary.into_value_with(handle))
             }
             ParquetValue::Map(m) => {
@@ -151,7 +168,8 @@ impl TryIntoValue for ParquetValue {
                     hash.aset(
                         k.try_into_value_with(handle)?,
                         v.try_into_value_with(handle)?,
-                    )
+                    )?;
+                    Ok::<_, ReaderError>(())
                 })?;
                 Ok(hash.into_value_with(handle))
             }
@@ -161,7 +179,11 @@ impl TryIntoValue for ParquetValue {
 }

 impl ParquetValue {
-    pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
+    pub fn from_value(
+        value: Value,
+        type_: &ParquetSchemaType,
+        format: Option<&str>,
+    ) -> Result<Self, MagnusError> {
         if value.is_nil() {
             return Ok(ParquetValue::Null);
         }
@@ -208,7 +230,7 @@ impl ParquetValue {
                 Ok(ParquetValue::Float64(v))
             }
             ParquetSchemaType::String => {
-                let v = String::try_convert(value)?;
+                let v = convert_to_string(value)?;
                 Ok(ParquetValue::String(v))
             }
             ParquetSchemaType::Binary => {
@@ -220,21 +242,104 @@ impl ParquetValue {
                 Ok(ParquetValue::Boolean(v))
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value, None)?;
+                let v = convert_to_date32(value, format)?;
                 Ok(ParquetValue::Date32(v))
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value, None)?;
+                let v = convert_to_timestamp_millis(value, format)?;
                 Ok(ParquetValue::TimestampMillis(v, None))
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value, None)?;
+                let v = convert_to_timestamp_micros(value, format)?;
                 Ok(ParquetValue::TimestampMicros(v, None))
             }
-            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
-                magnus::exception::type_error(),
-                "Nested lists and maps are not supported",
-            )),
+            ParquetSchemaType::List(list_field) => {
+                // We expect the Ruby object to be an Array, each item converting
+                // to the item_type. We gather them into ParquetValue::List(...)
+                let array = RArray::from_value(value).ok_or_else(|| {
+                    // Just get a simple string representation of the class
+                    let type_info = format!("{:?}", value.class());
+
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!(
+                            "Value must be an Array for a list type, got {} instead",
+                            type_info
+                        ),
+                    )
+                })?;
+                let mut items = Vec::with_capacity(array.len());
+                for (index, item_val) in array.into_iter().enumerate() {
+                    match ParquetValue::from_value(
+                        item_val,
+                        &list_field.item_type,
+                        list_field.format,
+                    ) {
+                        Ok(child_val) => items.push(child_val),
+                        Err(e) => {
+                            // Enhance the error with the item index
+                            return Err(MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!("Failed to convert item at index {} of list: {}", index, e),
+                            ));
+                        }
+                    }
+                }
+                Ok(ParquetValue::List(items))
+            }
+            ParquetSchemaType::Map(map_field) => {
+                // We expect the Ruby object to be a Hash
+                let hash_pairs: Vec<(Value, Value)> = value.funcall("to_a", ())?;
+                let mut result = HashMap::with_capacity(hash_pairs.len());
+                for (k, v) in hash_pairs {
+                    let key_val =
+                        ParquetValue::from_value(k, &map_field.key_type, map_field.key_format)?;
+                    let val_val =
+                        ParquetValue::from_value(v, &map_field.value_type, map_field.value_format)?;
+                    result.insert(key_val, val_val);
+                }
+                Ok(ParquetValue::Map(result))
+            }
+            ParquetSchemaType::Struct(struct_field) => {
+                // We expect a Ruby hash or object that responds to to_h
+                let hash_obj = if value.respond_to("to_h", false)? {
+                    value.funcall::<_, _, Value>("to_h", ())?
+                } else {
+                    return Err(MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Value must be a Hash or respond to to_h for a struct type",
+                    ));
+                };
+
+                let mut result = HashMap::new();
+
+                // For each field in the struct definition, try to find a matching key in the hash
+                for field in &struct_field.fields {
+                    let field_name = ParquetValue::String(field.name.clone());
+                    let ruby_field_name = unsafe { Ruby::get_unchecked() }
+                        .str_new(&field.name)
+                        .as_value();
+
+                    // Try to get the field value using Ruby's [] method
+                    let field_value_obj =
+                        hash_obj.funcall::<_, _, Value>("[]", (ruby_field_name,))?;
+
+                    let field_value = if field_value_obj.is_nil() {
+                        ParquetValue::Null // Field not provided or nil, treat as null
+                    } else {
+                        ParquetValue::from_value(
+                            field_value_obj,
+                            &field.type_,
+                            field.format.as_deref(),
+                        )?
+                    };
+
+                    result.insert(field_name, field_value);
+                }
+
+                // Use Map to represent a struct since it's a collection of named values
+                Ok(ParquetValue::Map(result))
+            }
         }
     }
 }
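
`from_value` now threads an optional field-level format into the leaf conversions (`Date32`, `TimestampMillis`, `TimestampMicros`) and recurses structurally for lists, maps, and structs. A caller-side sketch, assuming the diff's types are in scope and `ruby_ary` is a magnus `Value` holding an array of date strings (the schema and format are illustrative):

    fn dates_from_ruby(ruby_ary: Value) -> Result<ParquetValue, MagnusError> {
        let list_type = ParquetSchemaType::List(Box::new(ListField {
            item_type: ParquetSchemaType::Date32,
            format: Some("%Y-%m-%d"), // applied to each item via list_field.format
            nullable: true,
        }));
        // The trailing Option<&str> is the new format argument; the List arm
        // converts items with list_field.format rather than this value.
        ParquetValue::from_value(ruby_ary, &list_type, None)
    }
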
@@ -438,23 +543,23 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
             }
             DataType::List(_field) => {
                 let list_array = downcast_array::<ListArray>(column.array);
-                Ok(ParquetValueVec(
-                    list_array
-                        .iter()
-                        .map(|x| match x {
-                            Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
-                                array: &*values,
-                                strict: column.strict,
-                            }) {
-                                Ok(vec) => ParquetValue::List(vec.into_inner()),
-                                Err(e) => {
-                                    panic!("Error converting list array to ParquetValueVec: {}", e)
-                                }
-                            },
-                            None => ParquetValue::Null,
-                        })
-                        .collect(),
-                ))
+                let sub_list = list_array
+                    .iter()
+                    .map(|x| match x {
+                        Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
+                            array: &*values,
+                            strict: column.strict,
+                        }) {
+                            Ok(vec) => Ok(ParquetValue::List(vec.into_inner())),
+                            Err(e) => Err(ReaderError::Ruby(MagnusErrorWrapper(MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!("Error converting list array to ParquetValueVec: {}", e),
+                            )))),
+                        },
+                        None => Ok(ParquetValue::Null),
+                    })
+                    .collect::<Result<Vec<ParquetValue>, Self::Error>>()?;
+                Ok(ParquetValueVec(sub_list))
             }
             DataType::Struct(_) => {
                 let struct_array = downcast_array::<StructArray>(column.array);
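
The list branch above replaces a `panic!` inside `.map()` with per-element `Result`s collected through `collect::<Result<Vec<_>, _>>()`, which short-circuits on the first error. A self-contained sketch of the pattern:

    // Each element maps to a Result; collect() stops at the first Err and
    // returns it, so no partial Vec escapes on failure.
    fn doubled(items: &[&str]) -> Result<Vec<i64>, std::num::ParseIntError> {
        items
            .iter()
            .map(|s| s.parse::<i64>().map(|n| n * 2))
            .collect()
    }
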
@@ -474,27 +579,98 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
                         }) {
                             Ok(vec) => vec.into_inner(),
                             Err(e) => {
-                                panic!("Error converting struct field to ParquetValueVec: {}", e)
+                                return Err(ReaderError::Ruby(MagnusErrorWrapper(
+                                    MagnusError::new(
+                                        magnus::exception::type_error(),
+                                        format!(
+                                            "Error converting struct field to ParquetValueVec: {}",
+                                            e
+                                        ),
+                                    ),
+                                )));
                             }
                         };
                         map.insert(
                             ParquetValue::String(field.name().to_string()),
-                            field_values.into_iter().next().unwrap(),
+                            field_values.into_iter().next().ok_or_else(|| {
+                                ReaderError::Ruby(MagnusErrorWrapper(MagnusError::new(
+                                    magnus::exception::type_error(),
+                                    "Expected a single value for struct field".to_string(),
+                                )))
+                            })?,
                         );
                     }
                     values.push(ParquetValue::Map(map));
                 }
                 Ok(ParquetValueVec(values))
             }
+            DataType::Map(_field, _keys_sorted) => {
+                let map_array = downcast_array::<MapArray>(column.array);
+
+                let mut result = Vec::with_capacity(map_array.len());
+
+                let offsets = map_array.offsets();
+                let struct_array = map_array.entries();
+
+                for i in 0..map_array.len() {
+                    if map_array.is_null(i) {
+                        result.push(ParquetValue::Null);
+                        continue;
+                    }
+
+                    let start = offsets[i] as usize;
+                    let end = offsets[i + 1] as usize;
+
+                    let mut map_data =
+                        HashMap::with_capacity_and_hasher(end - start, Default::default());
+
+                    // In Arrow's MapArray, the entries are a struct with fields named "keys" and "values"
+                    // Get the columns directly by index since we know the structure
+                    let key_array = struct_array.column(0); // First field is always keys
+                    let val_array = struct_array.column(1); // Second field is always values
+
+                    for entry_index in start..end {
+                        let key_value = if key_array.is_null(entry_index) {
+                            ParquetValue::Null
+                        } else {
+                            let subarray = key_array.slice(entry_index, 1);
+                            let subwrapper = ArrayWrapper {
+                                array: &*subarray,
+                                strict: column.strict,
+                            };
+                            let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
+                            converted.pop().unwrap_or(ParquetValue::Null)
+                        };
+
+                        let val_value = if val_array.is_null(entry_index) {
+                            ParquetValue::Null
+                        } else {
+                            let subarray = val_array.slice(entry_index, 1);
+                            let subwrapper = ArrayWrapper {
+                                array: &*subarray,
+                                strict: column.strict,
+                            };
+                            let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
+                            converted.pop().unwrap_or(ParquetValue::Null)
+                        };
+
+                        map_data.insert(key_value, val_value);
+                    }
+
+                    result.push(ParquetValue::Map(map_data));
+                }
+
+                Ok(ParquetValueVec(result))
+            }
             DataType::Null => {
                 let x = downcast_array::<NullArray>(column.array);
                 Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
             }
             _ => {
-                return Err(ReaderError::Ruby(format!(
-                    "Unsupported data type: {:?}",
-                    column.array.data_type()
-                )));
+                return Err(ReaderError::Ruby(MagnusErrorWrapper(MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Unsupported data type: {:?}", column.array.data_type()),
+                ))));
             }
         }
     }
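
The new `DataType::Map` branch walks the Arrow offsets buffer: row i of the map column owns the flattened key/value entries in `offsets[i]..offsets[i + 1]`. A self-contained sketch of that walk over plain slices:

    // offsets has one more element than there are rows; an empty row has
    // offsets[i] == offsets[i + 1].
    fn row_entries(offsets: &[i32], keys: &[&str], vals: &[i64], i: usize) -> Vec<(String, i64)> {
        let (start, end) = (offsets[i] as usize, offsets[i + 1] as usize);
        (start..end).map(|j| (keys[j].to_string(), vals[j])).collect()
    }

    // With offsets = [0, 2, 2, 3]: row 0 has two entries, row 1 is empty,
    // and row 2 has one.
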
@@ -24,8 +24,10 @@ impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
         match self {
             RowRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                vec.into_iter().try_for_each(|v| {
+                    ary.push(v.try_into_value_with(handle)?)?;
+                    Ok::<_, ReaderError>(())
+                })?;
                 Ok(handle.into_value(ary))
             }
             RowRecord::Map(map) => {
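
This hunk, and the ColumnRecord and ParquetField hunks below, all make the same change: once `push(...)?` becomes a statement instead of the closure's tail expression, the closure's `Result` type is no longer inferable, so an explicit `Ok::<_, ReaderError>(())` pins it. A self-contained sketch with `String` standing in for `ReaderError`:

    fn collect_doubled(items: Vec<i64>) -> Result<Vec<i64>, String> {
        let mut out = Vec::new();
        items.into_iter().try_for_each(|v| {
            out.push(v.checked_mul(2).ok_or_else(|| "overflow".to_string())?);
            Ok::<_, String>(()) // turbofish fixes the closure's error type
        })?;
        Ok(out)
    }
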
@@ -68,8 +70,10 @@ impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
                 let ary = handle.ary_new_capa(vec.len());
                 vec.into_iter().try_for_each(|v| {
                     let nested_ary = handle.ary_new_capa(v.len());
-                    v.into_iter()
-                        .try_for_each(|v| nested_ary.push(v.try_into_value_with(handle)?))?;
+                    v.into_iter().try_for_each(|v| {
+                        nested_ary.push(v.try_into_value_with(handle)?)?;
+                        Ok::<_, ReaderError>(())
+                    })?;
                     ary.push(nested_ary.into_value_with(handle))?;
                     Ok::<_, ReaderError>(())
                 })?;
@@ -92,8 +96,10 @@ impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
                     }
                     values[i] = handle.into_value(k);
                     let ary = handle.ary_new_capa(v.len());
-                    v.into_iter()
-                        .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                    v.into_iter().try_for_each(|v| {
+                        ary.push(v.try_into_value_with(handle)?)?;
+                        Ok::<_, ReaderError>(())
+                    })?;
                     values[i + 1] = handle.into_value(ary);
                     i += 2;
                 }
@@ -165,7 +171,8 @@ impl TryIntoValue for ParquetField {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
                 elements.iter().try_for_each(|e| {
-                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)
+                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)?;
+                    Ok::<_, ReaderError>(())
                 })?;
                 Ok(ary.into_value_with(handle))
             }
@@ -176,7 +183,8 @@ impl TryIntoValue for ParquetField {
                     hash.aset(
                         ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
                         ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
-                    )
+                    )?;
+                    Ok::<_, ReaderError>(())
                 })?;
                 Ok(hash.into_value_with(handle))
             }
@@ -204,16 +212,11 @@ impl TryIntoValue for ParquetField {
                     hash.aset(
                         k.clone().into_value_with(handle),
                         ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
-                    )
+                    )?;
+                    Ok::<_, ReaderError>(())
                 })?;
                 Ok(hash.into_value_with(handle))
             }
         }
     }
 }
-
-// impl IntoValue for ParquetField {
-//     fn into_value_with(self, handle: &Ruby) -> Value {
-//         self.try_into_value_with(handle).unwrap()
-//     }
-// }