parquet 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,11 +12,14 @@ use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
 use ordered_float::OrderedFloat;
+use parquet::basic::LogicalType;
+use parquet::schema::types::Type;
 use std::sync::Arc;

 /// Convert a single value from an Arrow array at the given index to a ParquetValue
 pub fn arrow_to_parquet_value(
-    field: &Field,
+    arrow_field: &Field,
+    parquet_field: &Type,
     array: &dyn Array,
     index: usize,
 ) -> Result<ParquetValue> {
@@ -91,13 +94,20 @@ pub fn arrow_to_parquet_value(
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
             let value = array.value(index);
-            match field.try_extension_type::<ArrowUuid>() {
-                Ok(_) => {
-                    let uuid = uuid::Uuid::from_slice(value)
-                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
-                    Ok(ParquetValue::Uuid(uuid))
+            if let Some(LogicalType::Uuid) = parquet_field.get_basic_info().logical_type() {
+                let uuid = uuid::Uuid::from_slice(value)
+                    .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                Ok(ParquetValue::Uuid(uuid))
+            } else {
+                match arrow_field.try_extension_type::<ArrowUuid>() {
+                    Ok(_) => {
+                        let uuid = uuid::Uuid::from_slice(value).map_err(|e| {
+                            ParquetError::Conversion(format!("Invalid UUID: {}", e))
+                        })?;
+                        Ok(ParquetValue::Uuid(uuid))
+                    }
+                    Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
                 }
-                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
             }
         }

@@ -192,8 +202,43 @@ pub fn arrow_to_parquet_value(
             let list_values = array.value(index);

             let mut values = Vec::with_capacity(list_values.len());
+
+            // Get the list's element type from parquet schema
+            let element_type = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => {
+                    // List has a repeated group containing the element
+                    // The structure is: LIST -> repeated group -> element
+                    if let Some(repeated_group) = fields.first() {
+                        match repeated_group.as_ref() {
+                            parquet::schema::types::Type::GroupType {
+                                fields: inner_fields,
+                                ..
+                            } => {
+                                // This is the repeated group, get the actual element
+                                inner_fields.first().ok_or_else(|| {
+                                    ParquetError::Conversion(
+                                        "List repeated group missing element field".to_string(),
+                                    )
+                                })?
+                            }
+                            _ => repeated_group, // If it's not a group, use it directly
+                        }
+                    } else {
+                        return Err(ParquetError::Conversion(
+                            "List type missing fields".to_string(),
+                        ));
+                    }
+                }
+                _ => parquet_field, // Fallback for cases where it's not a proper list structure
+            };
+
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
+                values.push(arrow_to_parquet_value(
+                    item_field,
+                    element_type,
+                    &list_values,
+                    i,
+                )?);
             }

             Ok(ParquetValue::List(values))
@@ -210,7 +255,7 @@ pub fn arrow_to_parquet_value(
                 .fields()
                 .iter()
                 .find(|f| f.name() == "key")
-                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+                .ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;

             let value_field = map_value
                 .fields()
@@ -219,9 +264,59 @@ pub fn arrow_to_parquet_value(
                 .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;

             let mut map_vec = Vec::with_capacity(keys.len());
+
+            // Get key and value types from parquet schema
+            // Map structure is: MAP -> key_value (repeated group) -> key, value
+            let (key_type, value_type) = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => {
+                    // Get the key_value repeated group
+                    match fields.first() {
+                        Some(key_value_group) => match key_value_group.as_ref() {
+                            parquet::schema::types::Type::GroupType {
+                                fields: kv_fields, ..
+                            } => {
+                                // Find key and value fields by name
+                                let key_field = kv_fields
+                                    .iter()
+                                    .find(|f| f.name() == "key")
+                                    .ok_or_else(|| {
+                                        ParquetError::Conversion(
+                                            "Map missing key field".to_string(),
+                                        )
+                                    })?;
+                                let value_field = kv_fields
+                                    .iter()
+                                    .find(|f| f.name() == "value")
+                                    .ok_or_else(|| {
+                                        ParquetError::Conversion(
+                                            "Map missing value field".to_string(),
+                                        )
+                                    })?;
+                                (key_field.as_ref(), value_field.as_ref())
+                            }
+                            _ => {
+                                return Err(ParquetError::Conversion(
+                                    "Map key_value should be a group".to_string(),
+                                ))
+                            }
+                        },
+                        None => {
+                            return Err(ParquetError::Conversion(
+                                "Map type missing key_value field".to_string(),
+                            ))
+                        }
+                    }
+                }
+                _ => {
+                    return Err(ParquetError::Conversion(
+                        "Map type must be a group".to_string(),
+                    ))
+                }
+            };
+
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(key_field, keys, i)?;
-                let value = arrow_to_parquet_value(value_field, values, i)?;
+                let key = arrow_to_parquet_value(key_field, key_type, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, value_type, values, i)?;
                 map_vec.push((key, value));
             }

@@ -231,10 +326,34 @@ pub fn arrow_to_parquet_value(
             let array = downcast_array::<StructArray>(array)?;

             let mut map = IndexMap::new();
-            for (col_idx, field) in array.fields().iter().enumerate() {
+
+            // Get struct fields from parquet schema
+            let parquet_fields = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                _ => {
+                    return Err(ParquetError::Conversion(
+                        "Struct type must be a group".to_string(),
+                    ))
+                }
+            };
+
+            for (col_idx, arrow_field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(field, column, index)?;
-                map.insert(Arc::from(field.name().as_str()), value);
+
+                // Find matching parquet field by name
+                let nested_parquet_field = parquet_fields
+                    .iter()
+                    .find(|f| f.name() == arrow_field.name())
+                    .ok_or_else(|| {
+                        ParquetError::Conversion(format!(
+                            "No matching parquet field for struct field '{}'",
+                            arrow_field.name()
+                        ))
+                    })?;
+
+                let value =
+                    arrow_to_parquet_value(arrow_field, nested_parquet_field, column, index)?;
+                map.insert(Arc::from(arrow_field.name().as_str()), value);
             }

             Ok(ParquetValue::Record(map))
@@ -1121,6 +1240,7 @@ pub fn append_parquet_value_to_builder(
 mod tests {
     use super::*;
     use arrow_array::*;
+    use parquet::basic::Type as PhysicalType;

     #[test]
     fn test_primitive_conversion_roundtrip() {
@@ -1132,9 +1252,12 @@ mod tests {
         ];
         let field = Field::new("test", DataType::Boolean, true);
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
+        let type_ = Type::primitive_type_builder("test", PhysicalType::BOOLEAN)
+            .build()
+            .unwrap();

         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, &type_, array.as_ref(), i).unwrap();
             assert_eq!(&actual, expected);
         }
     }
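The conversion-side change above is that `arrow_to_parquet_value` now receives the Parquet schema node alongside the Arrow field, so UUID detection can key off the Parquet `LogicalType` annotation rather than only the Arrow extension type. A minimal standalone sketch of that check (not part of the package), assuming only the `parquet` crate's public schema builder; the column name `id` is illustrative:

```rust
use parquet::basic::{LogicalType, Type as PhysicalType};
use parquet::schema::types::Type;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A FIXED_LEN_BYTE_ARRAY(16) column annotated as UUID in the Parquet schema.
    let uuid_column = Type::primitive_type_builder("id", PhysicalType::FIXED_LEN_BYTE_ARRAY)
        .with_length(16)
        .with_logical_type(Some(LogicalType::Uuid))
        .build()?;

    // Mirrors the branch added in the diff: consult the Parquet logical type first;
    // only columns without it fall back to the Arrow extension type or raw bytes.
    let is_uuid = matches!(
        uuid_column.get_basic_info().logical_type(),
        Some(LogicalType::Uuid)
    );
    assert!(is_uuid);
    Ok(())
}
```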
@@ -1,10 +1,10 @@
 //! Core Parquet reading functionality

-use crate::{arrow_conversion::arrow_to_parquet_value, ParquetValue, Result};
+use crate::{arrow_conversion::arrow_to_parquet_value, ParquetError, ParquetValue, Result};
 use arrow::record_batch::RecordBatch;
 use arrow_array::Array;
 use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
-use parquet::file::metadata::FileMetaData;
+use parquet::file::metadata::{FileMetaData, ParquetMetaData};
 use std::sync::Arc;

 /// Core Parquet reader that works with any source implementing Read + Seek
@@ -33,10 +33,12 @@ where
     /// Returns an iterator over rows where each row is a vector of ParquetValues
     pub fn read_rows(self) -> Result<RowIterator<R>> {
         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;

         Ok(RowIterator {
             batch_reader: reader,
+            metadata,
             current_batch: None,
             current_row: 0,
             _phantom: std::marker::PhantomData,
@@ -64,10 +66,12 @@ where

         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
         builder = builder.with_projection(mask);
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;

         Ok(RowIterator {
             batch_reader: reader,
+            metadata,
             current_batch: None,
             current_row: 0,
             _phantom: std::marker::PhantomData,
@@ -88,10 +92,12 @@ where
         }

         let schema = builder.schema().clone();
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;

         Ok(ColumnIterator {
             batch_reader: reader,
+            metadata,
             schema,
             returned_empty_batch: false,
             is_empty_file: is_empty,
@@ -129,10 +135,12 @@ where
         }

         let schema = builder.schema().clone();
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;

         Ok(ColumnIterator {
             batch_reader: reader,
+            metadata,
             schema,
             returned_empty_batch: false,
             is_empty_file: is_empty,
@@ -144,6 +152,7 @@ where
 /// Iterator over rows in a Parquet file
 pub struct RowIterator<R> {
     batch_reader: ParquetRecordBatchReader,
+    metadata: Arc<ParquetMetaData>,
     current_batch: Option<RecordBatch>,
     current_row: usize,
     _phantom: std::marker::PhantomData<R>,
@@ -156,6 +165,7 @@ where
     type Item = Result<Vec<ParquetValue>>;

     fn next(&mut self) -> Option<Self::Item> {
+        let schema_descriptor = self.metadata.file_metadata().schema_descr_ptr();
         loop {
             // If we have a current batch and haven't exhausted it
             if let Some(ref batch) = self.current_batch {
@@ -164,9 +174,31 @@ where
                     let mut row_values = Vec::with_capacity(batch.num_columns());

                     let schema = batch.schema();
+
+                    let root_schema = schema_descriptor.root_schema();
+                    let parquet_fields = match root_schema {
+                        parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                        _ => {
+                            return Some(Err(ParquetError::Conversion(
+                                "Root schema must be a group type".to_string(),
+                            )))
+                        }
+                    };
+
                     for (i, column) in batch.columns().iter().enumerate() {
                         let field = schema.field(i);
-                        let value = match arrow_to_parquet_value(field, column, self.current_row) {
+                        let parquet_field = if i < parquet_fields.len() {
+                            parquet_fields[i].clone()
+                        } else {
+                            // Fallback to leaf column if index out of bounds
+                            schema_descriptor.column(i).self_type_ptr()
+                        };
+                        let value = match arrow_to_parquet_value(
+                            field,
+                            &parquet_field,
+                            column,
+                            self.current_row,
+                        ) {
                             Ok(v) => v,
                             Err(e) => return Some(Err(e)),
                         };
@@ -194,6 +226,7 @@ where
 /// Iterator over column batches in a Parquet file
 pub struct ColumnIterator<R> {
     batch_reader: ParquetRecordBatchReader,
+    metadata: Arc<ParquetMetaData>,
     schema: Arc<arrow_schema::Schema>,
     returned_empty_batch: bool,
     is_empty_file: bool,
@@ -228,6 +261,17 @@ where
         match self.batch_reader.next() {
             Some(Ok(batch)) => {
                 let mut columns = Vec::with_capacity(batch.num_columns());
+                let schema_descriptor = self.metadata.file_metadata().schema_descr_ptr();
+
+                let root_schema = schema_descriptor.root_schema();
+                let parquet_fields = match root_schema {
+                    parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                    _ => {
+                        return Some(Err(ParquetError::Conversion(
+                            "Root schema must be a group type".to_string(),
+                        )))
+                    }
+                };

                 for (idx, column) in batch.columns().iter().enumerate() {
                     let field = self.schema.field(idx);
@@ -236,7 +280,13 @@ where
                     // Convert entire column to ParquetValues
                     let mut values = Vec::with_capacity(column.len());
                     for row_idx in 0..column.len() {
-                        match arrow_to_parquet_value(field, column, row_idx) {
+                        let parquet_field = if idx < parquet_fields.len() {
+                            parquet_fields[idx].clone()
+                        } else {
+                            // Fallback to leaf column if index out of bounds
+                            schema_descriptor.column(idx).self_type_ptr()
+                        };
+                        match arrow_to_parquet_value(field, &parquet_field, column, row_idx) {
                             Ok(value) => values.push(value),
                             Err(e) => return Some(Err(e)),
                         }
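The reader-side hunks above all follow one pattern: keep the `ParquetMetaData` obtained from the builder, then look up each top-level column's Parquet type in the root group before converting values. A standalone sketch of that access path (not part of the package), assuming only the `parquet` crate's public API; the file name `example.parquet` is illustrative:

```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::schema::types::Type;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Clone the metadata before consuming the builder, as the iterators above do.
    let metadata = builder.metadata().clone();
    let schema_descriptor = metadata.file_metadata().schema_descr_ptr();

    // The root schema is a group whose children are the top-level columns.
    if let Type::GroupType { fields, .. } = schema_descriptor.root_schema() {
        for field in fields {
            println!("{}: {:?}", field.name(), field.get_basic_info().logical_type());
        }
    }
    Ok(())
}
```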
@@ -3,6 +3,7 @@ use arrow_schema::{DataType, Field, TimeUnit};
 use bytes::Bytes;
 use num::BigInt;
 use ordered_float::OrderedFloat;
+use parquet::schema::types::Type;
 use parquet_core::arrow_conversion::{arrow_to_parquet_value, parquet_values_to_arrow_array};
 use parquet_core::*;
 use std::sync::Arc;
@@ -99,7 +100,19 @@ fn test_decimal256_large_values() {

     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+        // Create a dummy parquet type for testing
+        let parquet_type =
+            Type::primitive_type_builder("test", parquet::basic::Type::FIXED_LEN_BYTE_ARRAY)
+                .with_length(32)
+                .with_precision(76)
+                .with_scale(0)
+                .with_logical_type(Some(parquet::basic::LogicalType::Decimal {
+                    scale: 0,
+                    precision: 76,
+                }))
+                .build()
+                .unwrap();
+        let value = arrow_to_parquet_value(&field, &parquet_type, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +186,15 @@ fn test_timestamp_with_timezone() {

     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+        // Create a dummy parquet type for testing
+        let parquet_type = Type::primitive_type_builder("test", parquet::basic::Type::INT64)
+            .with_logical_type(Some(parquet::basic::LogicalType::Timestamp {
+                is_adjusted_to_u_t_c: true,
+                unit: parquet::basic::TimeUnit::MILLIS(Default::default()),
+            }))
+            .build()
+            .unwrap();
+        let value = arrow_to_parquet_value(&field, &parquet_type, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +230,19 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);

     // Verify roundtrip
-    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
+    // Create a dummy parquet type for testing - a list of list of int32
+    let int_type = Type::primitive_type_builder("item", parquet::basic::Type::INT32)
+        .build()
+        .unwrap();
+    let inner_list = Type::group_type_builder("inner_list")
+        .with_fields(vec![Arc::new(int_type)])
+        .build()
+        .unwrap();
+    let parquet_type = Type::group_type_builder("outer_list")
+        .with_fields(vec![Arc::new(inner_list)])
+        .build()
+        .unwrap();
+    let value = arrow_to_parquet_value(&outer_field, &parquet_type, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +390,16 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();

-    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
+    // Create a dummy parquet type for testing
+    let parquet_type = Type::primitive_type_builder("int", parquet::basic::Type::INT32)
+        .build()
+        .unwrap();
+    let result = arrow_to_parquet_value(
+        &Field::new("int", DataType::Int32, false),
+        &parquet_type,
+        &array,
+        0,
+    );
     assert!(result.is_err());
     assert!(result
         .unwrap_err()

@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.7.2"
+  VERSION = "0.7.3"
 end

metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.7.2
+  version: 0.7.3
 platform: ruby
 authors:
 - Nathan Jaremko