parquet 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,6 @@
1
1
  use std::str::FromStr;
2
2
  use std::sync::Arc;
3
3
 
4
- use crate::reader::ReaderError;
5
-
6
4
  use super::*;
7
5
  use arrow_array::builder::MapFieldNames;
8
6
  use arrow_array::builder::*;
@@ -19,8 +17,7 @@ where
19
17
  T: TryConvert + FromStr,
20
18
  <T as FromStr>::Err: std::fmt::Display,
21
19
  {
22
- pub fn convert_with_string_fallback(value: Value) -> Result<T, MagnusError> {
23
- let ruby = unsafe { Ruby::get_unchecked() };
20
+ pub fn convert_with_string_fallback(ruby: &Ruby, value: Value) -> Result<T, MagnusError> {
24
21
  if value.is_kind_of(ruby.class_string()) {
25
22
  let s = String::try_convert(value)?;
26
23
  s.trim().parse::<T>().map_err(|e| {
@@ -35,8 +32,11 @@ where
35
32
  }
36
33
  }
37
34
 
38
- pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
39
- let ruby = unsafe { Ruby::get_unchecked() };
35
+ pub fn convert_to_date32(
36
+ ruby: &Ruby,
37
+ value: Value,
38
+ format: Option<&str>,
39
+ ) -> Result<i32, MagnusError> {
40
40
  if value.is_kind_of(ruby.class_string()) {
41
41
  let s = String::try_convert(value)?;
42
42
  // Parse string into Date using jiff
@@ -91,8 +91,11 @@ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, Magn
91
91
  }
92
92
  }
93
93
 
94
- pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
95
- let ruby = unsafe { Ruby::get_unchecked() };
94
+ pub fn convert_to_timestamp_millis(
95
+ ruby: &Ruby,
96
+ value: Value,
97
+ format: Option<&str>,
98
+ ) -> Result<i64, MagnusError> {
96
99
  if value.is_kind_of(ruby.class_string()) {
97
100
  let s = String::try_convert(value)?;
98
101
  // Parse string into Timestamp using jiff
@@ -138,8 +141,11 @@ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result
138
141
  }
139
142
  }
140
143
 
141
- pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
142
- let ruby = unsafe { Ruby::get_unchecked() };
144
+ pub fn convert_to_timestamp_micros(
145
+ ruby: &Ruby,
146
+ value: Value,
147
+ format: Option<&str>,
148
+ ) -> Result<i64, MagnusError> {
143
149
  if value.is_kind_of(ruby.class_string()) {
144
150
  let s = String::try_convert(value)?;
145
151
  // Parse string into Timestamp using jiff
@@ -189,8 +195,7 @@ pub fn convert_to_binary(value: Value) -> Result<Vec<u8>, MagnusError> {
189
195
  Ok(unsafe { value.to_r_string()?.as_slice() }.to_vec())
190
196
  }
191
197
 
192
- pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
193
- let ruby = unsafe { Ruby::get_unchecked() };
198
+ pub fn convert_to_boolean(ruby: &Ruby, value: Value) -> Result<bool, MagnusError> {
194
199
  if value.is_kind_of(ruby.class_string()) {
195
200
  let s = String::try_convert(value)?;
196
201
  s.trim().parse::<bool>().map_err(|e| {
@@ -226,23 +231,24 @@ pub fn parquet_schema_type_to_arrow_data_type(
226
231
  schema_type: &ParquetSchemaType,
227
232
  ) -> Result<DataType, MagnusError> {
228
233
  Ok(match schema_type {
229
- ParquetSchemaType::Int8 => DataType::Int8,
230
- ParquetSchemaType::Int16 => DataType::Int16,
231
- ParquetSchemaType::Int32 => DataType::Int32,
232
- ParquetSchemaType::Int64 => DataType::Int64,
233
- ParquetSchemaType::UInt8 => DataType::UInt8,
234
- ParquetSchemaType::UInt16 => DataType::UInt16,
235
- ParquetSchemaType::UInt32 => DataType::UInt32,
236
- ParquetSchemaType::UInt64 => DataType::UInt64,
237
- ParquetSchemaType::Float => DataType::Float32,
238
- ParquetSchemaType::Double => DataType::Float64,
239
- ParquetSchemaType::String => DataType::Utf8,
240
- ParquetSchemaType::Binary => DataType::Binary,
241
- ParquetSchemaType::Boolean => DataType::Boolean,
242
- ParquetSchemaType::Date32 => DataType::Date32,
243
- ParquetSchemaType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
244
- ParquetSchemaType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
245
-
234
+ ParquetSchemaType::Primitive(primative) => match primative {
235
+ PrimitiveType::Int8 => DataType::Int8,
236
+ PrimitiveType::Int16 => DataType::Int16,
237
+ PrimitiveType::Int32 => DataType::Int32,
238
+ PrimitiveType::Int64 => DataType::Int64,
239
+ PrimitiveType::UInt8 => DataType::UInt8,
240
+ PrimitiveType::UInt16 => DataType::UInt16,
241
+ PrimitiveType::UInt32 => DataType::UInt32,
242
+ PrimitiveType::UInt64 => DataType::UInt64,
243
+ PrimitiveType::Float32 => DataType::Float32,
244
+ PrimitiveType::Float64 => DataType::Float64,
245
+ PrimitiveType::String => DataType::Utf8,
246
+ PrimitiveType::Binary => DataType::Binary,
247
+ PrimitiveType::Boolean => DataType::Boolean,
248
+ PrimitiveType::Date32 => DataType::Date32,
249
+ PrimitiveType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
250
+ PrimitiveType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
251
+ },
246
252
  // For a List<T>, create a standard List in Arrow with nullable items
247
253
  ParquetSchemaType::List(list_field) => {
248
254
  let child_type = parquet_schema_type_to_arrow_data_type(&list_field.item_type)?;
@@ -325,27 +331,55 @@ macro_rules! impl_timestamp_array_conversion {
325
331
  fn create_arrow_builder_for_type(
326
332
  type_: &ParquetSchemaType,
327
333
  capacity: Option<usize>,
328
- ) -> Result<Box<dyn ArrayBuilder>, ReaderError> {
334
+ ) -> Result<Box<dyn ArrayBuilder>, ParquetGemError> {
329
335
  let cap = capacity.unwrap_or(1); // Default to at least capacity 1 to avoid empty builders
330
336
  match type_ {
331
- ParquetSchemaType::Int8 => Ok(Box::new(Int8Builder::with_capacity(cap))),
332
- ParquetSchemaType::Int16 => Ok(Box::new(Int16Builder::with_capacity(cap))),
333
- ParquetSchemaType::Int32 => Ok(Box::new(Int32Builder::with_capacity(cap))),
334
- ParquetSchemaType::Int64 => Ok(Box::new(Int64Builder::with_capacity(cap))),
335
- ParquetSchemaType::UInt8 => Ok(Box::new(UInt8Builder::with_capacity(cap))),
336
- ParquetSchemaType::UInt16 => Ok(Box::new(UInt16Builder::with_capacity(cap))),
337
- ParquetSchemaType::UInt32 => Ok(Box::new(UInt32Builder::with_capacity(cap))),
338
- ParquetSchemaType::UInt64 => Ok(Box::new(UInt64Builder::with_capacity(cap))),
339
- ParquetSchemaType::Float => Ok(Box::new(Float32Builder::with_capacity(cap))),
340
- ParquetSchemaType::Double => Ok(Box::new(Float64Builder::with_capacity(cap))),
341
- ParquetSchemaType::String => Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32))),
342
- ParquetSchemaType::Binary => Ok(Box::new(BinaryBuilder::with_capacity(cap, cap * 32))),
343
- ParquetSchemaType::Boolean => Ok(Box::new(BooleanBuilder::with_capacity(cap))),
344
- ParquetSchemaType::Date32 => Ok(Box::new(Date32Builder::with_capacity(cap))),
345
- ParquetSchemaType::TimestampMillis => {
337
+ ParquetSchemaType::Primitive(PrimitiveType::Int8) => {
338
+ Ok(Box::new(Int8Builder::with_capacity(cap)))
339
+ }
340
+ ParquetSchemaType::Primitive(PrimitiveType::Int16) => {
341
+ Ok(Box::new(Int16Builder::with_capacity(cap)))
342
+ }
343
+ ParquetSchemaType::Primitive(PrimitiveType::Int32) => {
344
+ Ok(Box::new(Int32Builder::with_capacity(cap)))
345
+ }
346
+ ParquetSchemaType::Primitive(PrimitiveType::Int64) => {
347
+ Ok(Box::new(Int64Builder::with_capacity(cap)))
348
+ }
349
+ ParquetSchemaType::Primitive(PrimitiveType::UInt8) => {
350
+ Ok(Box::new(UInt8Builder::with_capacity(cap)))
351
+ }
352
+ ParquetSchemaType::Primitive(PrimitiveType::UInt16) => {
353
+ Ok(Box::new(UInt16Builder::with_capacity(cap)))
354
+ }
355
+ ParquetSchemaType::Primitive(PrimitiveType::UInt32) => {
356
+ Ok(Box::new(UInt32Builder::with_capacity(cap)))
357
+ }
358
+ ParquetSchemaType::Primitive(PrimitiveType::UInt64) => {
359
+ Ok(Box::new(UInt64Builder::with_capacity(cap)))
360
+ }
361
+ ParquetSchemaType::Primitive(PrimitiveType::Float32) => {
362
+ Ok(Box::new(Float32Builder::with_capacity(cap)))
363
+ }
364
+ ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
365
+ Ok(Box::new(Float64Builder::with_capacity(cap)))
366
+ }
367
+ ParquetSchemaType::Primitive(PrimitiveType::String) => {
368
+ Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
369
+ }
370
+ ParquetSchemaType::Primitive(PrimitiveType::Binary) => {
371
+ Ok(Box::new(BinaryBuilder::with_capacity(cap, cap * 32)))
372
+ }
373
+ ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
374
+ Ok(Box::new(BooleanBuilder::with_capacity(cap)))
375
+ }
376
+ ParquetSchemaType::Primitive(PrimitiveType::Date32) => {
377
+ Ok(Box::new(Date32Builder::with_capacity(cap)))
378
+ }
379
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => {
346
380
  Ok(Box::new(TimestampMillisecondBuilder::with_capacity(cap)))
347
381
  }
348
- ParquetSchemaType::TimestampMicros => {
382
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
349
383
  Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
350
384
  }
351
385
  ParquetSchemaType::List(list_field) => {
@@ -592,9 +626,9 @@ fn fill_builder(
592
626
  // ------------------
593
627
  // PRIMITIVE SCALARS - delegated to specialized helpers
594
628
  // ------------------
595
- ParquetSchemaType::Int8 => fill_int8_builder(builder, values),
596
- ParquetSchemaType::Int16 => fill_int16_builder(builder, values),
597
- ParquetSchemaType::Int32 => {
629
+ ParquetSchemaType::Primitive(PrimitiveType::Int8) => fill_int8_builder(builder, values),
630
+ ParquetSchemaType::Primitive(PrimitiveType::Int16) => fill_int16_builder(builder, values),
631
+ ParquetSchemaType::Primitive(PrimitiveType::Int32) => {
598
632
  let typed_builder = builder
599
633
  .as_any_mut()
600
634
  .downcast_mut::<Int32Builder>()
@@ -624,7 +658,7 @@ fn fill_builder(
624
658
  }
625
659
  Ok(())
626
660
  }
627
- ParquetSchemaType::Int64 => {
661
+ ParquetSchemaType::Primitive(PrimitiveType::Int64) => {
628
662
  let typed_builder = builder
629
663
  .as_any_mut()
630
664
  .downcast_mut::<Int64Builder>()
@@ -643,7 +677,7 @@ fn fill_builder(
643
677
  }
644
678
  Ok(())
645
679
  }
646
- ParquetSchemaType::UInt8 => {
680
+ ParquetSchemaType::Primitive(PrimitiveType::UInt8) => {
647
681
  let typed_builder = builder
648
682
  .as_any_mut()
649
683
  .downcast_mut::<UInt8Builder>()
@@ -672,7 +706,7 @@ fn fill_builder(
672
706
  }
673
707
  Ok(())
674
708
  }
675
- ParquetSchemaType::UInt16 => {
709
+ ParquetSchemaType::Primitive(PrimitiveType::UInt16) => {
676
710
  let typed_builder = builder
677
711
  .as_any_mut()
678
712
  .downcast_mut::<UInt16Builder>()
@@ -701,7 +735,7 @@ fn fill_builder(
701
735
  }
702
736
  Ok(())
703
737
  }
704
- ParquetSchemaType::UInt32 => {
738
+ ParquetSchemaType::Primitive(PrimitiveType::UInt32) => {
705
739
  let typed_builder = builder
706
740
  .as_any_mut()
707
741
  .downcast_mut::<UInt32Builder>()
@@ -730,7 +764,7 @@ fn fill_builder(
730
764
  }
731
765
  Ok(())
732
766
  }
733
- ParquetSchemaType::UInt64 => {
767
+ ParquetSchemaType::Primitive(PrimitiveType::UInt64) => {
734
768
  let typed_builder = builder
735
769
  .as_any_mut()
736
770
  .downcast_mut::<UInt64Builder>()
@@ -759,7 +793,7 @@ fn fill_builder(
759
793
  }
760
794
  Ok(())
761
795
  }
762
- ParquetSchemaType::Float => {
796
+ ParquetSchemaType::Primitive(PrimitiveType::Float32) => {
763
797
  let typed_builder = builder
764
798
  .as_any_mut()
765
799
  .downcast_mut::<Float32Builder>()
@@ -779,7 +813,7 @@ fn fill_builder(
779
813
  }
780
814
  Ok(())
781
815
  }
782
- ParquetSchemaType::Double => {
816
+ ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
783
817
  let typed_builder = builder
784
818
  .as_any_mut()
785
819
  .downcast_mut::<Float64Builder>()
@@ -800,7 +834,7 @@ fn fill_builder(
800
834
  }
801
835
  Ok(())
802
836
  }
803
- ParquetSchemaType::Boolean => {
837
+ ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
804
838
  let typed_builder = builder
805
839
  .as_any_mut()
806
840
  .downcast_mut::<BooleanBuilder>()
@@ -819,7 +853,7 @@ fn fill_builder(
819
853
  }
820
854
  Ok(())
821
855
  }
822
- ParquetSchemaType::Date32 => {
856
+ ParquetSchemaType::Primitive(PrimitiveType::Date32) => {
823
857
  let typed_builder = builder
824
858
  .as_any_mut()
825
859
  .downcast_mut::<Date32Builder>()
@@ -838,7 +872,7 @@ fn fill_builder(
838
872
  }
839
873
  Ok(())
840
874
  }
841
- ParquetSchemaType::TimestampMillis => {
875
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => {
842
876
  let typed_builder = builder
843
877
  .as_any_mut()
844
878
  .downcast_mut::<TimestampMillisecondBuilder>()
@@ -857,7 +891,7 @@ fn fill_builder(
857
891
  }
858
892
  Ok(())
859
893
  }
860
- ParquetSchemaType::TimestampMicros => {
894
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
861
895
  let typed_builder = builder
862
896
  .as_any_mut()
863
897
  .downcast_mut::<TimestampMicrosecondBuilder>()
@@ -894,7 +928,7 @@ fn fill_builder(
894
928
  // ------------------
895
929
  // OTHER TYPES - keep as is for now
896
930
  // ------------------
897
- ParquetSchemaType::String => {
931
+ ParquetSchemaType::Primitive(PrimitiveType::String) => {
898
932
  let typed_builder = builder
899
933
  .as_any_mut()
900
934
  .downcast_mut::<StringBuilder>()
@@ -913,7 +947,7 @@ fn fill_builder(
913
947
  }
914
948
  Ok(())
915
949
  }
916
- ParquetSchemaType::Binary => {
950
+ ParquetSchemaType::Primitive(PrimitiveType::Binary) => {
917
951
  let typed_builder = builder
918
952
  .as_any_mut()
919
953
  .downcast_mut::<BinaryBuilder>()
@@ -1178,7 +1212,7 @@ fn fill_builder(
1178
1212
  }
1179
1213
  }
1180
1214
  ParquetValue::Null => match struct_field.fields[i].type_ {
1181
- ParquetSchemaType::Int8 => typed_builder
1215
+ ParquetSchemaType::Primitive(PrimitiveType::Int8) => typed_builder
1182
1216
  .field_builder::<Int8Builder>(i)
1183
1217
  .ok_or_else(|| {
1184
1218
  MagnusError::new(
@@ -1187,7 +1221,7 @@ fn fill_builder(
1187
1221
  )
1188
1222
  })?
1189
1223
  .append_null(),
1190
- ParquetSchemaType::Int16 => typed_builder
1224
+ ParquetSchemaType::Primitive(PrimitiveType::Int16) => typed_builder
1191
1225
  .field_builder::<Int16Builder>(i)
1192
1226
  .ok_or_else(|| {
1193
1227
  MagnusError::new(
@@ -1196,7 +1230,7 @@ fn fill_builder(
1196
1230
  )
1197
1231
  })?
1198
1232
  .append_null(),
1199
- ParquetSchemaType::Int32 => typed_builder
1233
+ ParquetSchemaType::Primitive(PrimitiveType::Int32) => typed_builder
1200
1234
  .field_builder::<Int32Builder>(i)
1201
1235
  .ok_or_else(|| {
1202
1236
  MagnusError::new(
@@ -1205,7 +1239,7 @@ fn fill_builder(
1205
1239
  )
1206
1240
  })?
1207
1241
  .append_null(),
1208
- ParquetSchemaType::Int64 => typed_builder
1242
+ ParquetSchemaType::Primitive(PrimitiveType::Int64) => typed_builder
1209
1243
  .field_builder::<Int64Builder>(i)
1210
1244
  .ok_or_else(|| {
1211
1245
  MagnusError::new(
@@ -1214,7 +1248,7 @@ fn fill_builder(
1214
1248
  )
1215
1249
  })?
1216
1250
  .append_null(),
1217
- ParquetSchemaType::UInt8 => typed_builder
1251
+ ParquetSchemaType::Primitive(PrimitiveType::UInt8) => typed_builder
1218
1252
  .field_builder::<UInt8Builder>(i)
1219
1253
  .ok_or_else(|| {
1220
1254
  MagnusError::new(
@@ -1223,7 +1257,7 @@ fn fill_builder(
1223
1257
  )
1224
1258
  })?
1225
1259
  .append_null(),
1226
- ParquetSchemaType::UInt16 => typed_builder
1260
+ ParquetSchemaType::Primitive(PrimitiveType::UInt16) => typed_builder
1227
1261
  .field_builder::<UInt16Builder>(i)
1228
1262
  .ok_or_else(|| {
1229
1263
  MagnusError::new(
@@ -1232,7 +1266,7 @@ fn fill_builder(
1232
1266
  )
1233
1267
  })?
1234
1268
  .append_null(),
1235
- ParquetSchemaType::UInt32 => typed_builder
1269
+ ParquetSchemaType::Primitive(PrimitiveType::UInt32) => typed_builder
1236
1270
  .field_builder::<UInt32Builder>(i)
1237
1271
  .ok_or_else(|| {
1238
1272
  MagnusError::new(
@@ -1241,7 +1275,7 @@ fn fill_builder(
1241
1275
  )
1242
1276
  })?
1243
1277
  .append_null(),
1244
- ParquetSchemaType::UInt64 => typed_builder
1278
+ ParquetSchemaType::Primitive(PrimitiveType::UInt64) => typed_builder
1245
1279
  .field_builder::<UInt64Builder>(i)
1246
1280
  .ok_or_else(|| {
1247
1281
  MagnusError::new(
@@ -1250,7 +1284,7 @@ fn fill_builder(
1250
1284
  )
1251
1285
  })?
1252
1286
  .append_null(),
1253
- ParquetSchemaType::Float => typed_builder
1287
+ ParquetSchemaType::Primitive(PrimitiveType::Float32) => typed_builder
1254
1288
  .field_builder::<Float32Builder>(i)
1255
1289
  .ok_or_else(|| {
1256
1290
  MagnusError::new(
@@ -1259,7 +1293,7 @@ fn fill_builder(
1259
1293
  )
1260
1294
  })?
1261
1295
  .append_null(),
1262
- ParquetSchemaType::Double => typed_builder
1296
+ ParquetSchemaType::Primitive(PrimitiveType::Float64) => typed_builder
1263
1297
  .field_builder::<Float64Builder>(i)
1264
1298
  .ok_or_else(|| {
1265
1299
  MagnusError::new(
@@ -1268,7 +1302,7 @@ fn fill_builder(
1268
1302
  )
1269
1303
  })?
1270
1304
  .append_null(),
1271
- ParquetSchemaType::String => typed_builder
1305
+ ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
1272
1306
  .field_builder::<StringBuilder>(i)
1273
1307
  .ok_or_else(|| {
1274
1308
  MagnusError::new(
@@ -1277,7 +1311,7 @@ fn fill_builder(
1277
1311
  )
1278
1312
  })?
1279
1313
  .append_null(),
1280
- ParquetSchemaType::Binary => typed_builder
1314
+ ParquetSchemaType::Primitive(PrimitiveType::Binary) => typed_builder
1281
1315
  .field_builder::<BinaryBuilder>(i)
1282
1316
  .ok_or_else(|| {
1283
1317
  MagnusError::new(
@@ -1286,7 +1320,7 @@ fn fill_builder(
1286
1320
  )
1287
1321
  })?
1288
1322
  .append_null(),
1289
- ParquetSchemaType::Boolean => typed_builder
1323
+ ParquetSchemaType::Primitive(PrimitiveType::Boolean) => typed_builder
1290
1324
  .field_builder::<BooleanBuilder>(i)
1291
1325
  .ok_or_else(|| {
1292
1326
  MagnusError::new(
@@ -1295,7 +1329,7 @@ fn fill_builder(
1295
1329
  )
1296
1330
  })?
1297
1331
  .append_null(),
1298
- ParquetSchemaType::Date32 => typed_builder
1332
+ ParquetSchemaType::Primitive(PrimitiveType::Date32) => typed_builder
1299
1333
  .field_builder::<Date32Builder>(i)
1300
1334
  .ok_or_else(|| {
1301
1335
  MagnusError::new(
@@ -1304,7 +1338,7 @@ fn fill_builder(
1304
1338
  )
1305
1339
  })?
1306
1340
  .append_null(),
1307
- ParquetSchemaType::TimestampMillis => typed_builder
1341
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => typed_builder
1308
1342
  .field_builder::<TimestampMillisecondBuilder>(i)
1309
1343
  .ok_or_else(|| {
1310
1344
  MagnusError::new(
@@ -1313,7 +1347,7 @@ fn fill_builder(
1313
1347
  )
1314
1348
  })?
1315
1349
  .append_null(),
1316
- ParquetSchemaType::TimestampMicros => typed_builder
1350
+ ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => typed_builder
1317
1351
  .field_builder::<TimestampMicrosecondBuilder>(i)
1318
1352
  .ok_or_else(|| {
1319
1353
  MagnusError::new(
@@ -1389,7 +1423,7 @@ fn fill_builder(
1389
1423
  pub fn convert_parquet_values_to_arrow(
1390
1424
  values: Vec<ParquetValue>,
1391
1425
  type_: &ParquetSchemaType,
1392
- ) -> Result<Arc<dyn Array>, ReaderError> {
1426
+ ) -> Result<Arc<dyn Array>, ParquetGemError> {
1393
1427
  // Make sure we always have at least capacity 1 to avoid empty builders
1394
1428
  let capacity = if values.is_empty() { 1 } else { values.len() };
1395
1429
  let mut builder = create_arrow_builder_for_type(type_, Some(capacity))?;
@@ -1403,16 +1437,17 @@ pub fn convert_parquet_values_to_arrow(
1403
1437
  }
1404
1438
 
1405
1439
  pub fn convert_ruby_array_to_arrow(
1440
+ ruby: &Ruby,
1406
1441
  values: RArray,
1407
1442
  type_: &ParquetSchemaType,
1408
- ) -> Result<Arc<dyn Array>, ReaderError> {
1443
+ ) -> Result<Arc<dyn Array>, ParquetGemError> {
1409
1444
  let mut parquet_values = Vec::with_capacity(values.len());
1410
1445
  for value in values {
1411
1446
  if value.is_nil() {
1412
1447
  parquet_values.push(ParquetValue::Null);
1413
1448
  continue;
1414
1449
  }
1415
- let parquet_value = ParquetValue::from_value(value, type_, None)?;
1450
+ let parquet_value = ParquetValue::from_value(ruby, value, type_, None)?;
1416
1451
  parquet_values.push(parquet_value);
1417
1452
  }
1418
1453
  convert_parquet_values_to_arrow(parquet_values, type_)
@@ -1,10 +1,10 @@
1
- use super::core_types::SchemaNode;
1
+ use super::{core_types::SchemaNode, ParquetGemError, PrimitiveType};
2
2
  use crate::{
3
- reader::ReaderError,
4
3
  types::{ListField, MapField, ParquetSchemaType},
4
+ utils::parse_string_or_symbol,
5
5
  };
6
6
  use arrow_array::{Array, RecordBatch};
7
- use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
7
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, TryConvert, Value};
8
8
  use parquet::{arrow::ArrowWriter, errors::ParquetError};
9
9
  use std::{
10
10
  io::{self, Write},
@@ -100,34 +100,27 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
100
100
 
101
101
  // Handle primitive types
102
102
  match s {
103
- "int8" => Ok(ParquetSchemaType::Int8),
104
- "int16" => Ok(ParquetSchemaType::Int16),
105
- "int32" => Ok(ParquetSchemaType::Int32),
106
- "int64" => Ok(ParquetSchemaType::Int64),
107
- "uint8" => Ok(ParquetSchemaType::UInt8),
108
- "uint16" => Ok(ParquetSchemaType::UInt16),
109
- "uint32" => Ok(ParquetSchemaType::UInt32),
110
- "uint64" => Ok(ParquetSchemaType::UInt64),
111
- "float" | "float32" => Ok(ParquetSchemaType::Float),
112
- "double" | "float64" => Ok(ParquetSchemaType::Double),
113
- "string" | "utf8" => Ok(ParquetSchemaType::String),
114
- "binary" => Ok(ParquetSchemaType::Binary),
115
- "boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
116
- "date32" => Ok(ParquetSchemaType::Date32),
117
- "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
118
- "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
103
+ "int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
104
+ "int16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int16)),
105
+ "int32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int32)),
106
+ "int64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int64)),
107
+ "uint8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt8)),
108
+ "uint16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt16)),
109
+ "uint32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt32)),
110
+ "uint64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt64)),
111
+ "float" | "float32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float32)),
112
+ "double" | "float64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float64)),
113
+ "string" | "utf8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::String)),
114
+ "binary" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Binary)),
115
+ "boolean" | "bool" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Boolean)),
116
+ "date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
117
+ "timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
118
+ "timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
119
119
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
120
- item_type: ParquetSchemaType::String,
120
+ item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
121
121
  format: None,
122
122
  nullable: true,
123
123
  }))),
124
- "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
125
- key_type: ParquetSchemaType::String,
126
- value_type: ParquetSchemaType::String,
127
- key_format: None,
128
- value_format: None,
129
- value_nullable: true,
130
- }))),
131
124
  _ => Err(MagnusError::new(
132
125
  magnus::exception::runtime_error(),
133
126
  format!("Invalid schema type: {}", s),
@@ -153,31 +146,6 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
153
146
  // with simple primitive types and strings
154
147
  unsafe impl<'a> Send for ParquetSchemaType<'a> {}
155
148
 
156
- fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
157
- if value.is_nil() {
158
- Ok(None)
159
- } else if value.is_kind_of(ruby.class_string()) {
160
- RString::from_value(value)
161
- .ok_or_else(|| {
162
- MagnusError::new(magnus::exception::type_error(), "Invalid string value")
163
- })?
164
- .to_string()
165
- .map(|s| Some(s))
166
- } else if value.is_kind_of(ruby.class_symbol()) {
167
- Symbol::from_value(value)
168
- .ok_or_else(|| {
169
- MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
170
- })?
171
- .funcall("to_s", ())
172
- .map(|s| Some(s))
173
- } else {
174
- Err(MagnusError::new(
175
- magnus::exception::type_error(),
176
- "Value must be a String or Symbol",
177
- ))
178
- }
179
- }
180
-
181
149
  pub enum WriterOutput {
182
150
  File(ArrowWriter<Box<dyn SendableWrite>>),
183
151
  TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
@@ -205,6 +173,7 @@ impl WriterOutput {
205
173
  }
206
174
 
207
175
  pub struct ColumnCollector<'a> {
176
+ pub ruby: &'a Ruby,
208
177
  pub name: String,
209
178
  pub type_: ParquetSchemaType<'a>,
210
179
  pub format: Option<String>,
@@ -214,12 +183,14 @@ pub struct ColumnCollector<'a> {
214
183
 
215
184
  impl<'a> ColumnCollector<'a> {
216
185
  pub fn new(
186
+ ruby: &'a Ruby,
217
187
  name: String,
218
188
  type_: ParquetSchemaType<'a>,
219
189
  format: Option<String>,
220
190
  nullable: bool,
221
191
  ) -> Self {
222
192
  Self {
193
+ ruby,
223
194
  name,
224
195
  type_,
225
196
  format,
@@ -242,12 +213,13 @@ impl<'a> ColumnCollector<'a> {
242
213
  }
243
214
 
244
215
  // For all other types, proceed as normal
245
- let parquet_value = ParquetValue::from_value(value, &self.type_, self.format.as_deref())?;
216
+ let parquet_value =
217
+ ParquetValue::from_value(self.ruby, value, &self.type_, self.format.as_deref())?;
246
218
  self.values.push(parquet_value);
247
219
  Ok(())
248
220
  }
249
221
 
250
- pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ReaderError> {
222
+ pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ParquetGemError> {
251
223
  let values = std::mem::take(&mut self.values);
252
224
  crate::convert_parquet_values_to_arrow(values, &self.type_)
253
225
  }