parquet 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/parquet/src/header_cache.rs +4 -9
- data/ext/parquet/src/logger.rs +2 -2
- data/ext/parquet/src/reader/common.rs +12 -15
- data/ext/parquet/src/reader/mod.rs +0 -56
- data/ext/parquet/src/reader/parquet_column_reader.rs +20 -16
- data/ext/parquet/src/reader/parquet_row_reader.rs +21 -14
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +2 -17
- data/ext/parquet/src/types/mod.rs +56 -0
- data/ext/parquet/src/types/parquet_value.rs +101 -95
- data/ext/parquet/src/types/record_types.rs +12 -14
- data/ext/parquet/src/types/schema_converter.rs +4 -109
- data/ext/parquet/src/types/timestamp.rs +3 -5
- data/ext/parquet/src/types/type_conversion.rs +116 -81
- data/ext/parquet/src/types/writer_types.rs +26 -54
- data/ext/parquet/src/writer/mod.rs +176 -839
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/version.rb +1 -1
- metadata +3 -1
@@ -1,8 +1,6 @@
|
|
1
1
|
use std::str::FromStr;
|
2
2
|
use std::sync::Arc;
|
3
3
|
|
4
|
-
use crate::reader::ReaderError;
|
5
|
-
|
6
4
|
use super::*;
|
7
5
|
use arrow_array::builder::MapFieldNames;
|
8
6
|
use arrow_array::builder::*;
|
@@ -19,8 +17,7 @@ where
|
|
19
17
|
T: TryConvert + FromStr,
|
20
18
|
<T as FromStr>::Err: std::fmt::Display,
|
21
19
|
{
|
22
|
-
pub fn convert_with_string_fallback(value: Value) -> Result<T, MagnusError> {
|
23
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
20
|
+
pub fn convert_with_string_fallback(ruby: &Ruby, value: Value) -> Result<T, MagnusError> {
|
24
21
|
if value.is_kind_of(ruby.class_string()) {
|
25
22
|
let s = String::try_convert(value)?;
|
26
23
|
s.trim().parse::<T>().map_err(|e| {
|
@@ -35,8 +32,11 @@ where
|
|
35
32
|
}
|
36
33
|
}
|
37
34
|
|
38
|
-
pub fn convert_to_date32(
|
39
|
-
|
35
|
+
pub fn convert_to_date32(
|
36
|
+
ruby: &Ruby,
|
37
|
+
value: Value,
|
38
|
+
format: Option<&str>,
|
39
|
+
) -> Result<i32, MagnusError> {
|
40
40
|
if value.is_kind_of(ruby.class_string()) {
|
41
41
|
let s = String::try_convert(value)?;
|
42
42
|
// Parse string into Date using jiff
|
@@ -91,8 +91,11 @@ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, Magn
|
|
91
91
|
}
|
92
92
|
}
|
93
93
|
|
94
|
-
pub fn convert_to_timestamp_millis(
|
95
|
-
|
94
|
+
pub fn convert_to_timestamp_millis(
|
95
|
+
ruby: &Ruby,
|
96
|
+
value: Value,
|
97
|
+
format: Option<&str>,
|
98
|
+
) -> Result<i64, MagnusError> {
|
96
99
|
if value.is_kind_of(ruby.class_string()) {
|
97
100
|
let s = String::try_convert(value)?;
|
98
101
|
// Parse string into Timestamp using jiff
|
@@ -138,8 +141,11 @@ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result
|
|
138
141
|
}
|
139
142
|
}
|
140
143
|
|
141
|
-
pub fn convert_to_timestamp_micros(
|
142
|
-
|
144
|
+
pub fn convert_to_timestamp_micros(
|
145
|
+
ruby: &Ruby,
|
146
|
+
value: Value,
|
147
|
+
format: Option<&str>,
|
148
|
+
) -> Result<i64, MagnusError> {
|
143
149
|
if value.is_kind_of(ruby.class_string()) {
|
144
150
|
let s = String::try_convert(value)?;
|
145
151
|
// Parse string into Timestamp using jiff
|
@@ -189,8 +195,7 @@ pub fn convert_to_binary(value: Value) -> Result<Vec<u8>, MagnusError> {
|
|
189
195
|
Ok(unsafe { value.to_r_string()?.as_slice() }.to_vec())
|
190
196
|
}
|
191
197
|
|
192
|
-
pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
|
193
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
198
|
+
pub fn convert_to_boolean(ruby: &Ruby, value: Value) -> Result<bool, MagnusError> {
|
194
199
|
if value.is_kind_of(ruby.class_string()) {
|
195
200
|
let s = String::try_convert(value)?;
|
196
201
|
s.trim().parse::<bool>().map_err(|e| {
|
@@ -226,23 +231,24 @@ pub fn parquet_schema_type_to_arrow_data_type(
|
|
226
231
|
schema_type: &ParquetSchemaType,
|
227
232
|
) -> Result<DataType, MagnusError> {
|
228
233
|
Ok(match schema_type {
|
229
|
-
ParquetSchemaType::
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
234
|
+
ParquetSchemaType::Primitive(primative) => match primative {
|
235
|
+
PrimitiveType::Int8 => DataType::Int8,
|
236
|
+
PrimitiveType::Int16 => DataType::Int16,
|
237
|
+
PrimitiveType::Int32 => DataType::Int32,
|
238
|
+
PrimitiveType::Int64 => DataType::Int64,
|
239
|
+
PrimitiveType::UInt8 => DataType::UInt8,
|
240
|
+
PrimitiveType::UInt16 => DataType::UInt16,
|
241
|
+
PrimitiveType::UInt32 => DataType::UInt32,
|
242
|
+
PrimitiveType::UInt64 => DataType::UInt64,
|
243
|
+
PrimitiveType::Float32 => DataType::Float32,
|
244
|
+
PrimitiveType::Float64 => DataType::Float64,
|
245
|
+
PrimitiveType::String => DataType::Utf8,
|
246
|
+
PrimitiveType::Binary => DataType::Binary,
|
247
|
+
PrimitiveType::Boolean => DataType::Boolean,
|
248
|
+
PrimitiveType::Date32 => DataType::Date32,
|
249
|
+
PrimitiveType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
|
250
|
+
PrimitiveType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
|
251
|
+
},
|
246
252
|
// For a List<T>, create a standard List in Arrow with nullable items
|
247
253
|
ParquetSchemaType::List(list_field) => {
|
248
254
|
let child_type = parquet_schema_type_to_arrow_data_type(&list_field.item_type)?;
|
@@ -325,27 +331,55 @@ macro_rules! impl_timestamp_array_conversion {
|
|
325
331
|
fn create_arrow_builder_for_type(
|
326
332
|
type_: &ParquetSchemaType,
|
327
333
|
capacity: Option<usize>,
|
328
|
-
) -> Result<Box<dyn ArrayBuilder>,
|
334
|
+
) -> Result<Box<dyn ArrayBuilder>, ParquetGemError> {
|
329
335
|
let cap = capacity.unwrap_or(1); // Default to at least capacity 1 to avoid empty builders
|
330
336
|
match type_ {
|
331
|
-
ParquetSchemaType::Int8 =>
|
332
|
-
|
333
|
-
|
334
|
-
ParquetSchemaType::
|
335
|
-
|
336
|
-
|
337
|
-
ParquetSchemaType::
|
338
|
-
|
339
|
-
|
340
|
-
ParquetSchemaType::
|
341
|
-
|
342
|
-
|
343
|
-
ParquetSchemaType::
|
344
|
-
|
345
|
-
|
337
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int8) => {
|
338
|
+
Ok(Box::new(Int8Builder::with_capacity(cap)))
|
339
|
+
}
|
340
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int16) => {
|
341
|
+
Ok(Box::new(Int16Builder::with_capacity(cap)))
|
342
|
+
}
|
343
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int32) => {
|
344
|
+
Ok(Box::new(Int32Builder::with_capacity(cap)))
|
345
|
+
}
|
346
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int64) => {
|
347
|
+
Ok(Box::new(Int64Builder::with_capacity(cap)))
|
348
|
+
}
|
349
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt8) => {
|
350
|
+
Ok(Box::new(UInt8Builder::with_capacity(cap)))
|
351
|
+
}
|
352
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt16) => {
|
353
|
+
Ok(Box::new(UInt16Builder::with_capacity(cap)))
|
354
|
+
}
|
355
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt32) => {
|
356
|
+
Ok(Box::new(UInt32Builder::with_capacity(cap)))
|
357
|
+
}
|
358
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt64) => {
|
359
|
+
Ok(Box::new(UInt64Builder::with_capacity(cap)))
|
360
|
+
}
|
361
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float32) => {
|
362
|
+
Ok(Box::new(Float32Builder::with_capacity(cap)))
|
363
|
+
}
|
364
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
|
365
|
+
Ok(Box::new(Float64Builder::with_capacity(cap)))
|
366
|
+
}
|
367
|
+
ParquetSchemaType::Primitive(PrimitiveType::String) => {
|
368
|
+
Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
|
369
|
+
}
|
370
|
+
ParquetSchemaType::Primitive(PrimitiveType::Binary) => {
|
371
|
+
Ok(Box::new(BinaryBuilder::with_capacity(cap, cap * 32)))
|
372
|
+
}
|
373
|
+
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
|
374
|
+
Ok(Box::new(BooleanBuilder::with_capacity(cap)))
|
375
|
+
}
|
376
|
+
ParquetSchemaType::Primitive(PrimitiveType::Date32) => {
|
377
|
+
Ok(Box::new(Date32Builder::with_capacity(cap)))
|
378
|
+
}
|
379
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => {
|
346
380
|
Ok(Box::new(TimestampMillisecondBuilder::with_capacity(cap)))
|
347
381
|
}
|
348
|
-
ParquetSchemaType::TimestampMicros => {
|
382
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
|
349
383
|
Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
|
350
384
|
}
|
351
385
|
ParquetSchemaType::List(list_field) => {
|
@@ -592,9 +626,9 @@ fn fill_builder(
|
|
592
626
|
// ------------------
|
593
627
|
// PRIMITIVE SCALARS - delegated to specialized helpers
|
594
628
|
// ------------------
|
595
|
-
ParquetSchemaType::Int8 => fill_int8_builder(builder, values),
|
596
|
-
ParquetSchemaType::Int16 => fill_int16_builder(builder, values),
|
597
|
-
ParquetSchemaType::Int32 => {
|
629
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int8) => fill_int8_builder(builder, values),
|
630
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int16) => fill_int16_builder(builder, values),
|
631
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int32) => {
|
598
632
|
let typed_builder = builder
|
599
633
|
.as_any_mut()
|
600
634
|
.downcast_mut::<Int32Builder>()
|
@@ -624,7 +658,7 @@ fn fill_builder(
|
|
624
658
|
}
|
625
659
|
Ok(())
|
626
660
|
}
|
627
|
-
ParquetSchemaType::Int64 => {
|
661
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int64) => {
|
628
662
|
let typed_builder = builder
|
629
663
|
.as_any_mut()
|
630
664
|
.downcast_mut::<Int64Builder>()
|
@@ -643,7 +677,7 @@ fn fill_builder(
|
|
643
677
|
}
|
644
678
|
Ok(())
|
645
679
|
}
|
646
|
-
ParquetSchemaType::UInt8 => {
|
680
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt8) => {
|
647
681
|
let typed_builder = builder
|
648
682
|
.as_any_mut()
|
649
683
|
.downcast_mut::<UInt8Builder>()
|
@@ -672,7 +706,7 @@ fn fill_builder(
|
|
672
706
|
}
|
673
707
|
Ok(())
|
674
708
|
}
|
675
|
-
ParquetSchemaType::UInt16 => {
|
709
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt16) => {
|
676
710
|
let typed_builder = builder
|
677
711
|
.as_any_mut()
|
678
712
|
.downcast_mut::<UInt16Builder>()
|
@@ -701,7 +735,7 @@ fn fill_builder(
|
|
701
735
|
}
|
702
736
|
Ok(())
|
703
737
|
}
|
704
|
-
ParquetSchemaType::UInt32 => {
|
738
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt32) => {
|
705
739
|
let typed_builder = builder
|
706
740
|
.as_any_mut()
|
707
741
|
.downcast_mut::<UInt32Builder>()
|
@@ -730,7 +764,7 @@ fn fill_builder(
|
|
730
764
|
}
|
731
765
|
Ok(())
|
732
766
|
}
|
733
|
-
ParquetSchemaType::UInt64 => {
|
767
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt64) => {
|
734
768
|
let typed_builder = builder
|
735
769
|
.as_any_mut()
|
736
770
|
.downcast_mut::<UInt64Builder>()
|
@@ -759,7 +793,7 @@ fn fill_builder(
|
|
759
793
|
}
|
760
794
|
Ok(())
|
761
795
|
}
|
762
|
-
ParquetSchemaType::
|
796
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float32) => {
|
763
797
|
let typed_builder = builder
|
764
798
|
.as_any_mut()
|
765
799
|
.downcast_mut::<Float32Builder>()
|
@@ -779,7 +813,7 @@ fn fill_builder(
|
|
779
813
|
}
|
780
814
|
Ok(())
|
781
815
|
}
|
782
|
-
ParquetSchemaType::
|
816
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
|
783
817
|
let typed_builder = builder
|
784
818
|
.as_any_mut()
|
785
819
|
.downcast_mut::<Float64Builder>()
|
@@ -800,7 +834,7 @@ fn fill_builder(
|
|
800
834
|
}
|
801
835
|
Ok(())
|
802
836
|
}
|
803
|
-
ParquetSchemaType::Boolean => {
|
837
|
+
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
|
804
838
|
let typed_builder = builder
|
805
839
|
.as_any_mut()
|
806
840
|
.downcast_mut::<BooleanBuilder>()
|
@@ -819,7 +853,7 @@ fn fill_builder(
|
|
819
853
|
}
|
820
854
|
Ok(())
|
821
855
|
}
|
822
|
-
ParquetSchemaType::Date32 => {
|
856
|
+
ParquetSchemaType::Primitive(PrimitiveType::Date32) => {
|
823
857
|
let typed_builder = builder
|
824
858
|
.as_any_mut()
|
825
859
|
.downcast_mut::<Date32Builder>()
|
@@ -838,7 +872,7 @@ fn fill_builder(
|
|
838
872
|
}
|
839
873
|
Ok(())
|
840
874
|
}
|
841
|
-
ParquetSchemaType::TimestampMillis => {
|
875
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => {
|
842
876
|
let typed_builder = builder
|
843
877
|
.as_any_mut()
|
844
878
|
.downcast_mut::<TimestampMillisecondBuilder>()
|
@@ -857,7 +891,7 @@ fn fill_builder(
|
|
857
891
|
}
|
858
892
|
Ok(())
|
859
893
|
}
|
860
|
-
ParquetSchemaType::TimestampMicros => {
|
894
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
|
861
895
|
let typed_builder = builder
|
862
896
|
.as_any_mut()
|
863
897
|
.downcast_mut::<TimestampMicrosecondBuilder>()
|
@@ -894,7 +928,7 @@ fn fill_builder(
|
|
894
928
|
// ------------------
|
895
929
|
// OTHER TYPES - keep as is for now
|
896
930
|
// ------------------
|
897
|
-
ParquetSchemaType::String => {
|
931
|
+
ParquetSchemaType::Primitive(PrimitiveType::String) => {
|
898
932
|
let typed_builder = builder
|
899
933
|
.as_any_mut()
|
900
934
|
.downcast_mut::<StringBuilder>()
|
@@ -913,7 +947,7 @@ fn fill_builder(
|
|
913
947
|
}
|
914
948
|
Ok(())
|
915
949
|
}
|
916
|
-
ParquetSchemaType::Binary => {
|
950
|
+
ParquetSchemaType::Primitive(PrimitiveType::Binary) => {
|
917
951
|
let typed_builder = builder
|
918
952
|
.as_any_mut()
|
919
953
|
.downcast_mut::<BinaryBuilder>()
|
@@ -1178,7 +1212,7 @@ fn fill_builder(
|
|
1178
1212
|
}
|
1179
1213
|
}
|
1180
1214
|
ParquetValue::Null => match struct_field.fields[i].type_ {
|
1181
|
-
ParquetSchemaType::Int8 => typed_builder
|
1215
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int8) => typed_builder
|
1182
1216
|
.field_builder::<Int8Builder>(i)
|
1183
1217
|
.ok_or_else(|| {
|
1184
1218
|
MagnusError::new(
|
@@ -1187,7 +1221,7 @@ fn fill_builder(
|
|
1187
1221
|
)
|
1188
1222
|
})?
|
1189
1223
|
.append_null(),
|
1190
|
-
ParquetSchemaType::Int16 => typed_builder
|
1224
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int16) => typed_builder
|
1191
1225
|
.field_builder::<Int16Builder>(i)
|
1192
1226
|
.ok_or_else(|| {
|
1193
1227
|
MagnusError::new(
|
@@ -1196,7 +1230,7 @@ fn fill_builder(
|
|
1196
1230
|
)
|
1197
1231
|
})?
|
1198
1232
|
.append_null(),
|
1199
|
-
ParquetSchemaType::Int32 => typed_builder
|
1233
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int32) => typed_builder
|
1200
1234
|
.field_builder::<Int32Builder>(i)
|
1201
1235
|
.ok_or_else(|| {
|
1202
1236
|
MagnusError::new(
|
@@ -1205,7 +1239,7 @@ fn fill_builder(
|
|
1205
1239
|
)
|
1206
1240
|
})?
|
1207
1241
|
.append_null(),
|
1208
|
-
ParquetSchemaType::Int64 => typed_builder
|
1242
|
+
ParquetSchemaType::Primitive(PrimitiveType::Int64) => typed_builder
|
1209
1243
|
.field_builder::<Int64Builder>(i)
|
1210
1244
|
.ok_or_else(|| {
|
1211
1245
|
MagnusError::new(
|
@@ -1214,7 +1248,7 @@ fn fill_builder(
|
|
1214
1248
|
)
|
1215
1249
|
})?
|
1216
1250
|
.append_null(),
|
1217
|
-
ParquetSchemaType::UInt8 => typed_builder
|
1251
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt8) => typed_builder
|
1218
1252
|
.field_builder::<UInt8Builder>(i)
|
1219
1253
|
.ok_or_else(|| {
|
1220
1254
|
MagnusError::new(
|
@@ -1223,7 +1257,7 @@ fn fill_builder(
|
|
1223
1257
|
)
|
1224
1258
|
})?
|
1225
1259
|
.append_null(),
|
1226
|
-
ParquetSchemaType::UInt16 => typed_builder
|
1260
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt16) => typed_builder
|
1227
1261
|
.field_builder::<UInt16Builder>(i)
|
1228
1262
|
.ok_or_else(|| {
|
1229
1263
|
MagnusError::new(
|
@@ -1232,7 +1266,7 @@ fn fill_builder(
|
|
1232
1266
|
)
|
1233
1267
|
})?
|
1234
1268
|
.append_null(),
|
1235
|
-
ParquetSchemaType::UInt32 => typed_builder
|
1269
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt32) => typed_builder
|
1236
1270
|
.field_builder::<UInt32Builder>(i)
|
1237
1271
|
.ok_or_else(|| {
|
1238
1272
|
MagnusError::new(
|
@@ -1241,7 +1275,7 @@ fn fill_builder(
|
|
1241
1275
|
)
|
1242
1276
|
})?
|
1243
1277
|
.append_null(),
|
1244
|
-
ParquetSchemaType::UInt64 => typed_builder
|
1278
|
+
ParquetSchemaType::Primitive(PrimitiveType::UInt64) => typed_builder
|
1245
1279
|
.field_builder::<UInt64Builder>(i)
|
1246
1280
|
.ok_or_else(|| {
|
1247
1281
|
MagnusError::new(
|
@@ -1250,7 +1284,7 @@ fn fill_builder(
|
|
1250
1284
|
)
|
1251
1285
|
})?
|
1252
1286
|
.append_null(),
|
1253
|
-
ParquetSchemaType::
|
1287
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float32) => typed_builder
|
1254
1288
|
.field_builder::<Float32Builder>(i)
|
1255
1289
|
.ok_or_else(|| {
|
1256
1290
|
MagnusError::new(
|
@@ -1259,7 +1293,7 @@ fn fill_builder(
|
|
1259
1293
|
)
|
1260
1294
|
})?
|
1261
1295
|
.append_null(),
|
1262
|
-
ParquetSchemaType::
|
1296
|
+
ParquetSchemaType::Primitive(PrimitiveType::Float64) => typed_builder
|
1263
1297
|
.field_builder::<Float64Builder>(i)
|
1264
1298
|
.ok_or_else(|| {
|
1265
1299
|
MagnusError::new(
|
@@ -1268,7 +1302,7 @@ fn fill_builder(
|
|
1268
1302
|
)
|
1269
1303
|
})?
|
1270
1304
|
.append_null(),
|
1271
|
-
ParquetSchemaType::String => typed_builder
|
1305
|
+
ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
|
1272
1306
|
.field_builder::<StringBuilder>(i)
|
1273
1307
|
.ok_or_else(|| {
|
1274
1308
|
MagnusError::new(
|
@@ -1277,7 +1311,7 @@ fn fill_builder(
|
|
1277
1311
|
)
|
1278
1312
|
})?
|
1279
1313
|
.append_null(),
|
1280
|
-
ParquetSchemaType::Binary => typed_builder
|
1314
|
+
ParquetSchemaType::Primitive(PrimitiveType::Binary) => typed_builder
|
1281
1315
|
.field_builder::<BinaryBuilder>(i)
|
1282
1316
|
.ok_or_else(|| {
|
1283
1317
|
MagnusError::new(
|
@@ -1286,7 +1320,7 @@ fn fill_builder(
|
|
1286
1320
|
)
|
1287
1321
|
})?
|
1288
1322
|
.append_null(),
|
1289
|
-
ParquetSchemaType::Boolean => typed_builder
|
1323
|
+
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => typed_builder
|
1290
1324
|
.field_builder::<BooleanBuilder>(i)
|
1291
1325
|
.ok_or_else(|| {
|
1292
1326
|
MagnusError::new(
|
@@ -1295,7 +1329,7 @@ fn fill_builder(
|
|
1295
1329
|
)
|
1296
1330
|
})?
|
1297
1331
|
.append_null(),
|
1298
|
-
ParquetSchemaType::Date32 => typed_builder
|
1332
|
+
ParquetSchemaType::Primitive(PrimitiveType::Date32) => typed_builder
|
1299
1333
|
.field_builder::<Date32Builder>(i)
|
1300
1334
|
.ok_or_else(|| {
|
1301
1335
|
MagnusError::new(
|
@@ -1304,7 +1338,7 @@ fn fill_builder(
|
|
1304
1338
|
)
|
1305
1339
|
})?
|
1306
1340
|
.append_null(),
|
1307
|
-
ParquetSchemaType::TimestampMillis => typed_builder
|
1341
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis) => typed_builder
|
1308
1342
|
.field_builder::<TimestampMillisecondBuilder>(i)
|
1309
1343
|
.ok_or_else(|| {
|
1310
1344
|
MagnusError::new(
|
@@ -1313,7 +1347,7 @@ fn fill_builder(
|
|
1313
1347
|
)
|
1314
1348
|
})?
|
1315
1349
|
.append_null(),
|
1316
|
-
ParquetSchemaType::TimestampMicros => typed_builder
|
1350
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => typed_builder
|
1317
1351
|
.field_builder::<TimestampMicrosecondBuilder>(i)
|
1318
1352
|
.ok_or_else(|| {
|
1319
1353
|
MagnusError::new(
|
@@ -1389,7 +1423,7 @@ fn fill_builder(
|
|
1389
1423
|
pub fn convert_parquet_values_to_arrow(
|
1390
1424
|
values: Vec<ParquetValue>,
|
1391
1425
|
type_: &ParquetSchemaType,
|
1392
|
-
) -> Result<Arc<dyn Array>,
|
1426
|
+
) -> Result<Arc<dyn Array>, ParquetGemError> {
|
1393
1427
|
// Make sure we always have at least capacity 1 to avoid empty builders
|
1394
1428
|
let capacity = if values.is_empty() { 1 } else { values.len() };
|
1395
1429
|
let mut builder = create_arrow_builder_for_type(type_, Some(capacity))?;
|
@@ -1403,16 +1437,17 @@ pub fn convert_parquet_values_to_arrow(
|
|
1403
1437
|
}
|
1404
1438
|
|
1405
1439
|
pub fn convert_ruby_array_to_arrow(
|
1440
|
+
ruby: &Ruby,
|
1406
1441
|
values: RArray,
|
1407
1442
|
type_: &ParquetSchemaType,
|
1408
|
-
) -> Result<Arc<dyn Array>,
|
1443
|
+
) -> Result<Arc<dyn Array>, ParquetGemError> {
|
1409
1444
|
let mut parquet_values = Vec::with_capacity(values.len());
|
1410
1445
|
for value in values {
|
1411
1446
|
if value.is_nil() {
|
1412
1447
|
parquet_values.push(ParquetValue::Null);
|
1413
1448
|
continue;
|
1414
1449
|
}
|
1415
|
-
let parquet_value = ParquetValue::from_value(value, type_, None)?;
|
1450
|
+
let parquet_value = ParquetValue::from_value(ruby, value, type_, None)?;
|
1416
1451
|
parquet_values.push(parquet_value);
|
1417
1452
|
}
|
1418
1453
|
convert_parquet_values_to_arrow(parquet_values, type_)
|
@@ -1,10 +1,10 @@
|
|
1
|
-
use super::core_types::SchemaNode;
|
1
|
+
use super::{core_types::SchemaNode, ParquetGemError, PrimitiveType};
|
2
2
|
use crate::{
|
3
|
-
reader::ReaderError,
|
4
3
|
types::{ListField, MapField, ParquetSchemaType},
|
4
|
+
utils::parse_string_or_symbol,
|
5
5
|
};
|
6
6
|
use arrow_array::{Array, RecordBatch};
|
7
|
-
use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby,
|
7
|
+
use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, TryConvert, Value};
|
8
8
|
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
9
9
|
use std::{
|
10
10
|
io::{self, Write},
|
@@ -100,34 +100,27 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
100
100
|
|
101
101
|
// Handle primitive types
|
102
102
|
match s {
|
103
|
-
"int8" => Ok(ParquetSchemaType::Int8),
|
104
|
-
"int16" => Ok(ParquetSchemaType::Int16),
|
105
|
-
"int32" => Ok(ParquetSchemaType::Int32),
|
106
|
-
"int64" => Ok(ParquetSchemaType::Int64),
|
107
|
-
"uint8" => Ok(ParquetSchemaType::UInt8),
|
108
|
-
"uint16" => Ok(ParquetSchemaType::UInt16),
|
109
|
-
"uint32" => Ok(ParquetSchemaType::UInt32),
|
110
|
-
"uint64" => Ok(ParquetSchemaType::UInt64),
|
111
|
-
"float" | "float32" => Ok(ParquetSchemaType::
|
112
|
-
"double" | "float64" => Ok(ParquetSchemaType::
|
113
|
-
"string" | "utf8" => Ok(ParquetSchemaType::String),
|
114
|
-
"binary" => Ok(ParquetSchemaType::Binary),
|
115
|
-
"boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
|
116
|
-
"date32" => Ok(ParquetSchemaType::Date32),
|
117
|
-
"timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
|
118
|
-
"timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
|
103
|
+
"int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
|
104
|
+
"int16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int16)),
|
105
|
+
"int32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int32)),
|
106
|
+
"int64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int64)),
|
107
|
+
"uint8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt8)),
|
108
|
+
"uint16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt16)),
|
109
|
+
"uint32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt32)),
|
110
|
+
"uint64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt64)),
|
111
|
+
"float" | "float32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float32)),
|
112
|
+
"double" | "float64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float64)),
|
113
|
+
"string" | "utf8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::String)),
|
114
|
+
"binary" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Binary)),
|
115
|
+
"boolean" | "bool" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Boolean)),
|
116
|
+
"date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
|
117
|
+
"timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
|
118
|
+
"timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
|
119
119
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
120
|
-
item_type: ParquetSchemaType::String,
|
120
|
+
item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
|
121
121
|
format: None,
|
122
122
|
nullable: true,
|
123
123
|
}))),
|
124
|
-
"map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
|
125
|
-
key_type: ParquetSchemaType::String,
|
126
|
-
value_type: ParquetSchemaType::String,
|
127
|
-
key_format: None,
|
128
|
-
value_format: None,
|
129
|
-
value_nullable: true,
|
130
|
-
}))),
|
131
124
|
_ => Err(MagnusError::new(
|
132
125
|
magnus::exception::runtime_error(),
|
133
126
|
format!("Invalid schema type: {}", s),
|
@@ -153,31 +146,6 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
|
|
153
146
|
// with simple primitive types and strings
|
154
147
|
unsafe impl<'a> Send for ParquetSchemaType<'a> {}
|
155
148
|
|
156
|
-
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
|
157
|
-
if value.is_nil() {
|
158
|
-
Ok(None)
|
159
|
-
} else if value.is_kind_of(ruby.class_string()) {
|
160
|
-
RString::from_value(value)
|
161
|
-
.ok_or_else(|| {
|
162
|
-
MagnusError::new(magnus::exception::type_error(), "Invalid string value")
|
163
|
-
})?
|
164
|
-
.to_string()
|
165
|
-
.map(|s| Some(s))
|
166
|
-
} else if value.is_kind_of(ruby.class_symbol()) {
|
167
|
-
Symbol::from_value(value)
|
168
|
-
.ok_or_else(|| {
|
169
|
-
MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
|
170
|
-
})?
|
171
|
-
.funcall("to_s", ())
|
172
|
-
.map(|s| Some(s))
|
173
|
-
} else {
|
174
|
-
Err(MagnusError::new(
|
175
|
-
magnus::exception::type_error(),
|
176
|
-
"Value must be a String or Symbol",
|
177
|
-
))
|
178
|
-
}
|
179
|
-
}
|
180
|
-
|
181
149
|
pub enum WriterOutput {
|
182
150
|
File(ArrowWriter<Box<dyn SendableWrite>>),
|
183
151
|
TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
|
@@ -205,6 +173,7 @@ impl WriterOutput {
|
|
205
173
|
}
|
206
174
|
|
207
175
|
pub struct ColumnCollector<'a> {
|
176
|
+
pub ruby: &'a Ruby,
|
208
177
|
pub name: String,
|
209
178
|
pub type_: ParquetSchemaType<'a>,
|
210
179
|
pub format: Option<String>,
|
@@ -214,12 +183,14 @@ pub struct ColumnCollector<'a> {
|
|
214
183
|
|
215
184
|
impl<'a> ColumnCollector<'a> {
|
216
185
|
pub fn new(
|
186
|
+
ruby: &'a Ruby,
|
217
187
|
name: String,
|
218
188
|
type_: ParquetSchemaType<'a>,
|
219
189
|
format: Option<String>,
|
220
190
|
nullable: bool,
|
221
191
|
) -> Self {
|
222
192
|
Self {
|
193
|
+
ruby,
|
223
194
|
name,
|
224
195
|
type_,
|
225
196
|
format,
|
@@ -242,12 +213,13 @@ impl<'a> ColumnCollector<'a> {
|
|
242
213
|
}
|
243
214
|
|
244
215
|
// For all other types, proceed as normal
|
245
|
-
let parquet_value =
|
216
|
+
let parquet_value =
|
217
|
+
ParquetValue::from_value(self.ruby, value, &self.type_, self.format.as_deref())?;
|
246
218
|
self.values.push(parquet_value);
|
247
219
|
Ok(())
|
248
220
|
}
|
249
221
|
|
250
|
-
pub fn take_array(&mut self) -> Result<Arc<dyn Array>,
|
222
|
+
pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ParquetGemError> {
|
251
223
|
let values = std::mem::take(&mut self.values);
|
252
224
|
crate::convert_parquet_values_to_arrow(values, &self.type_)
|
253
225
|
}
|