parquet 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,27 +1,116 @@
1
- use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
1
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
2
2
 
3
+ use arrow_array::cast::downcast_array;
4
+ use arrow_array::{
5
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
6
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
7
+ StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
8
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
9
+ };
10
+ use arrow_schema::{DataType, TimeUnit};
11
+ use itertools::Itertools;
3
12
  use magnus::{value::ReprValue, IntoValue, Ruby, Value};
13
+ use parquet::data_type::Decimal;
4
14
  use parquet::record::Field;
5
15
 
16
+ use crate::header_cache::StringCacheKey;
17
+
6
18
  #[derive(Debug)]
7
- pub enum Record<S: BuildHasher + Default> {
19
+ pub enum RowRecord<S: BuildHasher + Default> {
8
20
  Vec(Vec<ParquetField>),
9
- Map(HashMap<&'static str, ParquetField, S>),
21
+ Map(HashMap<StringCacheKey, ParquetField, S>),
10
22
  }
11
23
 
12
- impl<S: BuildHasher + Default> IntoValue for Record<S> {
24
+ #[derive(Debug)]
25
+ pub enum ColumnRecord<S: BuildHasher + Default> {
26
+ Vec(Vec<Vec<ParquetValue>>),
27
+ Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
28
+ }
29
+
30
+ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
13
31
  fn into_value_with(self, handle: &Ruby) -> Value {
14
32
  match self {
15
- Record::Vec(vec) => {
33
+ RowRecord::Vec(vec) => {
16
34
  let ary = handle.ary_new_capa(vec.len());
17
35
  vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
18
- ary.into_value_with(handle)
36
+ handle.into_value(ary)
19
37
  }
20
- Record::Map(map) => {
38
+ RowRecord::Map(map) => {
21
39
  let hash = handle.hash_new_capa(map.len());
22
- map.into_iter()
23
- .try_for_each(|(k, v)| hash.aset(k, v))
40
+
41
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
42
+ let mut i = 0;
43
+
44
+ for chunk in &map.into_iter().chunks(64) {
45
+ // Reduced to 64 to ensure space for pairs
46
+ for (k, v) in chunk {
47
+ if i + 1 >= values.len() {
48
+ // Bulk insert current batch if array is full
49
+ hash.bulk_insert(&values[..i]).unwrap();
50
+ values[..i].fill(handle.qnil().as_value());
51
+ i = 0;
52
+ }
53
+ values[i] = handle.into_value(k);
54
+ values[i + 1] = handle.into_value(v);
55
+ i += 2;
56
+ }
57
+ // Insert any remaining pairs
58
+ if i > 0 {
59
+ hash.bulk_insert(&values[..i]).unwrap();
60
+ values[..i].fill(handle.qnil().as_value());
61
+ i = 0;
62
+ }
63
+ }
64
+
65
+ hash.into_value_with(handle)
66
+ }
67
+ }
68
+ }
69
+ }
70
+
71
+ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
72
+ fn into_value_with(self, handle: &Ruby) -> Value {
73
+ match self {
74
+ ColumnRecord::Vec(vec) => {
75
+ let ary = handle.ary_new_capa(vec.len());
76
+ vec.into_iter()
77
+ .try_for_each(|v| {
78
+ let nested_ary = handle.ary_new_capa(v.len());
79
+ v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
80
+ ary.push(nested_ary.into_value_with(handle))
81
+ })
24
82
  .unwrap();
83
+ ary.into_value_with(handle)
84
+ }
85
+ ColumnRecord::Map(map) => {
86
+ let hash = handle.hash_new_capa(map.len());
87
+
88
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
89
+ let mut i = 0;
90
+
91
+ for chunk in &map.into_iter().chunks(64) {
92
+ // Reduced to 64 to ensure space for pairs
93
+ for (k, v) in chunk {
94
+ if i + 1 >= values.len() {
95
+ // Bulk insert current batch if array is full
96
+ hash.bulk_insert(&values[..i]).unwrap();
97
+ values[..i].fill(handle.qnil().as_value());
98
+ i = 0;
99
+ }
100
+ values[i] = handle.into_value(k);
101
+ let ary = handle.ary_new_capa(v.len());
102
+ v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
103
+ values[i + 1] = handle.into_value(ary);
104
+ i += 2;
105
+ }
106
+ // Insert any remaining pairs
107
+ if i > 0 {
108
+ hash.bulk_insert(&values[..i]).unwrap();
109
+ values[..i].fill(handle.qnil().as_value());
110
+ i = 0;
111
+ }
112
+ }
113
+
25
114
  hash.into_value_with(handle)
26
115
  }
27
116
  }
@@ -43,7 +132,7 @@ pub struct ParquetField(pub Field);
43
132
  impl IntoValue for ParquetField {
44
133
  fn into_value_with(self, handle: &Ruby) -> Value {
45
134
  match self.0 {
46
- Field::Byte(b) => b.into_value_with(handle),
135
+ Field::Null => handle.qnil().as_value(),
47
136
  Field::Bool(b) => b.into_value_with(handle),
48
137
  Field::Short(s) => s.into_value_with(handle),
49
138
  Field::Int(i) => i.into_value_with(handle),
@@ -56,10 +145,29 @@ impl IntoValue for ParquetField {
56
145
  Field::Float(f) => f.into_value_with(handle),
57
146
  Field::Double(d) => d.into_value_with(handle),
58
147
  Field::Str(s) => s.into_value_with(handle),
148
+ Field::Byte(b) => b.into_value_with(handle),
59
149
  Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
60
- Field::Date(d) => d.into_value_with(handle),
61
- Field::TimestampMillis(ts) => ts.into_value_with(handle),
62
- Field::TimestampMicros(ts) => ts.into_value_with(handle),
150
+ Field::Date(d) => {
151
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
152
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
153
+ formatted.into_value_with(handle)
154
+ }
155
+ Field::TimestampMillis(ts) => {
156
+ let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
157
+ let time_class = handle.class_time();
158
+ time_class
159
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
160
+ .unwrap()
161
+ .into_value_with(handle)
162
+ }
163
+ Field::TimestampMicros(ts) => {
164
+ let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
165
+ let time_class = handle.class_time();
166
+ time_class
167
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
168
+ .unwrap()
169
+ .into_value_with(handle)
170
+ }
63
171
  Field::ListInternal(list) => {
64
172
  let elements = list.elements();
65
173
  let ary = handle.ary_new_capa(elements.len());
@@ -83,8 +191,532 @@ impl IntoValue for ParquetField {
83
191
  .unwrap();
84
192
  hash.into_value_with(handle)
85
193
  }
86
- Field::Null => handle.qnil().as_value(),
87
- _ => panic!("Unsupported field type"),
194
+ Field::Decimal(d) => {
195
+ let value = match d {
196
+ Decimal::Int32 { value, scale, .. } => {
197
+ let unscaled = i32::from_be_bytes(value);
198
+ format!("{}e-{}", unscaled, scale)
199
+ }
200
+ Decimal::Int64 { value, scale, .. } => {
201
+ let unscaled = i64::from_be_bytes(value);
202
+ format!("{}e-{}", unscaled, scale)
203
+ }
204
+ Decimal::Bytes { value, scale, .. } => {
205
+ // Convert bytes to string representation of unscaled value
206
+ let unscaled = String::from_utf8_lossy(value.data());
207
+ format!("{}e-{}", unscaled, scale)
208
+ }
209
+ };
210
+ handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
211
+ }
212
+ Field::Group(row) => {
213
+ let hash = handle.hash_new();
214
+ row.get_column_iter()
215
+ .try_for_each(|(k, v)| {
216
+ hash.aset(
217
+ k.clone().into_value_with(handle),
218
+ ParquetField(v.clone()).into_value_with(handle),
219
+ )
220
+ })
221
+ .unwrap();
222
+ hash.into_value_with(handle)
223
+ }
224
+ }
225
+ }
226
+ }
227
+
228
+ #[allow(dead_code)]
229
+ #[derive(Clone, Debug)]
230
+ pub enum ParquetValue {
231
+ Int8(i8),
232
+ Int16(i16),
233
+ Int32(i32),
234
+ Int64(i64),
235
+ UInt8(u8),
236
+ UInt16(u16),
237
+ UInt32(u32),
238
+ UInt64(u64),
239
+ Float16(f32), // f16 converted to f32
240
+ Float32(f32),
241
+ Float64(f64),
242
+ Boolean(bool),
243
+ String(String),
244
+ Bytes(Vec<u8>),
245
+ Date32(i32),
246
+ Date64(i64),
247
+ TimestampSecond(i64, Option<Arc<str>>),
248
+ TimestampMillis(i64, Option<Arc<str>>),
249
+ TimestampMicros(i64, Option<Arc<str>>),
250
+ TimestampNanos(i64, Option<Arc<str>>),
251
+ List(Vec<ParquetValue>),
252
+ Map(HashMap<ParquetValue, ParquetValue>),
253
+ Null,
254
+ }
255
+
256
+ impl PartialEq for ParquetValue {
257
+ fn eq(&self, other: &Self) -> bool {
258
+ match (self, other) {
259
+ (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
260
+ (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
261
+ (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
262
+ (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
263
+ (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
264
+ (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
265
+ (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
266
+ (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
267
+ (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
268
+ (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
269
+ (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
270
+ (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
271
+ (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
272
+ (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
273
+ (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
274
+ (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
275
+ (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
276
+ (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
277
+ (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
278
+ (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
279
+ (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
280
+ (ParquetValue::Null, ParquetValue::Null) => true,
281
+ _ => false,
282
+ }
283
+ }
284
+ }
285
+
286
+ impl Eq for ParquetValue {}
287
+
288
+ #[derive(Debug)]
289
+ pub struct ParquetValueVec(Vec<ParquetValue>);
290
+
291
+ impl ParquetValueVec {
292
+ pub fn into_inner(self) -> Vec<ParquetValue> {
293
+ self.0
294
+ }
295
+ }
296
+
297
+ impl IntoIterator for ParquetValueVec {
298
+ type Item = ParquetValue;
299
+ type IntoIter = std::vec::IntoIter<ParquetValue>;
300
+
301
+ fn into_iter(self) -> Self::IntoIter {
302
+ self.0.into_iter()
303
+ }
304
+ }
305
+
306
+ impl std::cmp::PartialEq for ParquetValueVec {
307
+ fn eq(&self, other: &Self) -> bool {
308
+ self.0 == other.0
309
+ }
310
+ }
311
+
312
+ impl std::cmp::Eq for ParquetValueVec {}
313
+
314
+ impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
315
+ type Error = String;
316
+
317
+ fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
318
+ ParquetValueVec::try_from(&*column)
319
+ }
320
+ }
321
+
322
// Expands to an expression that downcasts `$column` to `$array_type` and
// collects its values as `ParquetValue::$variant`, emitting
// `ParquetValue::Null` for null slots. The null check is skipped entirely
// when the array reports itself as non-nullable.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        if !typed.is_nullable() {
            typed
                .values()
                .iter()
                .map(|item| ParquetValue::$variant(*item))
                .collect()
        } else {
            typed
                .values()
                .iter()
                .enumerate()
                .map(|(idx, item)| {
                    if !typed.is_null(idx) {
                        ParquetValue::$variant(*item)
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
348
+
349
// Boolean counterpart of the numeric conversion: the values iterator yields
// items by value here, so they are wrapped without dereferencing. Null slots
// become `ParquetValue::Null`.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        if !typed.is_nullable() {
            typed
                .values()
                .iter()
                .map(|flag| ParquetValue::$variant(flag))
                .collect()
        } else {
            typed
                .values()
                .iter()
                .enumerate()
                .map(|(idx, flag)| {
                    if !typed.is_null(idx) {
                        ParquetValue::$variant(flag)
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
375
+
376
// Timestamp counterpart of the numeric conversion: every non-null value is
// paired with a clone of `$tz`. Null slots become `ParquetValue::Null`.
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let typed = downcast_array::<$array_type>($column);
        if !typed.is_nullable() {
            typed
                .values()
                .iter()
                .map(|raw| ParquetValue::$variant(*raw, $tz.clone()))
                .collect()
        } else {
            typed
                .values()
                .iter()
                .enumerate()
                .map(|(idx, raw)| {
                    if !typed.is_null(idx) {
                        ParquetValue::$variant(*raw, $tz.clone())
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
402
+
403
+ impl TryFrom<&dyn Array> for ParquetValueVec {
404
+ type Error = String;
405
+
406
+ fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
407
+ let tmp_vec = match column.data_type() {
408
+ DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
409
+ DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
410
+ DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
411
+ DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
412
+ DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
413
+ DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
414
+ DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
415
+ DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
416
+ DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
417
+ DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
418
+ DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
419
+ DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
420
+ DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
421
+ DataType::Timestamp(TimeUnit::Second, tz) => {
422
+ impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
423
+ }
424
+ DataType::Timestamp(TimeUnit::Millisecond, tz) => {
425
+ impl_timestamp_array_conversion!(
426
+ column,
427
+ TimestampMillisecondArray,
428
+ TimestampMillis,
429
+ tz
430
+ )
431
+ }
432
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => {
433
+ impl_timestamp_array_conversion!(
434
+ column,
435
+ TimestampMicrosecondArray,
436
+ TimestampMicros,
437
+ tz
438
+ )
439
+ }
440
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
441
+ impl_timestamp_array_conversion!(
442
+ column,
443
+ TimestampNanosecondArray,
444
+ TimestampNanos,
445
+ tz
446
+ )
447
+ }
448
+ // Because f16 is unstable in Rust, we convert it to f32
449
+ DataType::Float16 => {
450
+ let array = downcast_array::<Float16Array>(column);
451
+ if array.is_nullable() {
452
+ array
453
+ .values()
454
+ .iter()
455
+ .enumerate()
456
+ .map(|(i, x)| {
457
+ if array.is_null(i) {
458
+ ParquetValue::Null
459
+ } else {
460
+ ParquetValue::Float16(f32::from(*x))
461
+ }
462
+ })
463
+ .collect()
464
+ } else {
465
+ array
466
+ .values()
467
+ .iter()
468
+ .map(|x| ParquetValue::Float16(f32::from(*x)))
469
+ .collect()
470
+ }
471
+ }
472
+ DataType::Utf8 => {
473
+ let array = downcast_array::<StringArray>(column);
474
+ array
475
+ .iter()
476
+ .map(|opt_x| match opt_x {
477
+ Some(x) => ParquetValue::String(x.to_string()),
478
+ None => ParquetValue::Null,
479
+ })
480
+ .collect()
481
+ }
482
+ DataType::Binary => {
483
+ let array = downcast_array::<BinaryArray>(column);
484
+ array
485
+ .iter()
486
+ .map(|opt_x| match opt_x {
487
+ Some(x) => ParquetValue::Bytes(x.to_vec()),
488
+ None => ParquetValue::Null,
489
+ })
490
+ .collect()
491
+ }
492
+ DataType::List(_field) => {
493
+ let list_array = downcast_array::<ListArray>(column);
494
+ list_array
495
+ .iter()
496
+ .map(|x| match x {
497
+ Some(values) => match ParquetValueVec::try_from(values) {
498
+ Ok(vec) => ParquetValue::List(vec.into_inner()),
499
+ Err(e) => {
500
+ panic!("Error converting list array to ParquetValueVec: {}", e)
501
+ }
502
+ },
503
+ None => ParquetValue::Null,
504
+ })
505
+ .collect()
506
+ }
507
+ DataType::Struct(_) => {
508
+ let struct_array = downcast_array::<StructArray>(column);
509
+ let mut values = Vec::with_capacity(struct_array.len());
510
+ for i in 0..struct_array.len() {
511
+ if struct_array.is_null(i) {
512
+ values.push(ParquetValue::Null);
513
+ continue;
514
+ }
515
+
516
+ let mut map = std::collections::HashMap::new();
517
+ for (field_idx, field) in struct_array.fields().iter().enumerate() {
518
+ let column = struct_array.column(field_idx);
519
+ let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
520
+ Ok(vec) => vec.into_inner(),
521
+ Err(e) => {
522
+ panic!("Error converting struct field to ParquetValueVec: {}", e)
523
+ }
524
+ };
525
+ map.insert(
526
+ ParquetValue::String(field.name().to_string()),
527
+ field_values.into_iter().next().unwrap(),
528
+ );
529
+ }
530
+ values.push(ParquetValue::Map(map));
531
+ }
532
+ values
533
+ }
534
+ DataType::Null => {
535
+ let x = downcast_array::<NullArray>(column);
536
+ vec![ParquetValue::Null; x.len()]
537
+ }
538
+ _ => {
539
+ return Err(format!("Unsupported data type: {:?}", column.data_type()));
540
+ }
541
+ };
542
+ Ok(ParquetValueVec(tmp_vec))
543
+ }
544
+ }
545
+
546
+ impl std::hash::Hash for ParquetValue {
547
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
548
+ match self {
549
+ ParquetValue::Int8(i) => i.hash(state),
550
+ ParquetValue::Int16(i) => i.hash(state),
551
+ ParquetValue::Int32(i) => i.hash(state),
552
+ ParquetValue::Int64(i) => i.hash(state),
553
+ ParquetValue::UInt8(i) => i.hash(state),
554
+ ParquetValue::UInt16(i) => i.hash(state),
555
+ ParquetValue::UInt32(i) => i.hash(state),
556
+ ParquetValue::UInt64(i) => i.hash(state),
557
+ ParquetValue::Float16(f) => f.to_bits().hash(state),
558
+ ParquetValue::Float32(f) => f.to_bits().hash(state),
559
+ ParquetValue::Float64(f) => f.to_bits().hash(state),
560
+ ParquetValue::Boolean(b) => b.hash(state),
561
+ ParquetValue::String(s) => s.hash(state),
562
+ ParquetValue::Bytes(b) => b.hash(state),
563
+ ParquetValue::Date32(d) => d.hash(state),
564
+ ParquetValue::Date64(d) => d.hash(state),
565
+ ParquetValue::TimestampSecond(ts, tz) => {
566
+ ts.hash(state);
567
+ tz.hash(state);
568
+ }
569
+ ParquetValue::TimestampMillis(ts, tz) => {
570
+ ts.hash(state);
571
+ tz.hash(state);
572
+ }
573
+ ParquetValue::TimestampMicros(ts, tz) => {
574
+ ts.hash(state);
575
+ tz.hash(state);
576
+ }
577
+ ParquetValue::TimestampNanos(ts, tz) => {
578
+ ts.hash(state);
579
+ tz.hash(state);
580
+ }
581
+ ParquetValue::List(l) => l.hash(state),
582
+ ParquetValue::Map(_m) => panic!("Map is not hashable"),
583
+ ParquetValue::Null => 0_i32.hash(state),
584
+ }
585
+ }
586
+ }
587
+
588
+ impl IntoValue for ParquetValue {
589
+ fn into_value_with(self, handle: &Ruby) -> Value {
590
+ match self {
591
+ ParquetValue::Int8(i) => i.into_value_with(handle),
592
+ ParquetValue::Int16(i) => i.into_value_with(handle),
593
+ ParquetValue::Int32(i) => i.into_value_with(handle),
594
+ ParquetValue::Int64(i) => i.into_value_with(handle),
595
+ ParquetValue::UInt8(i) => i.into_value_with(handle),
596
+ ParquetValue::UInt16(i) => i.into_value_with(handle),
597
+ ParquetValue::UInt32(i) => i.into_value_with(handle),
598
+ ParquetValue::UInt64(i) => i.into_value_with(handle),
599
+ ParquetValue::Float16(f) => f.into_value_with(handle),
600
+ ParquetValue::Float32(f) => f.into_value_with(handle),
601
+ ParquetValue::Float64(f) => f.into_value_with(handle),
602
+ ParquetValue::Boolean(b) => b.into_value_with(handle),
603
+ ParquetValue::String(s) => s.into_value_with(handle),
604
+ ParquetValue::Bytes(b) => b.into_value_with(handle),
605
+ ParquetValue::Date32(d) => {
606
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
607
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
608
+ formatted.into_value_with(handle)
609
+ }
610
+ ParquetValue::Date64(d) => {
611
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
612
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
613
+ formatted.into_value_with(handle)
614
+ }
615
+ ParquetValue::TimestampSecond(ts, tz) => {
616
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
617
+ let time_class = handle.class_time();
618
+ time_class
619
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
620
+ .unwrap()
621
+ .into_value_with(handle)
622
+ }
623
+ ParquetValue::TimestampMillis(ts, tz) => {
624
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
625
+ let time_class = handle.class_time();
626
+ time_class
627
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
628
+ .unwrap()
629
+ .into_value_with(handle)
630
+ }
631
+ ParquetValue::TimestampMicros(ts, tz) => {
632
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
633
+ let time_class = handle.class_time();
634
+ time_class
635
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
636
+ .unwrap()
637
+ .into_value_with(handle)
638
+ }
639
+ ParquetValue::TimestampNanos(ts, tz) => {
640
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
641
+ let time_class = handle.class_time();
642
+ time_class
643
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
644
+ .unwrap()
645
+ .into_value_with(handle)
646
+ }
647
+ ParquetValue::List(l) => {
648
+ let ary = handle.ary_new_capa(l.len());
649
+ l.into_iter()
650
+ .try_for_each(|v| ary.push(v.into_value_with(handle)))
651
+ .unwrap();
652
+ ary.into_value_with(handle)
653
+ }
654
+ ParquetValue::Map(m) => {
655
+ let hash = handle.hash_new_capa(m.len());
656
+ m.into_iter()
657
+ .try_for_each(|(k, v)| {
658
+ hash.aset(k.into_value_with(handle), v.into_value_with(handle))
659
+ })
660
+ .unwrap();
661
+ hash.into_value_with(handle)
662
+ }
663
+ ParquetValue::Null => handle.qnil().as_value(),
664
+ }
665
+ }
666
+ }
667
+
668
+ fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
669
+ let (ts, tz) = match value {
670
+ ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
671
+ ParquetValue::TimestampMillis(ts, tz) => {
672
+ (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
673
+ }
674
+ ParquetValue::TimestampMicros(ts, tz) => {
675
+ (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
676
+ }
677
+ ParquetValue::TimestampNanos(ts, tz) => {
678
+ (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
679
+ }
680
+ _ => panic!("Invalid timestamp value"),
681
+ };
682
+
683
+ // If timezone is provided, convert to zoned timestamp
684
+ if let Some(tz) = tz {
685
+ // Handle fixed offset timezones like "+09:00" first
686
+ if tz.starts_with('+') || tz.starts_with('-') {
687
+ // Parse the offset string into hours and minutes
688
+ let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
689
+ // Format: "+09:00" or "-09:00"
690
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
691
+ let m = tz[4..6].parse::<i32>().unwrap_or(0);
692
+ (h, m)
693
+ } else if tz.len() >= 3 {
694
+ // Format: "+09" or "-09"
695
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
696
+ (h, 0)
697
+ } else {
698
+ (0, 0)
699
+ };
700
+
701
+ // Apply sign
702
+ let total_minutes = if tz.starts_with('-') {
703
+ -(hours * 60 + minutes)
704
+ } else {
705
+ hours * 60 + minutes
706
+ };
707
+
708
+ // Create fixed timezone
709
+ let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
710
+ ts.to_zoned(tz).timestamp()
711
+ } else {
712
+ // Try IANA timezone
713
+ match ts.intz(&tz) {
714
+ Ok(zoned) => zoned.timestamp(),
715
+ Err(_) => ts, // Fall back to UTC if timezone is invalid
716
+ }
88
717
  }
718
+ } else {
719
+ // No timezone provided - treat as UTC
720
+ ts
89
721
  }
90
722
  }