parquet 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,116 @@
1
- use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
1
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
2
2
 
3
+ use arrow_array::cast::downcast_array;
4
+ use arrow_array::{
5
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
6
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
7
+ StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
8
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
9
+ };
10
+ use arrow_schema::{DataType, TimeUnit};
11
+ use itertools::Itertools;
3
12
  use magnus::{value::ReprValue, IntoValue, Ruby, Value};
13
+ use parquet::data_type::Decimal;
4
14
  use parquet::record::Field;
5
15
 
16
+ use crate::header_cache::StringCacheKey;
17
+
6
18
  #[derive(Debug)]
7
- pub enum Record<S: BuildHasher + Default> {
19
+ pub enum RowRecord<S: BuildHasher + Default> {
8
20
  Vec(Vec<ParquetField>),
9
- Map(HashMap<&'static str, ParquetField, S>),
21
+ Map(HashMap<StringCacheKey, ParquetField, S>),
10
22
  }
11
23
 
12
- impl<S: BuildHasher + Default> IntoValue for Record<S> {
24
+ #[derive(Debug)]
25
+ pub enum ColumnRecord<S: BuildHasher + Default> {
26
+ Vec(Vec<Vec<ParquetValue>>),
27
+ Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
28
+ }
29
+
30
+ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
13
31
  fn into_value_with(self, handle: &Ruby) -> Value {
14
32
  match self {
15
- Record::Vec(vec) => {
33
+ RowRecord::Vec(vec) => {
16
34
  let ary = handle.ary_new_capa(vec.len());
17
35
  vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
18
- ary.into_value_with(handle)
36
+ handle.into_value(ary)
19
37
  }
20
- Record::Map(map) => {
38
+ RowRecord::Map(map) => {
21
39
  let hash = handle.hash_new_capa(map.len());
22
- map.into_iter()
23
- .try_for_each(|(k, v)| hash.aset(k, v))
40
+
41
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
42
+ let mut i = 0;
43
+
44
+ for chunk in &map.into_iter().chunks(64) {
45
+ // Reduced to 64 to ensure space for pairs
46
+ for (k, v) in chunk {
47
+ if i + 1 >= values.len() {
48
+ // Bulk insert current batch if array is full
49
+ hash.bulk_insert(&values[..i]).unwrap();
50
+ values[..i].fill(handle.qnil().as_value());
51
+ i = 0;
52
+ }
53
+ values[i] = handle.into_value(k);
54
+ values[i + 1] = handle.into_value(v);
55
+ i += 2;
56
+ }
57
+ // Insert any remaining pairs
58
+ if i > 0 {
59
+ hash.bulk_insert(&values[..i]).unwrap();
60
+ values[..i].fill(handle.qnil().as_value());
61
+ i = 0;
62
+ }
63
+ }
64
+
65
+ hash.into_value_with(handle)
66
+ }
67
+ }
68
+ }
69
+ }
70
+
71
+ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
72
+ fn into_value_with(self, handle: &Ruby) -> Value {
73
+ match self {
74
+ ColumnRecord::Vec(vec) => {
75
+ let ary = handle.ary_new_capa(vec.len());
76
+ vec.into_iter()
77
+ .try_for_each(|v| {
78
+ let nested_ary = handle.ary_new_capa(v.len());
79
+ v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
80
+ ary.push(nested_ary.into_value_with(handle))
81
+ })
24
82
  .unwrap();
83
+ ary.into_value_with(handle)
84
+ }
85
+ ColumnRecord::Map(map) => {
86
+ let hash = handle.hash_new_capa(map.len());
87
+
88
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
89
+ let mut i = 0;
90
+
91
+ for chunk in &map.into_iter().chunks(64) {
92
+ // Reduced to 64 to ensure space for pairs
93
+ for (k, v) in chunk {
94
+ if i + 1 >= values.len() {
95
+ // Bulk insert current batch if array is full
96
+ hash.bulk_insert(&values[..i]).unwrap();
97
+ values[..i].fill(handle.qnil().as_value());
98
+ i = 0;
99
+ }
100
+ values[i] = handle.into_value(k);
101
+ let ary = handle.ary_new_capa(v.len());
102
+ v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
103
+ values[i + 1] = handle.into_value(ary);
104
+ i += 2;
105
+ }
106
+ // Insert any remaining pairs
107
+ if i > 0 {
108
+ hash.bulk_insert(&values[..i]).unwrap();
109
+ values[..i].fill(handle.qnil().as_value());
110
+ i = 0;
111
+ }
112
+ }
113
+
25
114
  hash.into_value_with(handle)
26
115
  }
27
116
  }
@@ -43,7 +132,7 @@ pub struct ParquetField(pub Field);
43
132
  impl IntoValue for ParquetField {
44
133
  fn into_value_with(self, handle: &Ruby) -> Value {
45
134
  match self.0 {
46
- Field::Byte(b) => b.into_value_with(handle),
135
+ Field::Null => handle.qnil().as_value(),
47
136
  Field::Bool(b) => b.into_value_with(handle),
48
137
  Field::Short(s) => s.into_value_with(handle),
49
138
  Field::Int(i) => i.into_value_with(handle),
@@ -56,10 +145,29 @@ impl IntoValue for ParquetField {
56
145
  Field::Float(f) => f.into_value_with(handle),
57
146
  Field::Double(d) => d.into_value_with(handle),
58
147
  Field::Str(s) => s.into_value_with(handle),
148
+ Field::Byte(b) => b.into_value_with(handle),
59
149
  Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
60
- Field::Date(d) => d.into_value_with(handle),
61
- Field::TimestampMillis(ts) => ts.into_value_with(handle),
62
- Field::TimestampMicros(ts) => ts.into_value_with(handle),
150
+ Field::Date(d) => {
151
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
152
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
153
+ formatted.into_value_with(handle)
154
+ }
155
+ Field::TimestampMillis(ts) => {
156
+ let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
157
+ let time_class = handle.class_time();
158
+ time_class
159
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
160
+ .unwrap()
161
+ .into_value_with(handle)
162
+ }
163
+ Field::TimestampMicros(ts) => {
164
+ let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
165
+ let time_class = handle.class_time();
166
+ time_class
167
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
168
+ .unwrap()
169
+ .into_value_with(handle)
170
+ }
63
171
  Field::ListInternal(list) => {
64
172
  let elements = list.elements();
65
173
  let ary = handle.ary_new_capa(elements.len());
@@ -83,8 +191,532 @@ impl IntoValue for ParquetField {
83
191
  .unwrap();
84
192
  hash.into_value_with(handle)
85
193
  }
86
- Field::Null => handle.qnil().as_value(),
87
- _ => panic!("Unsupported field type"),
194
+ Field::Decimal(d) => {
195
+ let value = match d {
196
+ Decimal::Int32 { value, scale, .. } => {
197
+ let unscaled = i32::from_be_bytes(value);
198
+ format!("{}e-{}", unscaled, scale)
199
+ }
200
+ Decimal::Int64 { value, scale, .. } => {
201
+ let unscaled = i64::from_be_bytes(value);
202
+ format!("{}e-{}", unscaled, scale)
203
+ }
204
+ Decimal::Bytes { value, scale, .. } => {
205
+ // Convert bytes to string representation of unscaled value
206
+ let unscaled = String::from_utf8_lossy(value.data());
207
+ format!("{}e-{}", unscaled, scale)
208
+ }
209
+ };
210
+ handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
211
+ }
212
+ Field::Group(row) => {
213
+ let hash = handle.hash_new();
214
+ row.get_column_iter()
215
+ .try_for_each(|(k, v)| {
216
+ hash.aset(
217
+ k.clone().into_value_with(handle),
218
+ ParquetField(v.clone()).into_value_with(handle),
219
+ )
220
+ })
221
+ .unwrap();
222
+ hash.into_value_with(handle)
223
+ }
224
+ }
225
+ }
226
+ }
227
+
228
/// A single value extracted from an Arrow array, in a form that can be handed to Ruby.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub enum ParquetValue {
    Int8(i8),
    Int16(i16),
    Int32(i32),
    Int64(i64),
    UInt8(u8),
    UInt16(u16),
    UInt32(u32),
    UInt64(u64),
    /// Half-precision float, stored widened to `f32`.
    Float16(f32),
    Float32(f32),
    Float64(f64),
    Boolean(bool),
    String(String),
    Bytes(Vec<u8>),
    /// Raw value of an Arrow `Date32` cell.
    Date32(i32),
    /// Raw value of an Arrow `Date64` cell.
    Date64(i64),
    /// Timestamp in seconds, with the optional time zone carried by the Arrow data type.
    TimestampSecond(i64, Option<Arc<str>>),
    /// Timestamp in milliseconds, with the optional time zone carried by the Arrow data type.
    TimestampMillis(i64, Option<Arc<str>>),
    /// Timestamp in microseconds, with the optional time zone carried by the Arrow data type.
    TimestampMicros(i64, Option<Arc<str>>),
    /// Timestamp in nanoseconds, with the optional time zone carried by the Arrow data type.
    TimestampNanos(i64, Option<Arc<str>>),
    /// Values of one list cell.
    List(Vec<ParquetValue>),
    /// Key/value pairs; struct rows are stored this way, keyed by field name.
    Map(HashMap<ParquetValue, ParquetValue>),
    /// A null slot.
    Null,
}
255
+
256
+ impl PartialEq for ParquetValue {
257
+ fn eq(&self, other: &Self) -> bool {
258
+ match (self, other) {
259
+ (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
260
+ (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
261
+ (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
262
+ (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
263
+ (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
264
+ (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
265
+ (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
266
+ (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
267
+ (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
268
+ (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
269
+ (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
270
+ (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
271
+ (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
272
+ (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
273
+ (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
274
+ (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
275
+ (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
276
+ (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
277
+ (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
278
+ (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
279
+ (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
280
+ (ParquetValue::Null, ParquetValue::Null) => true,
281
+ _ => false,
282
+ }
283
+ }
284
+ }
285
+
286
+ impl Eq for ParquetValue {}
287
+
288
+ #[derive(Debug)]
289
+ pub struct ParquetValueVec(Vec<ParquetValue>);
290
+
291
+ impl ParquetValueVec {
292
+ pub fn into_inner(self) -> Vec<ParquetValue> {
293
+ self.0
294
+ }
295
+ }
296
+
297
+ impl IntoIterator for ParquetValueVec {
298
+ type Item = ParquetValue;
299
+ type IntoIter = std::vec::IntoIter<ParquetValue>;
300
+
301
+ fn into_iter(self) -> Self::IntoIter {
302
+ self.0.into_iter()
303
+ }
304
+ }
305
+
306
+ impl std::cmp::PartialEq for ParquetValueVec {
307
+ fn eq(&self, other: &Self) -> bool {
308
+ self.0 == other.0
309
+ }
310
+ }
311
+
312
+ impl std::cmp::Eq for ParquetValueVec {}
313
+
314
+ impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
315
+ type Error = String;
316
+
317
+ fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
318
+ ParquetValueVec::try_from(&*column)
319
+ }
320
+ }
321
+
322
// Downcasts `$column` to `$array_type` and wraps every raw value in `ParquetValue::$variant`,
// emitting `ParquetValue::Null` for null slots. The expansion ends in `.collect()`, so the
// resulting collection type is picked by the call site.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        if !array.is_nullable() {
            // `is_nullable()` is false: wrap the values without per-slot null checks.
            array
                .values()
                .iter()
                .map(|raw| ParquetValue::$variant(*raw))
                .collect()
        } else {
            array
                .values()
                .iter()
                .zip(0usize..)
                .map(|(raw, idx)| {
                    if !array.is_null(idx) {
                        ParquetValue::$variant(*raw)
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
348
+
349
// Boolean counterpart of the numeric conversion: downcasts `$column` to `$array_type` and wraps
// every value in `ParquetValue::$variant`, emitting `ParquetValue::Null` for null slots.
// The value iterator yields items by value here, so no dereference is needed.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        if !array.is_nullable() {
            // `is_nullable()` is false: wrap the values without per-slot null checks.
            array
                .values()
                .iter()
                .map(|flag| ParquetValue::$variant(flag))
                .collect()
        } else {
            array
                .values()
                .iter()
                .zip(0usize..)
                .map(|(flag, idx)| {
                    if !array.is_null(idx) {
                        ParquetValue::$variant(flag)
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
375
+
376
// Timestamp counterpart of the numeric conversion: every non-null raw value is wrapped in
// `ParquetValue::$variant` together with a clone of `$tz`; null slots become
// `ParquetValue::Null`.
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let array = downcast_array::<$array_type>($column);
        if !array.is_nullable() {
            // `is_nullable()` is false: wrap the values without per-slot null checks.
            array
                .values()
                .iter()
                .map(|raw| ParquetValue::$variant(*raw, $tz.clone()))
                .collect()
        } else {
            array
                .values()
                .iter()
                .zip(0usize..)
                .map(|(raw, idx)| {
                    if !array.is_null(idx) {
                        ParquetValue::$variant(*raw, $tz.clone())
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
402
+
403
+ impl TryFrom<&dyn Array> for ParquetValueVec {
404
+ type Error = String;
405
+
406
+ fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
407
+ let tmp_vec = match column.data_type() {
408
+ DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
409
+ DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
410
+ DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
411
+ DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
412
+ DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
413
+ DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
414
+ DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
415
+ DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
416
+ DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
417
+ DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
418
+ DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
419
+ DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
420
+ DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
421
+ DataType::Timestamp(TimeUnit::Second, tz) => {
422
+ impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
423
+ }
424
+ DataType::Timestamp(TimeUnit::Millisecond, tz) => {
425
+ impl_timestamp_array_conversion!(
426
+ column,
427
+ TimestampMillisecondArray,
428
+ TimestampMillis,
429
+ tz
430
+ )
431
+ }
432
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => {
433
+ impl_timestamp_array_conversion!(
434
+ column,
435
+ TimestampMicrosecondArray,
436
+ TimestampMicros,
437
+ tz
438
+ )
439
+ }
440
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
441
+ impl_timestamp_array_conversion!(
442
+ column,
443
+ TimestampNanosecondArray,
444
+ TimestampNanos,
445
+ tz
446
+ )
447
+ }
448
+ // Because f16 is unstable in Rust, we convert it to f32
449
+ DataType::Float16 => {
450
+ let array = downcast_array::<Float16Array>(column);
451
+ if array.is_nullable() {
452
+ array
453
+ .values()
454
+ .iter()
455
+ .enumerate()
456
+ .map(|(i, x)| {
457
+ if array.is_null(i) {
458
+ ParquetValue::Null
459
+ } else {
460
+ ParquetValue::Float16(f32::from(*x))
461
+ }
462
+ })
463
+ .collect()
464
+ } else {
465
+ array
466
+ .values()
467
+ .iter()
468
+ .map(|x| ParquetValue::Float16(f32::from(*x)))
469
+ .collect()
470
+ }
471
+ }
472
+ DataType::Utf8 => {
473
+ let array = downcast_array::<StringArray>(column);
474
+ array
475
+ .iter()
476
+ .map(|opt_x| match opt_x {
477
+ Some(x) => ParquetValue::String(x.to_string()),
478
+ None => ParquetValue::Null,
479
+ })
480
+ .collect()
481
+ }
482
+ DataType::Binary => {
483
+ let array = downcast_array::<BinaryArray>(column);
484
+ array
485
+ .iter()
486
+ .map(|opt_x| match opt_x {
487
+ Some(x) => ParquetValue::Bytes(x.to_vec()),
488
+ None => ParquetValue::Null,
489
+ })
490
+ .collect()
491
+ }
492
+ DataType::List(_field) => {
493
+ let list_array = downcast_array::<ListArray>(column);
494
+ list_array
495
+ .iter()
496
+ .map(|x| match x {
497
+ Some(values) => match ParquetValueVec::try_from(values) {
498
+ Ok(vec) => ParquetValue::List(vec.into_inner()),
499
+ Err(e) => {
500
+ panic!("Error converting list array to ParquetValueVec: {}", e)
501
+ }
502
+ },
503
+ None => ParquetValue::Null,
504
+ })
505
+ .collect()
506
+ }
507
+ DataType::Struct(_) => {
508
+ let struct_array = downcast_array::<StructArray>(column);
509
+ let mut values = Vec::with_capacity(struct_array.len());
510
+ for i in 0..struct_array.len() {
511
+ if struct_array.is_null(i) {
512
+ values.push(ParquetValue::Null);
513
+ continue;
514
+ }
515
+
516
+ let mut map = std::collections::HashMap::new();
517
+ for (field_idx, field) in struct_array.fields().iter().enumerate() {
518
+ let column = struct_array.column(field_idx);
519
+ let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
520
+ Ok(vec) => vec.into_inner(),
521
+ Err(e) => {
522
+ panic!("Error converting struct field to ParquetValueVec: {}", e)
523
+ }
524
+ };
525
+ map.insert(
526
+ ParquetValue::String(field.name().to_string()),
527
+ field_values.into_iter().next().unwrap(),
528
+ );
529
+ }
530
+ values.push(ParquetValue::Map(map));
531
+ }
532
+ values
533
+ }
534
+ DataType::Null => {
535
+ let x = downcast_array::<NullArray>(column);
536
+ vec![ParquetValue::Null; x.len()]
537
+ }
538
+ _ => {
539
+ return Err(format!("Unsupported data type: {:?}", column.data_type()));
540
+ }
541
+ };
542
+ Ok(ParquetValueVec(tmp_vec))
543
+ }
544
+ }
545
+
546
+ impl std::hash::Hash for ParquetValue {
547
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
548
+ match self {
549
+ ParquetValue::Int8(i) => i.hash(state),
550
+ ParquetValue::Int16(i) => i.hash(state),
551
+ ParquetValue::Int32(i) => i.hash(state),
552
+ ParquetValue::Int64(i) => i.hash(state),
553
+ ParquetValue::UInt8(i) => i.hash(state),
554
+ ParquetValue::UInt16(i) => i.hash(state),
555
+ ParquetValue::UInt32(i) => i.hash(state),
556
+ ParquetValue::UInt64(i) => i.hash(state),
557
+ ParquetValue::Float16(f) => f.to_bits().hash(state),
558
+ ParquetValue::Float32(f) => f.to_bits().hash(state),
559
+ ParquetValue::Float64(f) => f.to_bits().hash(state),
560
+ ParquetValue::Boolean(b) => b.hash(state),
561
+ ParquetValue::String(s) => s.hash(state),
562
+ ParquetValue::Bytes(b) => b.hash(state),
563
+ ParquetValue::Date32(d) => d.hash(state),
564
+ ParquetValue::Date64(d) => d.hash(state),
565
+ ParquetValue::TimestampSecond(ts, tz) => {
566
+ ts.hash(state);
567
+ tz.hash(state);
568
+ }
569
+ ParquetValue::TimestampMillis(ts, tz) => {
570
+ ts.hash(state);
571
+ tz.hash(state);
572
+ }
573
+ ParquetValue::TimestampMicros(ts, tz) => {
574
+ ts.hash(state);
575
+ tz.hash(state);
576
+ }
577
+ ParquetValue::TimestampNanos(ts, tz) => {
578
+ ts.hash(state);
579
+ tz.hash(state);
580
+ }
581
+ ParquetValue::List(l) => l.hash(state),
582
+ ParquetValue::Map(_m) => panic!("Map is not hashable"),
583
+ ParquetValue::Null => 0_i32.hash(state),
584
+ }
585
+ }
586
+ }
587
+
588
+ impl IntoValue for ParquetValue {
589
+ fn into_value_with(self, handle: &Ruby) -> Value {
590
+ match self {
591
+ ParquetValue::Int8(i) => i.into_value_with(handle),
592
+ ParquetValue::Int16(i) => i.into_value_with(handle),
593
+ ParquetValue::Int32(i) => i.into_value_with(handle),
594
+ ParquetValue::Int64(i) => i.into_value_with(handle),
595
+ ParquetValue::UInt8(i) => i.into_value_with(handle),
596
+ ParquetValue::UInt16(i) => i.into_value_with(handle),
597
+ ParquetValue::UInt32(i) => i.into_value_with(handle),
598
+ ParquetValue::UInt64(i) => i.into_value_with(handle),
599
+ ParquetValue::Float16(f) => f.into_value_with(handle),
600
+ ParquetValue::Float32(f) => f.into_value_with(handle),
601
+ ParquetValue::Float64(f) => f.into_value_with(handle),
602
+ ParquetValue::Boolean(b) => b.into_value_with(handle),
603
+ ParquetValue::String(s) => s.into_value_with(handle),
604
+ ParquetValue::Bytes(b) => b.into_value_with(handle),
605
+ ParquetValue::Date32(d) => {
606
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
607
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
608
+ formatted.into_value_with(handle)
609
+ }
610
+ ParquetValue::Date64(d) => {
611
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
612
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
613
+ formatted.into_value_with(handle)
614
+ }
615
+ ParquetValue::TimestampSecond(ts, tz) => {
616
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
617
+ let time_class = handle.class_time();
618
+ time_class
619
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
620
+ .unwrap()
621
+ .into_value_with(handle)
622
+ }
623
+ ParquetValue::TimestampMillis(ts, tz) => {
624
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
625
+ let time_class = handle.class_time();
626
+ time_class
627
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
628
+ .unwrap()
629
+ .into_value_with(handle)
630
+ }
631
+ ParquetValue::TimestampMicros(ts, tz) => {
632
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
633
+ let time_class = handle.class_time();
634
+ time_class
635
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
636
+ .unwrap()
637
+ .into_value_with(handle)
638
+ }
639
+ ParquetValue::TimestampNanos(ts, tz) => {
640
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
641
+ let time_class = handle.class_time();
642
+ time_class
643
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
644
+ .unwrap()
645
+ .into_value_with(handle)
646
+ }
647
+ ParquetValue::List(l) => {
648
+ let ary = handle.ary_new_capa(l.len());
649
+ l.into_iter()
650
+ .try_for_each(|v| ary.push(v.into_value_with(handle)))
651
+ .unwrap();
652
+ ary.into_value_with(handle)
653
+ }
654
+ ParquetValue::Map(m) => {
655
+ let hash = handle.hash_new_capa(m.len());
656
+ m.into_iter()
657
+ .try_for_each(|(k, v)| {
658
+ hash.aset(k.into_value_with(handle), v.into_value_with(handle))
659
+ })
660
+ .unwrap();
661
+ hash.into_value_with(handle)
662
+ }
663
+ ParquetValue::Null => handle.qnil().as_value(),
664
+ }
665
+ }
666
+ }
667
+
668
+ fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
669
+ let (ts, tz) = match value {
670
+ ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
671
+ ParquetValue::TimestampMillis(ts, tz) => {
672
+ (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
673
+ }
674
+ ParquetValue::TimestampMicros(ts, tz) => {
675
+ (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
676
+ }
677
+ ParquetValue::TimestampNanos(ts, tz) => {
678
+ (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
679
+ }
680
+ _ => panic!("Invalid timestamp value"),
681
+ };
682
+
683
+ // If timezone is provided, convert to zoned timestamp
684
+ if let Some(tz) = tz {
685
+ // Handle fixed offset timezones like "+09:00" first
686
+ if tz.starts_with('+') || tz.starts_with('-') {
687
+ // Parse the offset string into hours and minutes
688
+ let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
689
+ // Format: "+09:00" or "-09:00"
690
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
691
+ let m = tz[4..6].parse::<i32>().unwrap_or(0);
692
+ (h, m)
693
+ } else if tz.len() >= 3 {
694
+ // Format: "+09" or "-09"
695
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
696
+ (h, 0)
697
+ } else {
698
+ (0, 0)
699
+ };
700
+
701
+ // Apply sign
702
+ let total_minutes = if tz.starts_with('-') {
703
+ -(hours * 60 + minutes)
704
+ } else {
705
+ hours * 60 + minutes
706
+ };
707
+
708
+ // Create fixed timezone
709
+ let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
710
+ ts.to_zoned(tz).timestamp()
711
+ } else {
712
+ // Try IANA timezone
713
+ match ts.intz(&tz) {
714
+ Ok(zoned) => zoned.timestamp(),
715
+ Err(_) => ts, // Fall back to UTC if timezone is invalid
716
+ }
88
717
  }
718
+ } else {
719
+ // No timezone provided - treat as UTC
720
+ ts
89
721
  }
90
722
  }