parquet 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,722 @@
1
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
2
+
3
+ use arrow_array::cast::downcast_array;
4
+ use arrow_array::{
5
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
6
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
7
+ StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
8
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
9
+ };
10
+ use arrow_schema::{DataType, TimeUnit};
11
+ use itertools::Itertools;
12
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
13
+ use parquet::data_type::Decimal;
14
+ use parquet::record::Field;
15
+
16
+ use crate::header_cache::StringCacheKey;
17
+
18
/// A single parquet row materialised for handoff to Ruby.
///
/// `Vec` preserves column order as a plain list of field values; `Map` keys
/// each value by its (cached) column name. `S` lets callers pick the hasher
/// used by the map form.
#[derive(Debug)]
pub enum RowRecord<S: BuildHasher + Default> {
    // Positional form: one entry per column, in schema order.
    Vec(Vec<ParquetField>),
    // Keyed form: column name -> field value.
    Map(HashMap<StringCacheKey, ParquetField, S>),
}
23
+
24
/// A batch of parquet data in columnar orientation for handoff to Ruby.
///
/// `Vec` is a list of columns (each itself a list of values); `Map` keys each
/// column's values by its (cached) column name. `S` lets callers pick the
/// hasher used by the map form.
#[derive(Debug)]
pub enum ColumnRecord<S: BuildHasher + Default> {
    // Positional form: outer Vec is columns, inner Vec is that column's values.
    Vec(Vec<Vec<ParquetValue>>),
    // Keyed form: column name -> column values.
    Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
}
29
+
30
+ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
31
+ fn into_value_with(self, handle: &Ruby) -> Value {
32
+ match self {
33
+ RowRecord::Vec(vec) => {
34
+ let ary = handle.ary_new_capa(vec.len());
35
+ vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
36
+ handle.into_value(ary)
37
+ }
38
+ RowRecord::Map(map) => {
39
+ let hash = handle.hash_new_capa(map.len());
40
+
41
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
42
+ let mut i = 0;
43
+
44
+ for chunk in &map.into_iter().chunks(64) {
45
+ // Reduced to 64 to ensure space for pairs
46
+ for (k, v) in chunk {
47
+ if i + 1 >= values.len() {
48
+ // Bulk insert current batch if array is full
49
+ hash.bulk_insert(&values[..i]).unwrap();
50
+ values[..i].fill(handle.qnil().as_value());
51
+ i = 0;
52
+ }
53
+ values[i] = handle.into_value(k);
54
+ values[i + 1] = handle.into_value(v);
55
+ i += 2;
56
+ }
57
+ // Insert any remaining pairs
58
+ if i > 0 {
59
+ hash.bulk_insert(&values[..i]).unwrap();
60
+ values[..i].fill(handle.qnil().as_value());
61
+ i = 0;
62
+ }
63
+ }
64
+
65
+ hash.into_value_with(handle)
66
+ }
67
+ }
68
+ }
69
+ }
70
+
71
+ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
72
+ fn into_value_with(self, handle: &Ruby) -> Value {
73
+ match self {
74
+ ColumnRecord::Vec(vec) => {
75
+ let ary = handle.ary_new_capa(vec.len());
76
+ vec.into_iter()
77
+ .try_for_each(|v| {
78
+ let nested_ary = handle.ary_new_capa(v.len());
79
+ v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
80
+ ary.push(nested_ary.into_value_with(handle))
81
+ })
82
+ .unwrap();
83
+ ary.into_value_with(handle)
84
+ }
85
+ ColumnRecord::Map(map) => {
86
+ let hash = handle.hash_new_capa(map.len());
87
+
88
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
89
+ let mut i = 0;
90
+
91
+ for chunk in &map.into_iter().chunks(64) {
92
+ // Reduced to 64 to ensure space for pairs
93
+ for (k, v) in chunk {
94
+ if i + 1 >= values.len() {
95
+ // Bulk insert current batch if array is full
96
+ hash.bulk_insert(&values[..i]).unwrap();
97
+ values[..i].fill(handle.qnil().as_value());
98
+ i = 0;
99
+ }
100
+ values[i] = handle.into_value(k);
101
+ let ary = handle.ary_new_capa(v.len());
102
+ v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
103
+ values[i + 1] = handle.into_value(ary);
104
+ i += 2;
105
+ }
106
+ // Insert any remaining pairs
107
+ if i > 0 {
108
+ hash.bulk_insert(&values[..i]).unwrap();
109
+ values[..i].fill(handle.qnil().as_value());
110
+ i = 0;
111
+ }
112
+ }
113
+
114
+ hash.into_value_with(handle)
115
+ }
116
+ }
117
+ }
118
+ }
119
+
120
+ #[derive(Debug, Clone)]
121
+ pub struct CowValue<'a>(pub Cow<'a, str>);
122
+
123
+ impl<'a> IntoValue for CowValue<'a> {
124
+ fn into_value_with(self, handle: &Ruby) -> Value {
125
+ self.0.into_value_with(handle)
126
+ }
127
+ }
128
+
129
/// Newtype over the parquet crate's `record::Field`, used to attach a
/// Ruby conversion (`IntoValue`) to a foreign type.
#[derive(Debug)]
pub struct ParquetField(pub Field);
131
+
132
+ impl IntoValue for ParquetField {
133
+ fn into_value_with(self, handle: &Ruby) -> Value {
134
+ match self.0 {
135
+ Field::Null => handle.qnil().as_value(),
136
+ Field::Bool(b) => b.into_value_with(handle),
137
+ Field::Short(s) => s.into_value_with(handle),
138
+ Field::Int(i) => i.into_value_with(handle),
139
+ Field::Long(l) => l.into_value_with(handle),
140
+ Field::UByte(ub) => ub.into_value_with(handle),
141
+ Field::UShort(us) => us.into_value_with(handle),
142
+ Field::UInt(ui) => ui.into_value_with(handle),
143
+ Field::ULong(ul) => ul.into_value_with(handle),
144
+ Field::Float16(f) => f32::from(f).into_value_with(handle),
145
+ Field::Float(f) => f.into_value_with(handle),
146
+ Field::Double(d) => d.into_value_with(handle),
147
+ Field::Str(s) => s.into_value_with(handle),
148
+ Field::Byte(b) => b.into_value_with(handle),
149
+ Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
150
+ Field::Date(d) => {
151
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
152
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
153
+ formatted.into_value_with(handle)
154
+ }
155
+ Field::TimestampMillis(ts) => {
156
+ let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
157
+ let time_class = handle.class_time();
158
+ time_class
159
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
160
+ .unwrap()
161
+ .into_value_with(handle)
162
+ }
163
+ Field::TimestampMicros(ts) => {
164
+ let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
165
+ let time_class = handle.class_time();
166
+ time_class
167
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
168
+ .unwrap()
169
+ .into_value_with(handle)
170
+ }
171
+ Field::ListInternal(list) => {
172
+ let elements = list.elements();
173
+ let ary = handle.ary_new_capa(elements.len());
174
+ elements
175
+ .iter()
176
+ .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
177
+ .unwrap();
178
+ ary.into_value_with(handle)
179
+ }
180
+ Field::MapInternal(map) => {
181
+ let entries = map.entries();
182
+ let hash = handle.hash_new_capa(entries.len());
183
+ entries
184
+ .iter()
185
+ .try_for_each(|(k, v)| {
186
+ hash.aset(
187
+ ParquetField(k.clone()).into_value_with(handle),
188
+ ParquetField(v.clone()).into_value_with(handle),
189
+ )
190
+ })
191
+ .unwrap();
192
+ hash.into_value_with(handle)
193
+ }
194
+ Field::Decimal(d) => {
195
+ let value = match d {
196
+ Decimal::Int32 { value, scale, .. } => {
197
+ let unscaled = i32::from_be_bytes(value);
198
+ format!("{}e-{}", unscaled, scale)
199
+ }
200
+ Decimal::Int64 { value, scale, .. } => {
201
+ let unscaled = i64::from_be_bytes(value);
202
+ format!("{}e-{}", unscaled, scale)
203
+ }
204
+ Decimal::Bytes { value, scale, .. } => {
205
+ // Convert bytes to string representation of unscaled value
206
+ let unscaled = String::from_utf8_lossy(value.data());
207
+ format!("{}e-{}", unscaled, scale)
208
+ }
209
+ };
210
+ handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
211
+ }
212
+ Field::Group(row) => {
213
+ let hash = handle.hash_new();
214
+ row.get_column_iter()
215
+ .try_for_each(|(k, v)| {
216
+ hash.aset(
217
+ k.clone().into_value_with(handle),
218
+ ParquetField(v.clone()).into_value_with(handle),
219
+ )
220
+ })
221
+ .unwrap();
222
+ hash.into_value_with(handle)
223
+ }
224
+ }
225
+ }
226
+ }
227
+
228
/// Owned, arrow-independent representation of a single parquet cell value,
/// used as the intermediate form between Arrow columns and Ruby objects.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub enum ParquetValue {
    // Signed integers
    Int8(i8),
    Int16(i16),
    Int32(i32),
    Int64(i64),
    // Unsigned integers
    UInt8(u8),
    UInt16(u16),
    UInt32(u32),
    UInt64(u64),
    Float16(f32), // f16 converted to f32
    Float32(f32),
    Float64(f64),
    Boolean(bool),
    String(String),
    Bytes(Vec<u8>),
    // Date32: days since the Unix epoch; Date64: milliseconds since epoch
    // (Arrow spec) — NOTE(review): confirm all call sites treat Date64 as ms.
    Date32(i32),
    Date64(i64),
    // Timestamps carry the raw tick count plus an optional timezone label.
    TimestampSecond(i64, Option<Arc<str>>),
    TimestampMillis(i64, Option<Arc<str>>),
    TimestampMicros(i64, Option<Arc<str>>),
    TimestampNanos(i64, Option<Arc<str>>),
    // Nested containers
    List(Vec<ParquetValue>),
    Map(HashMap<ParquetValue, ParquetValue>),
    Null,
}
255
+
256
+ impl PartialEq for ParquetValue {
257
+ fn eq(&self, other: &Self) -> bool {
258
+ match (self, other) {
259
+ (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
260
+ (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
261
+ (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
262
+ (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
263
+ (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
264
+ (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
265
+ (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
266
+ (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
267
+ (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
268
+ (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
269
+ (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
270
+ (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
271
+ (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
272
+ (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
273
+ (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
274
+ (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
275
+ (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
276
+ (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
277
+ (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
278
+ (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
279
+ (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
280
+ (ParquetValue::Null, ParquetValue::Null) => true,
281
+ _ => false,
282
+ }
283
+ }
284
+ }
285
+
286
+ impl Eq for ParquetValue {}
287
+
288
+ #[derive(Debug)]
289
+ pub struct ParquetValueVec(Vec<ParquetValue>);
290
+
291
+ impl ParquetValueVec {
292
+ pub fn into_inner(self) -> Vec<ParquetValue> {
293
+ self.0
294
+ }
295
+ }
296
+
297
+ impl IntoIterator for ParquetValueVec {
298
+ type Item = ParquetValue;
299
+ type IntoIter = std::vec::IntoIter<ParquetValue>;
300
+
301
+ fn into_iter(self) -> Self::IntoIter {
302
+ self.0.into_iter()
303
+ }
304
+ }
305
+
306
+ impl std::cmp::PartialEq for ParquetValueVec {
307
+ fn eq(&self, other: &Self) -> bool {
308
+ self.0 == other.0
309
+ }
310
+ }
311
+
312
+ impl std::cmp::Eq for ParquetValueVec {}
313
+
314
+ impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
315
+ type Error = String;
316
+
317
+ fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
318
+ ParquetValueVec::try_from(&*column)
319
+ }
320
+ }
321
+
322
// Add macro for handling numeric array conversions.
// Downcasts `$column` to the concrete `$array_type` and converts every slot
// into `ParquetValue::$variant`, emitting `ParquetValue::Null` for null slots.
// Expands to an expression that collects into the surrounding Vec.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        // NOTE(review): assumes Arrow's `Array::is_nullable` is true exactly
        // when the array actually contains nulls, so the else-branch can skip
        // per-element null checks — confirm against the arrow-array version.
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(*x)
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls, convert values directly.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(*x))
                .collect()
        }
    }};
}
348
+
349
// Add macro for handling boolean array conversions.
// Same shape as the numeric macro, but `BooleanArray::values()` yields `bool`
// by value (a bit iterator), so no dereference of `x` is needed.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        // Nullable path checks each slot; see note on the numeric macro.
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(x)
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(x))
                .collect()
        }
    }};
}
375
+
376
// Add macro for handling timestamp array conversions.
// Like the numeric macro, but each converted value also carries the column's
// timezone annotation `$tz` (an `Option<Arc<str>>`, so `.clone()` is a cheap
// refcount bump per element).
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let array = downcast_array::<$array_type>($column);
        // Nullable path checks each slot; see note on the numeric macro.
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(*x, $tz.clone())
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(*x, $tz.clone()))
                .collect()
        }
    }};
}
402
+
403
+ impl TryFrom<&dyn Array> for ParquetValueVec {
404
+ type Error = String;
405
+
406
+ fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
407
+ let tmp_vec = match column.data_type() {
408
+ DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
409
+ DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
410
+ DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
411
+ DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
412
+ DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
413
+ DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
414
+ DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
415
+ DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
416
+ DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
417
+ DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
418
+ DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
419
+ DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
420
+ DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
421
+ DataType::Timestamp(TimeUnit::Second, tz) => {
422
+ impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
423
+ }
424
+ DataType::Timestamp(TimeUnit::Millisecond, tz) => {
425
+ impl_timestamp_array_conversion!(
426
+ column,
427
+ TimestampMillisecondArray,
428
+ TimestampMillis,
429
+ tz
430
+ )
431
+ }
432
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => {
433
+ impl_timestamp_array_conversion!(
434
+ column,
435
+ TimestampMicrosecondArray,
436
+ TimestampMicros,
437
+ tz
438
+ )
439
+ }
440
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
441
+ impl_timestamp_array_conversion!(
442
+ column,
443
+ TimestampNanosecondArray,
444
+ TimestampNanos,
445
+ tz
446
+ )
447
+ }
448
+ // Because f16 is unstable in Rust, we convert it to f32
449
+ DataType::Float16 => {
450
+ let array = downcast_array::<Float16Array>(column);
451
+ if array.is_nullable() {
452
+ array
453
+ .values()
454
+ .iter()
455
+ .enumerate()
456
+ .map(|(i, x)| {
457
+ if array.is_null(i) {
458
+ ParquetValue::Null
459
+ } else {
460
+ ParquetValue::Float16(f32::from(*x))
461
+ }
462
+ })
463
+ .collect()
464
+ } else {
465
+ array
466
+ .values()
467
+ .iter()
468
+ .map(|x| ParquetValue::Float16(f32::from(*x)))
469
+ .collect()
470
+ }
471
+ }
472
+ DataType::Utf8 => {
473
+ let array = downcast_array::<StringArray>(column);
474
+ array
475
+ .iter()
476
+ .map(|opt_x| match opt_x {
477
+ Some(x) => ParquetValue::String(x.to_string()),
478
+ None => ParquetValue::Null,
479
+ })
480
+ .collect()
481
+ }
482
+ DataType::Binary => {
483
+ let array = downcast_array::<BinaryArray>(column);
484
+ array
485
+ .iter()
486
+ .map(|opt_x| match opt_x {
487
+ Some(x) => ParquetValue::Bytes(x.to_vec()),
488
+ None => ParquetValue::Null,
489
+ })
490
+ .collect()
491
+ }
492
+ DataType::List(_field) => {
493
+ let list_array = downcast_array::<ListArray>(column);
494
+ list_array
495
+ .iter()
496
+ .map(|x| match x {
497
+ Some(values) => match ParquetValueVec::try_from(values) {
498
+ Ok(vec) => ParquetValue::List(vec.into_inner()),
499
+ Err(e) => {
500
+ panic!("Error converting list array to ParquetValueVec: {}", e)
501
+ }
502
+ },
503
+ None => ParquetValue::Null,
504
+ })
505
+ .collect()
506
+ }
507
+ DataType::Struct(_) => {
508
+ let struct_array = downcast_array::<StructArray>(column);
509
+ let mut values = Vec::with_capacity(struct_array.len());
510
+ for i in 0..struct_array.len() {
511
+ if struct_array.is_null(i) {
512
+ values.push(ParquetValue::Null);
513
+ continue;
514
+ }
515
+
516
+ let mut map = std::collections::HashMap::new();
517
+ for (field_idx, field) in struct_array.fields().iter().enumerate() {
518
+ let column = struct_array.column(field_idx);
519
+ let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
520
+ Ok(vec) => vec.into_inner(),
521
+ Err(e) => {
522
+ panic!("Error converting struct field to ParquetValueVec: {}", e)
523
+ }
524
+ };
525
+ map.insert(
526
+ ParquetValue::String(field.name().to_string()),
527
+ field_values.into_iter().next().unwrap(),
528
+ );
529
+ }
530
+ values.push(ParquetValue::Map(map));
531
+ }
532
+ values
533
+ }
534
+ DataType::Null => {
535
+ let x = downcast_array::<NullArray>(column);
536
+ vec![ParquetValue::Null; x.len()]
537
+ }
538
+ _ => {
539
+ return Err(format!("Unsupported data type: {:?}", column.data_type()));
540
+ }
541
+ };
542
+ Ok(ParquetValueVec(tmp_vec))
543
+ }
544
+ }
545
+
546
+ impl std::hash::Hash for ParquetValue {
547
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
548
+ match self {
549
+ ParquetValue::Int8(i) => i.hash(state),
550
+ ParquetValue::Int16(i) => i.hash(state),
551
+ ParquetValue::Int32(i) => i.hash(state),
552
+ ParquetValue::Int64(i) => i.hash(state),
553
+ ParquetValue::UInt8(i) => i.hash(state),
554
+ ParquetValue::UInt16(i) => i.hash(state),
555
+ ParquetValue::UInt32(i) => i.hash(state),
556
+ ParquetValue::UInt64(i) => i.hash(state),
557
+ ParquetValue::Float16(f) => f.to_bits().hash(state),
558
+ ParquetValue::Float32(f) => f.to_bits().hash(state),
559
+ ParquetValue::Float64(f) => f.to_bits().hash(state),
560
+ ParquetValue::Boolean(b) => b.hash(state),
561
+ ParquetValue::String(s) => s.hash(state),
562
+ ParquetValue::Bytes(b) => b.hash(state),
563
+ ParquetValue::Date32(d) => d.hash(state),
564
+ ParquetValue::Date64(d) => d.hash(state),
565
+ ParquetValue::TimestampSecond(ts, tz) => {
566
+ ts.hash(state);
567
+ tz.hash(state);
568
+ }
569
+ ParquetValue::TimestampMillis(ts, tz) => {
570
+ ts.hash(state);
571
+ tz.hash(state);
572
+ }
573
+ ParquetValue::TimestampMicros(ts, tz) => {
574
+ ts.hash(state);
575
+ tz.hash(state);
576
+ }
577
+ ParquetValue::TimestampNanos(ts, tz) => {
578
+ ts.hash(state);
579
+ tz.hash(state);
580
+ }
581
+ ParquetValue::List(l) => l.hash(state),
582
+ ParquetValue::Map(_m) => panic!("Map is not hashable"),
583
+ ParquetValue::Null => 0_i32.hash(state),
584
+ }
585
+ }
586
+ }
587
+
588
+ impl IntoValue for ParquetValue {
589
+ fn into_value_with(self, handle: &Ruby) -> Value {
590
+ match self {
591
+ ParquetValue::Int8(i) => i.into_value_with(handle),
592
+ ParquetValue::Int16(i) => i.into_value_with(handle),
593
+ ParquetValue::Int32(i) => i.into_value_with(handle),
594
+ ParquetValue::Int64(i) => i.into_value_with(handle),
595
+ ParquetValue::UInt8(i) => i.into_value_with(handle),
596
+ ParquetValue::UInt16(i) => i.into_value_with(handle),
597
+ ParquetValue::UInt32(i) => i.into_value_with(handle),
598
+ ParquetValue::UInt64(i) => i.into_value_with(handle),
599
+ ParquetValue::Float16(f) => f.into_value_with(handle),
600
+ ParquetValue::Float32(f) => f.into_value_with(handle),
601
+ ParquetValue::Float64(f) => f.into_value_with(handle),
602
+ ParquetValue::Boolean(b) => b.into_value_with(handle),
603
+ ParquetValue::String(s) => s.into_value_with(handle),
604
+ ParquetValue::Bytes(b) => b.into_value_with(handle),
605
+ ParquetValue::Date32(d) => {
606
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
607
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
608
+ formatted.into_value_with(handle)
609
+ }
610
+ ParquetValue::Date64(d) => {
611
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
612
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
613
+ formatted.into_value_with(handle)
614
+ }
615
+ ParquetValue::TimestampSecond(ts, tz) => {
616
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
617
+ let time_class = handle.class_time();
618
+ time_class
619
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
620
+ .unwrap()
621
+ .into_value_with(handle)
622
+ }
623
+ ParquetValue::TimestampMillis(ts, tz) => {
624
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
625
+ let time_class = handle.class_time();
626
+ time_class
627
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
628
+ .unwrap()
629
+ .into_value_with(handle)
630
+ }
631
+ ParquetValue::TimestampMicros(ts, tz) => {
632
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
633
+ let time_class = handle.class_time();
634
+ time_class
635
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
636
+ .unwrap()
637
+ .into_value_with(handle)
638
+ }
639
+ ParquetValue::TimestampNanos(ts, tz) => {
640
+ let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
641
+ let time_class = handle.class_time();
642
+ time_class
643
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
644
+ .unwrap()
645
+ .into_value_with(handle)
646
+ }
647
+ ParquetValue::List(l) => {
648
+ let ary = handle.ary_new_capa(l.len());
649
+ l.into_iter()
650
+ .try_for_each(|v| ary.push(v.into_value_with(handle)))
651
+ .unwrap();
652
+ ary.into_value_with(handle)
653
+ }
654
+ ParquetValue::Map(m) => {
655
+ let hash = handle.hash_new_capa(m.len());
656
+ m.into_iter()
657
+ .try_for_each(|(k, v)| {
658
+ hash.aset(k.into_value_with(handle), v.into_value_with(handle))
659
+ })
660
+ .unwrap();
661
+ hash.into_value_with(handle)
662
+ }
663
+ ParquetValue::Null => handle.qnil().as_value(),
664
+ }
665
+ }
666
+ }
667
+
668
+ fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
669
+ let (ts, tz) = match value {
670
+ ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
671
+ ParquetValue::TimestampMillis(ts, tz) => {
672
+ (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
673
+ }
674
+ ParquetValue::TimestampMicros(ts, tz) => {
675
+ (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
676
+ }
677
+ ParquetValue::TimestampNanos(ts, tz) => {
678
+ (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
679
+ }
680
+ _ => panic!("Invalid timestamp value"),
681
+ };
682
+
683
+ // If timezone is provided, convert to zoned timestamp
684
+ if let Some(tz) = tz {
685
+ // Handle fixed offset timezones like "+09:00" first
686
+ if tz.starts_with('+') || tz.starts_with('-') {
687
+ // Parse the offset string into hours and minutes
688
+ let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
689
+ // Format: "+09:00" or "-09:00"
690
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
691
+ let m = tz[4..6].parse::<i32>().unwrap_or(0);
692
+ (h, m)
693
+ } else if tz.len() >= 3 {
694
+ // Format: "+09" or "-09"
695
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
696
+ (h, 0)
697
+ } else {
698
+ (0, 0)
699
+ };
700
+
701
+ // Apply sign
702
+ let total_minutes = if tz.starts_with('-') {
703
+ -(hours * 60 + minutes)
704
+ } else {
705
+ hours * 60 + minutes
706
+ };
707
+
708
+ // Create fixed timezone
709
+ let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
710
+ ts.to_zoned(tz).timestamp()
711
+ } else {
712
+ // Try IANA timezone
713
+ match ts.intz(&tz) {
714
+ Ok(zoned) => zoned.timestamp(),
715
+ Err(_) => ts, // Fall back to UTC if timezone is invalid
716
+ }
717
+ }
718
+ } else {
719
+ // No timezone provided - treat as UTC
720
+ ts
721
+ }
722
+ }