parquet 0.0.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,763 +0,0 @@
1
- use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
2
-
3
- use arrow_array::cast::downcast_array;
4
- use arrow_array::{
5
- Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
6
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
7
- StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
8
- TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
9
- };
10
- use arrow_schema::{DataType, TimeUnit};
11
- use itertools::Itertools;
12
- use magnus::{value::ReprValue, IntoValue, Ruby, Value};
13
- use parquet::data_type::Decimal;
14
- use parquet::record::Field;
15
-
16
- use crate::header_cache::StringCacheKey;
17
-
18
/// How parsed records are surfaced to Ruby: as hashes keyed by column
/// name, or as positional arrays.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}

impl ParserResultType {
    /// Iterates over every supported result type, in declaration order.
    pub fn iter() -> impl Iterator<Item = Self> {
        [Self::Hash, Self::Array].into_iter()
    }
}

impl TryFrom<&str> for ParserResultType {
    type Error = String;

    /// Parses the lowercase option strings accepted from Ruby.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        if value == "hash" {
            Ok(Self::Hash)
        } else if value == "array" {
            Ok(Self::Array)
        } else {
            Err(format!("Invalid parser result type: {}", value))
        }
    }
}

impl TryFrom<String> for ParserResultType {
    type Error = String;

    /// Owned-string convenience; defers to the `&str` implementation.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        Self::try_from(value.as_str())
    }
}

impl std::fmt::Display for ParserResultType {
    /// Renders the same lowercase names accepted by `TryFrom`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Hash => "hash",
            Self::Array => "array",
        };
        write!(f, "{}", name)
    }
}
58
-
59
/// A single parsed row in the caller-requested shape: a positional
/// vector of fields, or a map keyed by a cached column-name key.
/// `S` is the hasher used by the map variant.
#[derive(Debug)]
pub enum RowRecord<S: BuildHasher + Default> {
    Vec(Vec<ParquetField>),
    Map(HashMap<StringCacheKey, ParquetField, S>),
}
64
-
65
/// A batch of parsed columns: either positional (one inner Vec per
/// column) or keyed by cached column name. `S` is the map's hasher.
#[derive(Debug)]
pub enum ColumnRecord<S: BuildHasher + Default> {
    Vec(Vec<Vec<ParquetValue>>),
    Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
}
70
-
71
impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
    /// Converts a parsed row into a Ruby Array (Vec variant) or Hash
    /// (Map variant).
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            RowRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                handle.into_value(ary)
            }
            RowRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                // Fixed scratch buffer handed to bulk_insert in batches:
                // 128 slots hold at most 64 (key, value) pairs. Slots are
                // re-filled with nil after each flush so no stale Ruby
                // references linger in the buffer.
                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        // Pairs are laid out flat: [k0, v0, k1, v1, ...],
                        // the layout bulk_insert expects.
                        values[i] = handle.into_value(k);
                        values[i + 1] = handle.into_value(v);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}
111
-
112
impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
    /// Converts a batch of columns into a Ruby Array-of-Arrays (Vec
    /// variant) or a Hash of column-name => Array (Map variant).
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            ColumnRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter()
                    .try_for_each(|v| {
                        // One nested Ruby Array per column.
                        let nested_ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
                        ary.push(nested_ary.into_value_with(handle))
                    })
                    .unwrap();
                ary.into_value_with(handle)
            }
            ColumnRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                // Fixed scratch buffer for batched bulk_insert: 128 slots,
                // filled two at a time as flat (key, value) pairs and
                // reset to nil after each flush (see RowRecord above).
                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        values[i] = handle.into_value(k);
                        // The hash value is the column's Ruby Array.
                        let ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                        values[i + 1] = handle.into_value(ary);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}
160
-
161
/// Thin wrapper so a `Cow<str>` can implement the (foreign) magnus
/// `IntoValue` trait and convert into a Ruby String.
#[derive(Debug, Clone)]
pub struct CowValue<'a>(pub Cow<'a, str>);

impl<'a> IntoValue for CowValue<'a> {
    fn into_value_with(self, handle: &Ruby) -> Value {
        self.0.into_value_with(handle)
    }
}
169
-
170
/// Newtype over parquet's row-API `Field`, needed so the foreign
/// magnus `IntoValue` trait can be implemented for it (orphan rule).
#[derive(Debug)]
pub struct ParquetField(pub Field);
172
-
173
- impl IntoValue for ParquetField {
174
- fn into_value_with(self, handle: &Ruby) -> Value {
175
- match self.0 {
176
- Field::Null => handle.qnil().as_value(),
177
- Field::Bool(b) => b.into_value_with(handle),
178
- Field::Short(s) => s.into_value_with(handle),
179
- Field::Int(i) => i.into_value_with(handle),
180
- Field::Long(l) => l.into_value_with(handle),
181
- Field::UByte(ub) => ub.into_value_with(handle),
182
- Field::UShort(us) => us.into_value_with(handle),
183
- Field::UInt(ui) => ui.into_value_with(handle),
184
- Field::ULong(ul) => ul.into_value_with(handle),
185
- Field::Float16(f) => f32::from(f).into_value_with(handle),
186
- Field::Float(f) => f.into_value_with(handle),
187
- Field::Double(d) => d.into_value_with(handle),
188
- Field::Str(s) => s.into_value_with(handle),
189
- Field::Byte(b) => b.into_value_with(handle),
190
- Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
191
- Field::Date(d) => {
192
- let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
193
- let formatted = ts.strftime("%Y-%m-%d").to_string();
194
- formatted.into_value_with(handle)
195
- }
196
- Field::TimestampMillis(ts) => {
197
- let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
198
- let time_class = handle.class_time();
199
- time_class
200
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
201
- .unwrap()
202
- .into_value_with(handle)
203
- }
204
- Field::TimestampMicros(ts) => {
205
- let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
206
- let time_class = handle.class_time();
207
- time_class
208
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
209
- .unwrap()
210
- .into_value_with(handle)
211
- }
212
- Field::ListInternal(list) => {
213
- let elements = list.elements();
214
- let ary = handle.ary_new_capa(elements.len());
215
- elements
216
- .iter()
217
- .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
218
- .unwrap();
219
- ary.into_value_with(handle)
220
- }
221
- Field::MapInternal(map) => {
222
- let entries = map.entries();
223
- let hash = handle.hash_new_capa(entries.len());
224
- entries
225
- .iter()
226
- .try_for_each(|(k, v)| {
227
- hash.aset(
228
- ParquetField(k.clone()).into_value_with(handle),
229
- ParquetField(v.clone()).into_value_with(handle),
230
- )
231
- })
232
- .unwrap();
233
- hash.into_value_with(handle)
234
- }
235
- Field::Decimal(d) => {
236
- let value = match d {
237
- Decimal::Int32 { value, scale, .. } => {
238
- let unscaled = i32::from_be_bytes(value);
239
- format!("{}e-{}", unscaled, scale)
240
- }
241
- Decimal::Int64 { value, scale, .. } => {
242
- let unscaled = i64::from_be_bytes(value);
243
- format!("{}e-{}", unscaled, scale)
244
- }
245
- Decimal::Bytes { value, scale, .. } => {
246
- // Convert bytes to string representation of unscaled value
247
- let unscaled = String::from_utf8_lossy(value.data());
248
- format!("{}e-{}", unscaled, scale)
249
- }
250
- };
251
- handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
252
- }
253
- Field::Group(row) => {
254
- let hash = handle.hash_new();
255
- row.get_column_iter()
256
- .try_for_each(|(k, v)| {
257
- hash.aset(
258
- k.clone().into_value_with(handle),
259
- ParquetField(v.clone()).into_value_with(handle),
260
- )
261
- })
262
- .unwrap();
263
- hash.into_value_with(handle)
264
- }
265
- }
266
- }
267
- }
268
-
269
/// Owned, Ruby-independent representation of a single decoded value
/// (column-oriented reader path).
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub enum ParquetValue {
    Int8(i8),
    Int16(i16),
    Int32(i32),
    Int64(i64),
    UInt8(u8),
    UInt16(u16),
    UInt32(u32),
    UInt64(u64),
    Float16(f32), // f16 converted to f32
    Float32(f32),
    Float64(f64),
    Boolean(bool),
    String(String),
    Bytes(Vec<u8>),
    // Date32: days since the Unix epoch; Date64: NOTE(review) Arrow
    // defines Date64 as milliseconds since the epoch — verify consumers.
    Date32(i32),
    Date64(i64),
    // Timestamps carry the raw tick count in the named unit plus the
    // optional timezone string taken from the Arrow field metadata
    // (e.g. "UTC", "+09:00").
    TimestampSecond(i64, Option<Arc<str>>),
    TimestampMillis(i64, Option<Arc<str>>),
    TimestampMicros(i64, Option<Arc<str>>),
    TimestampNanos(i64, Option<Arc<str>>),
    List(Vec<ParquetValue>),
    // Also used for struct rows, keyed by ParquetValue::String field names.
    Map(HashMap<ParquetValue, ParquetValue>),
    Null,
}
296
-
297
- impl PartialEq for ParquetValue {
298
- fn eq(&self, other: &Self) -> bool {
299
- match (self, other) {
300
- (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
301
- (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
302
- (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
303
- (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
304
- (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
305
- (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
306
- (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
307
- (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
308
- (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
309
- (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
310
- (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
311
- (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
312
- (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
313
- (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
314
- (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
315
- (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
316
- (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
317
- (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
318
- (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
319
- (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
320
- (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
321
- (ParquetValue::Null, ParquetValue::Null) => true,
322
- _ => false,
323
- }
324
- }
325
- }
326
-
327
- impl Eq for ParquetValue {}
328
-
329
- #[derive(Debug)]
330
- pub struct ParquetValueVec(Vec<ParquetValue>);
331
-
332
- impl ParquetValueVec {
333
- pub fn into_inner(self) -> Vec<ParquetValue> {
334
- self.0
335
- }
336
- }
337
-
338
- impl IntoIterator for ParquetValueVec {
339
- type Item = ParquetValue;
340
- type IntoIter = std::vec::IntoIter<ParquetValue>;
341
-
342
- fn into_iter(self) -> Self::IntoIter {
343
- self.0.into_iter()
344
- }
345
- }
346
-
347
- impl std::cmp::PartialEq for ParquetValueVec {
348
- fn eq(&self, other: &Self) -> bool {
349
- self.0 == other.0
350
- }
351
- }
352
-
353
- impl std::cmp::Eq for ParquetValueVec {}
354
-
355
- impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
356
- type Error = String;
357
-
358
- fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
359
- ParquetValueVec::try_from(&*column)
360
- }
361
- }
362
-
363
// Add macro for handling numeric array conversions.
//
// Expands to an expression that downcasts `$column` to `$array_type` and
// collects each element into `ParquetValue::$variant`, emitting
// `ParquetValue::Null` for null slots. `values()` yields a slot for every
// row (nulls included), so the validity bitmap is checked per index.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(*x)
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls, copy the values straight through.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(*x))
                .collect()
        }
    }};
}
389
-
390
// Add macro for handling boolean array conversions.
//
// Identical in shape to impl_numeric_array_conversion, except the boolean
// bitmap iterator yields `bool` by value, so no dereference is needed.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let array = downcast_array::<$array_type>($column);
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(x)
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(x))
                .collect()
        }
    }};
}
416
-
417
// Add macro for handling timestamp array conversions.
//
// Like impl_numeric_array_conversion, but each produced variant also
// carries a clone of the array's timezone (`$tz`, an Option<Arc<str>>
// from the Arrow data type — Arc makes the clone cheap).
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let array = downcast_array::<$array_type>($column);
        if array.is_nullable() {
            array
                .values()
                .iter()
                .enumerate()
                .map(|(i, x)| {
                    if array.is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::$variant(*x, $tz.clone())
                    }
                })
                .collect()
        } else {
            // Fast path: no nulls.
            array
                .values()
                .iter()
                .map(|x| ParquetValue::$variant(*x, $tz.clone()))
                .collect()
        }
    }};
}
443
-
444
impl TryFrom<&dyn Array> for ParquetValueVec {
    type Error = String;

    /// Decodes a whole Arrow array into owned `ParquetValue`s, mapping
    /// null slots to `ParquetValue::Null`.
    ///
    /// Returns `Err` only for Arrow data types with no arm below; nested
    /// conversion failures inside List/Struct currently panic instead of
    /// propagating (see NOTE comments).
    fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
        let tmp_vec = match column.data_type() {
            DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
            DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
            DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
            DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
            DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
            DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
            DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
            DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
            DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
            DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
            DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
            DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
            DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
            // The field's timezone (if any) is cloned into every element.
            DataType::Timestamp(TimeUnit::Second, tz) => {
                impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
            }
            DataType::Timestamp(TimeUnit::Millisecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampMillisecondArray,
                    TimestampMillis,
                    tz
                )
            }
            DataType::Timestamp(TimeUnit::Microsecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampMicrosecondArray,
                    TimestampMicros,
                    tz
                )
            }
            DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampNanosecondArray,
                    TimestampNanos,
                    tz
                )
            }
            // Because f16 is unstable in Rust, we convert it to f32
            DataType::Float16 => {
                let array = downcast_array::<Float16Array>(column);
                if array.is_nullable() {
                    array
                        .values()
                        .iter()
                        .enumerate()
                        .map(|(i, x)| {
                            if array.is_null(i) {
                                ParquetValue::Null
                            } else {
                                ParquetValue::Float16(f32::from(*x))
                            }
                        })
                        .collect()
                } else {
                    array
                        .values()
                        .iter()
                        .map(|x| ParquetValue::Float16(f32::from(*x)))
                        .collect()
                }
            }
            DataType::Utf8 => {
                let array = downcast_array::<StringArray>(column);
                array
                    .iter()
                    .map(|opt_x| match opt_x {
                        Some(x) => ParquetValue::String(x.to_string()),
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::Binary => {
                let array = downcast_array::<BinaryArray>(column);
                array
                    .iter()
                    .map(|opt_x| match opt_x {
                        Some(x) => ParquetValue::Bytes(x.to_vec()),
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::List(_field) => {
                // Recursively convert each child array into a List value.
                // NOTE(review): an unsupported nested element type panics
                // here rather than propagating Err to the caller.
                let list_array = downcast_array::<ListArray>(column);
                list_array
                    .iter()
                    .map(|x| match x {
                        Some(values) => match ParquetValueVec::try_from(values) {
                            Ok(vec) => ParquetValue::List(vec.into_inner()),
                            Err(e) => {
                                panic!("Error converting list array to ParquetValueVec: {}", e)
                            }
                        },
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::Struct(_) => {
                // Structs become Maps keyed by field name, built row by row
                // from one-element slices of each child column.
                let struct_array = downcast_array::<StructArray>(column);
                let mut values = Vec::with_capacity(struct_array.len());
                for i in 0..struct_array.len() {
                    if struct_array.is_null(i) {
                        values.push(ParquetValue::Null);
                        continue;
                    }

                    let mut map = std::collections::HashMap::new();
                    for (field_idx, field) in struct_array.fields().iter().enumerate() {
                        let column = struct_array.column(field_idx);
                        // slice(i, 1) yields a one-row view of the child
                        // column; its single converted value is this row's
                        // entry for the field.
                        let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
                            Ok(vec) => vec.into_inner(),
                            Err(e) => {
                                panic!("Error converting struct field to ParquetValueVec: {}", e)
                            }
                        };
                        map.insert(
                            ParquetValue::String(field.name().to_string()),
                            field_values.into_iter().next().unwrap(),
                        );
                    }
                    values.push(ParquetValue::Map(map));
                }
                values
            }
            DataType::Null => {
                let x = downcast_array::<NullArray>(column);
                vec![ParquetValue::Null; x.len()]
            }
            _ => {
                return Err(format!("Unsupported data type: {:?}", column.data_type()));
            }
        };
        Ok(ParquetValueVec(tmp_vec))
    }
}
586
-
587
impl std::hash::Hash for ParquetValue {
    /// Hashes a value so it can serve as a HashMap key (struct columns are
    /// surfaced as Maps keyed by `ParquetValue::String`).
    ///
    /// Floats hash their raw bit pattern. NOTE(review): this is not fully
    /// consistent with `PartialEq` above, which compares floats
    /// numerically (-0.0 == 0.0 yet hashes differ) and ignores timestamp
    /// timezones while `hash` includes them. Safe as long as map keys are
    /// restricted to strings/integers — confirm against callers.
    ///
    /// # Panics
    /// Panics on `Map`, which is deliberately unhashable.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        match self {
            ParquetValue::Int8(i) => i.hash(state),
            ParquetValue::Int16(i) => i.hash(state),
            ParquetValue::Int32(i) => i.hash(state),
            ParquetValue::Int64(i) => i.hash(state),
            ParquetValue::UInt8(i) => i.hash(state),
            ParquetValue::UInt16(i) => i.hash(state),
            ParquetValue::UInt32(i) => i.hash(state),
            ParquetValue::UInt64(i) => i.hash(state),
            // f32/f64 don't implement Hash; hash the IEEE-754 bits instead.
            ParquetValue::Float16(f) => f.to_bits().hash(state),
            ParquetValue::Float32(f) => f.to_bits().hash(state),
            ParquetValue::Float64(f) => f.to_bits().hash(state),
            ParquetValue::Boolean(b) => b.hash(state),
            ParquetValue::String(s) => s.hash(state),
            ParquetValue::Bytes(b) => b.hash(state),
            ParquetValue::Date32(d) => d.hash(state),
            ParquetValue::Date64(d) => d.hash(state),
            ParquetValue::TimestampSecond(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampMillis(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampMicros(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampNanos(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::List(l) => l.hash(state),
            ParquetValue::Map(_m) => panic!("Map is not hashable"),
            ParquetValue::Null => 0_i32.hash(state),
        }
    }
}
628
-
629
- impl IntoValue for ParquetValue {
630
- fn into_value_with(self, handle: &Ruby) -> Value {
631
- match self {
632
- ParquetValue::Int8(i) => i.into_value_with(handle),
633
- ParquetValue::Int16(i) => i.into_value_with(handle),
634
- ParquetValue::Int32(i) => i.into_value_with(handle),
635
- ParquetValue::Int64(i) => i.into_value_with(handle),
636
- ParquetValue::UInt8(i) => i.into_value_with(handle),
637
- ParquetValue::UInt16(i) => i.into_value_with(handle),
638
- ParquetValue::UInt32(i) => i.into_value_with(handle),
639
- ParquetValue::UInt64(i) => i.into_value_with(handle),
640
- ParquetValue::Float16(f) => f.into_value_with(handle),
641
- ParquetValue::Float32(f) => f.into_value_with(handle),
642
- ParquetValue::Float64(f) => f.into_value_with(handle),
643
- ParquetValue::Boolean(b) => b.into_value_with(handle),
644
- ParquetValue::String(s) => s.into_value_with(handle),
645
- ParquetValue::Bytes(b) => b.into_value_with(handle),
646
- ParquetValue::Date32(d) => {
647
- let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
648
- let formatted = ts.strftime("%Y-%m-%d").to_string();
649
- formatted.into_value_with(handle)
650
- }
651
- ParquetValue::Date64(d) => {
652
- let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
653
- let formatted = ts.strftime("%Y-%m-%d").to_string();
654
- formatted.into_value_with(handle)
655
- }
656
- ParquetValue::TimestampSecond(ts, tz) => {
657
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
658
- let time_class = handle.class_time();
659
- time_class
660
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
661
- .unwrap()
662
- .into_value_with(handle)
663
- }
664
- ParquetValue::TimestampMillis(ts, tz) => {
665
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
666
- let time_class = handle.class_time();
667
- time_class
668
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
669
- .unwrap()
670
- .into_value_with(handle)
671
- }
672
- ParquetValue::TimestampMicros(ts, tz) => {
673
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
674
- let time_class = handle.class_time();
675
- time_class
676
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
677
- .unwrap()
678
- .into_value_with(handle)
679
- }
680
- ParquetValue::TimestampNanos(ts, tz) => {
681
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
682
- let time_class = handle.class_time();
683
- time_class
684
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
685
- .unwrap()
686
- .into_value_with(handle)
687
- }
688
- ParquetValue::List(l) => {
689
- let ary = handle.ary_new_capa(l.len());
690
- l.into_iter()
691
- .try_for_each(|v| ary.push(v.into_value_with(handle)))
692
- .unwrap();
693
- ary.into_value_with(handle)
694
- }
695
- ParquetValue::Map(m) => {
696
- let hash = handle.hash_new_capa(m.len());
697
- m.into_iter()
698
- .try_for_each(|(k, v)| {
699
- hash.aset(k.into_value_with(handle), v.into_value_with(handle))
700
- })
701
- .unwrap();
702
- hash.into_value_with(handle)
703
- }
704
- ParquetValue::Null => handle.qnil().as_value(),
705
- }
706
- }
707
- }
708
-
709
- fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
710
- let (ts, tz) = match value {
711
- ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
712
- ParquetValue::TimestampMillis(ts, tz) => {
713
- (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
714
- }
715
- ParquetValue::TimestampMicros(ts, tz) => {
716
- (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
717
- }
718
- ParquetValue::TimestampNanos(ts, tz) => {
719
- (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
720
- }
721
- _ => panic!("Invalid timestamp value"),
722
- };
723
-
724
- // If timezone is provided, convert to zoned timestamp
725
- if let Some(tz) = tz {
726
- // Handle fixed offset timezones like "+09:00" first
727
- if tz.starts_with('+') || tz.starts_with('-') {
728
- // Parse the offset string into hours and minutes
729
- let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
730
- // Format: "+09:00" or "-09:00"
731
- let h = tz[1..3].parse::<i32>().unwrap_or(0);
732
- let m = tz[4..6].parse::<i32>().unwrap_or(0);
733
- (h, m)
734
- } else if tz.len() >= 3 {
735
- // Format: "+09" or "-09"
736
- let h = tz[1..3].parse::<i32>().unwrap_or(0);
737
- (h, 0)
738
- } else {
739
- (0, 0)
740
- };
741
-
742
- // Apply sign
743
- let total_minutes = if tz.starts_with('-') {
744
- -(hours * 60 + minutes)
745
- } else {
746
- hours * 60 + minutes
747
- };
748
-
749
- // Create fixed timezone
750
- let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
751
- ts.to_zoned(tz).timestamp()
752
- } else {
753
- // Try IANA timezone
754
- match ts.intz(&tz) {
755
- Ok(zoned) => zoned.timestamp(),
756
- Err(_) => ts, // Fall back to UTC if timezone is invalid
757
- }
758
- }
759
- } else {
760
- // No timezone provided - treat as UTC
761
- ts
762
- }
763
- }