parquet 0.0.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,763 +0,0 @@
1
- use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
2
-
3
- use arrow_array::cast::downcast_array;
4
- use arrow_array::{
5
- Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
6
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
7
- StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
8
- TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
9
- };
10
- use arrow_schema::{DataType, TimeUnit};
11
- use itertools::Itertools;
12
- use magnus::{value::ReprValue, IntoValue, Ruby, Value};
13
- use parquet::data_type::Decimal;
14
- use parquet::record::Field;
15
-
16
- use crate::header_cache::StringCacheKey;
17
-
18
/// Output shape requested by the caller for parsed records:
/// `Hash` yields Ruby hashes keyed by column name, `Array` yields
/// positional Ruby arrays.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}
23
-
24
- impl ParserResultType {
25
- pub fn iter() -> impl Iterator<Item = Self> {
26
- [Self::Hash, Self::Array].into_iter()
27
- }
28
- }
29
-
30
- impl TryFrom<&str> for ParserResultType {
31
- type Error = String;
32
-
33
- fn try_from(value: &str) -> Result<Self, Self::Error> {
34
- match value {
35
- "hash" => Ok(ParserResultType::Hash),
36
- "array" => Ok(ParserResultType::Array),
37
- _ => Err(format!("Invalid parser result type: {}", value)),
38
- }
39
- }
40
- }
41
-
42
- impl TryFrom<String> for ParserResultType {
43
- type Error = String;
44
-
45
- fn try_from(value: String) -> Result<Self, Self::Error> {
46
- Self::try_from(value.as_str())
47
- }
48
- }
49
-
50
- impl std::fmt::Display for ParserResultType {
51
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52
- match self {
53
- ParserResultType::Hash => write!(f, "hash"),
54
- ParserResultType::Array => write!(f, "array"),
55
- }
56
- }
57
- }
58
-
59
/// One parsed row, in the shape selected by `ParserResultType`:
/// a positional list of fields, or a map keyed by interned column name.
/// `S` lets the caller choose the hasher for the map form.
#[derive(Debug)]
pub enum RowRecord<S: BuildHasher + Default> {
    Vec(Vec<ParquetField>),
    Map(HashMap<StringCacheKey, ParquetField, S>),
}
64
-
65
/// A batch of parsed columns, in the shape selected by `ParserResultType`:
/// a positional list of column vectors, or a map from interned column name
/// to that column's values. `S` lets the caller choose the map hasher.
#[derive(Debug)]
pub enum ColumnRecord<S: BuildHasher + Default> {
    Vec(Vec<Vec<ParquetValue>>),
    Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
}
70
-
71
impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
    /// Converts a parsed row into a Ruby Array (for `Vec`) or Hash (for
    /// `Map`).
    ///
    /// The map path batches key/value pairs through a fixed 128-slot
    /// scratch buffer and inserts them with `bulk_insert`, which avoids one
    /// Ruby call per pair.
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            RowRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                handle.into_value(ary)
            }
            RowRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                // Scratch buffer of interleaved [key, value, key, value, ...]
                // slots; `i` always points one past the last written slot.
                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                // 64 pairs == 128 slots, so one chunk exactly fills the
                // buffer; the in-loop check below is a second safety net.
                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            // Clear used slots so no stale Ruby references
                            // linger in the buffer.
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        values[i] = handle.into_value(k);
                        values[i + 1] = handle.into_value(v);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}
111
-
112
impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
    /// Converts a batch of columns into a Ruby Array-of-Arrays (for `Vec`)
    /// or a Hash mapping column name to a Ruby Array of values (for `Map`).
    ///
    /// Uses the same 128-slot `bulk_insert` batching strategy as the
    /// `RowRecord` conversion; here each value slot holds a freshly built
    /// Ruby Array for one column.
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            ColumnRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter()
                    .try_for_each(|v| {
                        let nested_ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
                        ary.push(nested_ary.into_value_with(handle))
                    })
                    .unwrap();
                ary.into_value_with(handle)
            }
            ColumnRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                // Interleaved [key, value, ...] scratch buffer; `i` points
                // one past the last written slot.
                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        values[i] = handle.into_value(k);
                        // Materialize the whole column as one Ruby Array.
                        let ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                        values[i + 1] = handle.into_value(ary);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}
160
-
161
/// Thin wrapper over a possibly-borrowed string so it can be handed to Ruby
/// without forcing an owned copy when one isn't needed.
#[derive(Debug, Clone)]
pub struct CowValue<'a>(pub Cow<'a, str>);

impl<'a> IntoValue for CowValue<'a> {
    /// Converts the wrapped string into a Ruby String.
    fn into_value_with(self, handle: &Ruby) -> Value {
        self.0.into_value_with(handle)
    }
}
169
-
170
/// Newtype over the parquet crate's row-API `Field`, existing solely to
/// attach an `IntoValue` conversion to Ruby.
#[derive(Debug)]
pub struct ParquetField(pub Field);

impl IntoValue for ParquetField {
    /// Converts a parquet `Field` into the corresponding Ruby value:
    /// primitives map directly, dates/timestamps go through `jiff` and
    /// Ruby's `Time.parse`, lists/maps/groups convert recursively, and
    /// decimals become `BigDecimal`.
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self.0 {
            Field::Null => handle.qnil().as_value(),
            Field::Bool(b) => b.into_value_with(handle),
            Field::Short(s) => s.into_value_with(handle),
            Field::Int(i) => i.into_value_with(handle),
            Field::Long(l) => l.into_value_with(handle),
            Field::UByte(ub) => ub.into_value_with(handle),
            Field::UShort(us) => us.into_value_with(handle),
            Field::UInt(ui) => ui.into_value_with(handle),
            Field::ULong(ul) => ul.into_value_with(handle),
            // f16 is not a stable Rust type; widen to f32 for Ruby.
            Field::Float16(f) => f32::from(f).into_value_with(handle),
            Field::Float(f) => f.into_value_with(handle),
            Field::Double(d) => d.into_value_with(handle),
            Field::Str(s) => s.into_value_with(handle),
            Field::Byte(b) => b.into_value_with(handle),
            // Raw bytes become a Ruby String (binary data, not UTF-8 checked).
            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
            Field::Date(d) => {
                // Parquet DATE is days since the Unix epoch; render as an
                // ISO-8601 date string.
                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
                let formatted = ts.strftime("%Y-%m-%d").to_string();
                formatted.into_value_with(handle)
            }
            Field::TimestampMillis(ts) => {
                // Round-trip through a string so Ruby's Time.parse applies
                // its own zone handling.
                let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
                let time_class = handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with(handle)
            }
            Field::TimestampMicros(ts) => {
                let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
                let time_class = handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with(handle)
            }
            Field::ListInternal(list) => {
                // Recursively convert each element; clone is required because
                // the row API only exposes borrowed elements.
                let elements = list.elements();
                let ary = handle.ary_new_capa(elements.len());
                elements
                    .iter()
                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
                    .unwrap();
                ary.into_value_with(handle)
            }
            Field::MapInternal(map) => {
                // Keys and values are both Fields, so both convert recursively.
                let entries = map.entries();
                let hash = handle.hash_new_capa(entries.len());
                entries
                    .iter()
                    .try_for_each(|(k, v)| {
                        hash.aset(
                            ParquetField(k.clone()).into_value_with(handle),
                            ParquetField(v.clone()).into_value_with(handle),
                        )
                    })
                    .unwrap();
                hash.into_value_with(handle)
            }
            Field::Decimal(d) => {
                // Render the decimal as "<unscaled>e-<scale>" scientific
                // notation and let Ruby's BigDecimal parse it.
                let value = match d {
                    Decimal::Int32 { value, scale, .. } => {
                        let unscaled = i32::from_be_bytes(value);
                        format!("{}e-{}", unscaled, scale)
                    }
                    Decimal::Int64 { value, scale, .. } => {
                        let unscaled = i64::from_be_bytes(value);
                        format!("{}e-{}", unscaled, scale)
                    }
                    Decimal::Bytes { value, scale, .. } => {
                        // Convert bytes to string representation of unscaled value
                        // NOTE(review): parquet DECIMAL bytes hold a big-endian
                        // two's-complement integer, so interpreting them as
                        // UTF-8 text looks wrong — confirm against real data.
                        let unscaled = String::from_utf8_lossy(value.data());
                        format!("{}e-{}", unscaled, scale)
                    }
                };
                // NOTE(review): string interpolation into `eval` — values are
                // numeric-formatted here, but a funcall to BigDecimal would be
                // safer than eval.
                handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
            }
            Field::Group(row) => {
                // Nested struct/group: one hash entry per column.
                let hash = handle.hash_new();
                row.get_column_iter()
                    .try_for_each(|(k, v)| {
                        hash.aset(
                            k.clone().into_value_with(handle),
                            ParquetField(v.clone()).into_value_with(handle),
                        )
                    })
                    .unwrap();
                hash.into_value_with(handle)
            }
        }
    }
}
268
-
269
/// Owned, Ruby-independent representation of a single Parquet/Arrow value,
/// used by the column-oriented reader path.
///
/// Timestamp variants carry the raw tick count plus an optional timezone
/// annotation (shared `Arc<str>` so one zone string serves a whole column).
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub enum ParquetValue {
    Int8(i8),
    Int16(i16),
    Int32(i32),
    Int64(i64),
    UInt8(u8),
    UInt16(u16),
    UInt32(u32),
    UInt64(u64),
    Float16(f32), // f16 converted to f32
    Float32(f32),
    Float64(f64),
    Boolean(bool),
    String(String),
    Bytes(Vec<u8>),
    // Days since the Unix epoch (Arrow Date32).
    Date32(i32),
    // Milliseconds since the Unix epoch (Arrow Date64).
    Date64(i64),
    TimestampSecond(i64, Option<Arc<str>>),
    TimestampMillis(i64, Option<Arc<str>>),
    TimestampMicros(i64, Option<Arc<str>>),
    TimestampNanos(i64, Option<Arc<str>>),
    List(Vec<ParquetValue>),
    Map(HashMap<ParquetValue, ParquetValue>),
    Null,
}
296
-
297
- impl PartialEq for ParquetValue {
298
- fn eq(&self, other: &Self) -> bool {
299
- match (self, other) {
300
- (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
301
- (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
302
- (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
303
- (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
304
- (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
305
- (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
306
- (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
307
- (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
308
- (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
309
- (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
310
- (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
311
- (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
312
- (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
313
- (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
314
- (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
315
- (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
316
- (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
317
- (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
318
- (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
319
- (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
320
- (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
321
- (ParquetValue::Null, ParquetValue::Null) => true,
322
- _ => false,
323
- }
324
- }
325
- }
326
-
327
- impl Eq for ParquetValue {}
328
-
329
- #[derive(Debug)]
330
- pub struct ParquetValueVec(Vec<ParquetValue>);
331
-
332
- impl ParquetValueVec {
333
- pub fn into_inner(self) -> Vec<ParquetValue> {
334
- self.0
335
- }
336
- }
337
-
338
- impl IntoIterator for ParquetValueVec {
339
- type Item = ParquetValue;
340
- type IntoIter = std::vec::IntoIter<ParquetValue>;
341
-
342
- fn into_iter(self) -> Self::IntoIter {
343
- self.0.into_iter()
344
- }
345
- }
346
-
347
- impl std::cmp::PartialEq for ParquetValueVec {
348
- fn eq(&self, other: &Self) -> bool {
349
- self.0 == other.0
350
- }
351
- }
352
-
353
- impl std::cmp::Eq for ParquetValueVec {}
354
-
355
- impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
356
- type Error = String;
357
-
358
- fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
359
- ParquetValueVec::try_from(&*column)
360
- }
361
- }
362
-
363
- // Add macro for handling numeric array conversions
364
- macro_rules! impl_numeric_array_conversion {
365
- ($column:expr, $array_type:ty, $variant:ident) => {{
366
- let array = downcast_array::<$array_type>($column);
367
- if array.is_nullable() {
368
- array
369
- .values()
370
- .iter()
371
- .enumerate()
372
- .map(|(i, x)| {
373
- if array.is_null(i) {
374
- ParquetValue::Null
375
- } else {
376
- ParquetValue::$variant(*x)
377
- }
378
- })
379
- .collect()
380
- } else {
381
- array
382
- .values()
383
- .iter()
384
- .map(|x| ParquetValue::$variant(*x))
385
- .collect()
386
- }
387
- }};
388
- }
389
-
390
- // Add macro for handling boolean array conversions
391
- macro_rules! impl_boolean_array_conversion {
392
- ($column:expr, $array_type:ty, $variant:ident) => {{
393
- let array = downcast_array::<$array_type>($column);
394
- if array.is_nullable() {
395
- array
396
- .values()
397
- .iter()
398
- .enumerate()
399
- .map(|(i, x)| {
400
- if array.is_null(i) {
401
- ParquetValue::Null
402
- } else {
403
- ParquetValue::$variant(x)
404
- }
405
- })
406
- .collect()
407
- } else {
408
- array
409
- .values()
410
- .iter()
411
- .map(|x| ParquetValue::$variant(x))
412
- .collect()
413
- }
414
- }};
415
- }
416
-
417
- // Add macro for handling timestamp array conversions
418
- macro_rules! impl_timestamp_array_conversion {
419
- ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
420
- let array = downcast_array::<$array_type>($column);
421
- if array.is_nullable() {
422
- array
423
- .values()
424
- .iter()
425
- .enumerate()
426
- .map(|(i, x)| {
427
- if array.is_null(i) {
428
- ParquetValue::Null
429
- } else {
430
- ParquetValue::$variant(*x, $tz.clone())
431
- }
432
- })
433
- .collect()
434
- } else {
435
- array
436
- .values()
437
- .iter()
438
- .map(|x| ParquetValue::$variant(*x, $tz.clone()))
439
- .collect()
440
- }
441
- }};
442
- }
443
-
444
impl TryFrom<&dyn Array> for ParquetValueVec {
    type Error = String;

    /// Converts one Arrow array into an owned column of `ParquetValue`s,
    /// dispatching on the array's `DataType`. Unsupported types return an
    /// `Err`; malformed nested data (list/struct elements that fail to
    /// convert) panics rather than propagating — see NOTE below.
    fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
        let tmp_vec = match column.data_type() {
            DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
            DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
            DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
            DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
            DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
            DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
            DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
            DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
            DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
            DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
            DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
            DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
            DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
            // Each timestamp unit maps to its own variant, carrying the
            // column-level timezone annotation along with every element.
            DataType::Timestamp(TimeUnit::Second, tz) => {
                impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
            }
            DataType::Timestamp(TimeUnit::Millisecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampMillisecondArray,
                    TimestampMillis,
                    tz
                )
            }
            DataType::Timestamp(TimeUnit::Microsecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampMicrosecondArray,
                    TimestampMicros,
                    tz
                )
            }
            DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
                impl_timestamp_array_conversion!(
                    column,
                    TimestampNanosecondArray,
                    TimestampNanos,
                    tz
                )
            }
            // Because f16 is unstable in Rust, we convert it to f32
            DataType::Float16 => {
                let array = downcast_array::<Float16Array>(column);
                if array.is_nullable() {
                    array
                        .values()
                        .iter()
                        .enumerate()
                        .map(|(i, x)| {
                            if array.is_null(i) {
                                ParquetValue::Null
                            } else {
                                ParquetValue::Float16(f32::from(*x))
                            }
                        })
                        .collect()
                } else {
                    array
                        .values()
                        .iter()
                        .map(|x| ParquetValue::Float16(f32::from(*x)))
                        .collect()
                }
            }
            // Variable-length types use the arrays' Option-yielding
            // iterators, which handle the null bitmap for us.
            DataType::Utf8 => {
                let array = downcast_array::<StringArray>(column);
                array
                    .iter()
                    .map(|opt_x| match opt_x {
                        Some(x) => ParquetValue::String(x.to_string()),
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::Binary => {
                let array = downcast_array::<BinaryArray>(column);
                array
                    .iter()
                    .map(|opt_x| match opt_x {
                        Some(x) => ParquetValue::Bytes(x.to_vec()),
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::List(_field) => {
                // Recursive conversion of each child array.
                // NOTE(review): inner conversion errors panic instead of
                // returning Err — consider propagating.
                let list_array = downcast_array::<ListArray>(column);
                list_array
                    .iter()
                    .map(|x| match x {
                        Some(values) => match ParquetValueVec::try_from(values) {
                            Ok(vec) => ParquetValue::List(vec.into_inner()),
                            Err(e) => {
                                panic!("Error converting list array to ParquetValueVec: {}", e)
                            }
                        },
                        None => ParquetValue::Null,
                    })
                    .collect()
            }
            DataType::Struct(_) => {
                // Structs become Map values keyed by field name. Each field
                // is converted one row at a time via a length-1 slice.
                let struct_array = downcast_array::<StructArray>(column);
                let mut values = Vec::with_capacity(struct_array.len());
                for i in 0..struct_array.len() {
                    if struct_array.is_null(i) {
                        values.push(ParquetValue::Null);
                        continue;
                    }

                    let mut map = std::collections::HashMap::new();
                    for (field_idx, field) in struct_array.fields().iter().enumerate() {
                        let column = struct_array.column(field_idx);
                        let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
                            Ok(vec) => vec.into_inner(),
                            Err(e) => {
                                panic!("Error converting struct field to ParquetValueVec: {}", e)
                            }
                        };
                        map.insert(
                            ParquetValue::String(field.name().to_string()),
                            field_values.into_iter().next().unwrap(),
                        );
                    }
                    values.push(ParquetValue::Map(map));
                }
                values
            }
            DataType::Null => {
                let x = downcast_array::<NullArray>(column);
                vec![ParquetValue::Null; x.len()]
            }
            _ => {
                return Err(format!("Unsupported data type: {:?}", column.data_type()));
            }
        };
        Ok(ParquetValueVec(tmp_vec))
    }
}
586
-
587
impl std::hash::Hash for ParquetValue {
    /// Hashes the variant's payload.
    ///
    /// Floats hash their raw bit pattern (`to_bits`), so NaN has a stable
    /// hash and +0.0 / -0.0 hash differently. `Null` hashes as a constant.
    ///
    /// NOTE(review): timestamps hash both the tick count AND the timezone
    /// annotation, while `PartialEq` ignores the timezone — equal values can
    /// therefore hash differently, which breaks the Hash/Eq contract for
    /// timestamp keys with differing zone strings.
    ///
    /// Panics if a `Map` value is itself used as a hash key.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        match self {
            ParquetValue::Int8(i) => i.hash(state),
            ParquetValue::Int16(i) => i.hash(state),
            ParquetValue::Int32(i) => i.hash(state),
            ParquetValue::Int64(i) => i.hash(state),
            ParquetValue::UInt8(i) => i.hash(state),
            ParquetValue::UInt16(i) => i.hash(state),
            ParquetValue::UInt32(i) => i.hash(state),
            ParquetValue::UInt64(i) => i.hash(state),
            ParquetValue::Float16(f) => f.to_bits().hash(state),
            ParquetValue::Float32(f) => f.to_bits().hash(state),
            ParquetValue::Float64(f) => f.to_bits().hash(state),
            ParquetValue::Boolean(b) => b.hash(state),
            ParquetValue::String(s) => s.hash(state),
            ParquetValue::Bytes(b) => b.hash(state),
            ParquetValue::Date32(d) => d.hash(state),
            ParquetValue::Date64(d) => d.hash(state),
            ParquetValue::TimestampSecond(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampMillis(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampMicros(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::TimestampNanos(ts, tz) => {
                ts.hash(state);
                tz.hash(state);
            }
            ParquetValue::List(l) => l.hash(state),
            // HashMap has no deterministic iteration order to fold into the
            // hasher, so Map keys are unsupported.
            ParquetValue::Map(_m) => panic!("Map is not hashable"),
            ParquetValue::Null => 0_i32.hash(state),
        }
    }
}
628
-
629
- impl IntoValue for ParquetValue {
630
- fn into_value_with(self, handle: &Ruby) -> Value {
631
- match self {
632
- ParquetValue::Int8(i) => i.into_value_with(handle),
633
- ParquetValue::Int16(i) => i.into_value_with(handle),
634
- ParquetValue::Int32(i) => i.into_value_with(handle),
635
- ParquetValue::Int64(i) => i.into_value_with(handle),
636
- ParquetValue::UInt8(i) => i.into_value_with(handle),
637
- ParquetValue::UInt16(i) => i.into_value_with(handle),
638
- ParquetValue::UInt32(i) => i.into_value_with(handle),
639
- ParquetValue::UInt64(i) => i.into_value_with(handle),
640
- ParquetValue::Float16(f) => f.into_value_with(handle),
641
- ParquetValue::Float32(f) => f.into_value_with(handle),
642
- ParquetValue::Float64(f) => f.into_value_with(handle),
643
- ParquetValue::Boolean(b) => b.into_value_with(handle),
644
- ParquetValue::String(s) => s.into_value_with(handle),
645
- ParquetValue::Bytes(b) => b.into_value_with(handle),
646
- ParquetValue::Date32(d) => {
647
- let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
648
- let formatted = ts.strftime("%Y-%m-%d").to_string();
649
- formatted.into_value_with(handle)
650
- }
651
- ParquetValue::Date64(d) => {
652
- let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
653
- let formatted = ts.strftime("%Y-%m-%d").to_string();
654
- formatted.into_value_with(handle)
655
- }
656
- ParquetValue::TimestampSecond(ts, tz) => {
657
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
658
- let time_class = handle.class_time();
659
- time_class
660
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
661
- .unwrap()
662
- .into_value_with(handle)
663
- }
664
- ParquetValue::TimestampMillis(ts, tz) => {
665
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
666
- let time_class = handle.class_time();
667
- time_class
668
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
669
- .unwrap()
670
- .into_value_with(handle)
671
- }
672
- ParquetValue::TimestampMicros(ts, tz) => {
673
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
674
- let time_class = handle.class_time();
675
- time_class
676
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
677
- .unwrap()
678
- .into_value_with(handle)
679
- }
680
- ParquetValue::TimestampNanos(ts, tz) => {
681
- let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
682
- let time_class = handle.class_time();
683
- time_class
684
- .funcall::<_, _, Value>("parse", (ts.to_string(),))
685
- .unwrap()
686
- .into_value_with(handle)
687
- }
688
- ParquetValue::List(l) => {
689
- let ary = handle.ary_new_capa(l.len());
690
- l.into_iter()
691
- .try_for_each(|v| ary.push(v.into_value_with(handle)))
692
- .unwrap();
693
- ary.into_value_with(handle)
694
- }
695
- ParquetValue::Map(m) => {
696
- let hash = handle.hash_new_capa(m.len());
697
- m.into_iter()
698
- .try_for_each(|(k, v)| {
699
- hash.aset(k.into_value_with(handle), v.into_value_with(handle))
700
- })
701
- .unwrap();
702
- hash.into_value_with(handle)
703
- }
704
- ParquetValue::Null => handle.qnil().as_value(),
705
- }
706
- }
707
- }
708
-
709
/// Converts a timestamp `ParquetValue` into a `jiff::Timestamp`, taking the
/// optional timezone annotation into account.
///
/// NOTE(review): `jiff::Timestamp` is an absolute instant, and both
/// `to_zoned(..).timestamp()` and `intz(..).timestamp()` return the same
/// instant that went in — so the timezone handling below never changes the
/// returned value. If zone-local rendering is the intent, this should return
/// a `jiff::Zoned` instead.
///
/// Panics (via `unwrap`) if the tick value is out of jiff's range, or if a
/// non-timestamp variant is passed.
fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
    let (ts, tz) = match value {
        ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
        ParquetValue::TimestampMillis(ts, tz) => {
            (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
        }
        ParquetValue::TimestampMicros(ts, tz) => {
            (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
        }
        ParquetValue::TimestampNanos(ts, tz) => {
            (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
        }
        _ => panic!("Invalid timestamp value"),
    };

    // If timezone is provided, convert to zoned timestamp
    if let Some(tz) = tz {
        // Handle fixed offset timezones like "+09:00" first
        if tz.starts_with('+') || tz.starts_with('-') {
            // Parse the offset string into hours and minutes.
            // NOTE(review): byte-range slicing (tz[1..3], tz[4..6]) panics on
            // short or non-ASCII inputs, e.g. "+09:0" (len 5, contains ':').
            let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
                // Format: "+09:00" or "-09:00"
                let h = tz[1..3].parse::<i32>().unwrap_or(0);
                let m = tz[4..6].parse::<i32>().unwrap_or(0);
                (h, m)
            } else if tz.len() >= 3 {
                // Format: "+09" or "-09"
                let h = tz[1..3].parse::<i32>().unwrap_or(0);
                (h, 0)
            } else {
                (0, 0)
            };

            // Apply sign
            let total_minutes = if tz.starts_with('-') {
                -(hours * 60 + minutes)
            } else {
                hours * 60 + minutes
            };

            // Create fixed timezone.
            // NOTE(review): integer division by 60 discards the minute
            // component, so offsets like "+05:30" collapse to +05:00.
            let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
            ts.to_zoned(tz).timestamp()
        } else {
            // Try IANA timezone
            match ts.intz(&tz) {
                Ok(zoned) => zoned.timestamp(),
                Err(_) => ts, // Fall back to UTC if timezone is invalid
            }
        }
    } else {
        // No timezone provided - treat as UTC
        ts
    }
}