parquet 0.0.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +50 -0
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/lib.rs +5 -3
- data/ext/parquet/src/{reader.rs → reader/mod.rs} +5 -2
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/types/parquet_value.rs +462 -0
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +275 -0
- data/ext/parquet/src/utils.rs +16 -5
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -6
- data/ext/parquet/src/types.rs +0 -763
- /data/ext/parquet/src/{parquet_column_reader.rs → reader/parquet_column_reader.rs} +0 -0
- /data/ext/parquet/src/{parquet_row_reader.rs → reader/parquet_row_reader.rs} +0 -0
data/ext/parquet/src/types.rs
DELETED
@@ -1,763 +0,0 @@
|
|
1
|
-
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher, sync::Arc};
|
2
|
-
|
3
|
-
use arrow_array::cast::downcast_array;
|
4
|
-
use arrow_array::{
|
5
|
-
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
|
6
|
-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
|
7
|
-
StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
8
|
-
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
9
|
-
};
|
10
|
-
use arrow_schema::{DataType, TimeUnit};
|
11
|
-
use itertools::Itertools;
|
12
|
-
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
13
|
-
use parquet::data_type::Decimal;
|
14
|
-
use parquet::record::Field;
|
15
|
-
|
16
|
-
use crate::header_cache::StringCacheKey;
|
17
|
-
|
18
|
-
/// Shape in which parsed records are handed back to the caller.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}

impl ParserResultType {
    /// Yields every variant once: `Hash` first, then `Array`.
    pub fn iter() -> impl Iterator<Item = Self> {
        let all = [ParserResultType::Hash, ParserResultType::Array];
        IntoIterator::into_iter(all)
    }
}

impl TryFrom<&str> for ParserResultType {
    type Error = String;

    /// Accepts exactly `"hash"` or `"array"`; any other text is reported in the error.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        if value == "hash" {
            return Ok(Self::Hash);
        }
        if value == "array" {
            return Ok(Self::Array);
        }
        Err(format!("Invalid parser result type: {}", value))
    }
}

impl TryFrom<String> for ParserResultType {
    type Error = String;

    /// Owned-string convenience that defers to the `&str` conversion.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        let borrowed: &str = &value;
        Self::try_from(borrowed)
    }
}

impl std::fmt::Display for ParserResultType {
    /// Writes the same lowercase name that `TryFrom<&str>` accepts.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            ParserResultType::Hash => "hash",
            ParserResultType::Array => "array",
        };
        f.write_str(name)
    }
}
|
58
|
-
|
59
|
-
#[derive(Debug)]
|
60
|
-
pub enum RowRecord<S: BuildHasher + Default> {
|
61
|
-
Vec(Vec<ParquetField>),
|
62
|
-
Map(HashMap<StringCacheKey, ParquetField, S>),
|
63
|
-
}
|
64
|
-
|
65
|
-
#[derive(Debug)]
|
66
|
-
pub enum ColumnRecord<S: BuildHasher + Default> {
|
67
|
-
Vec(Vec<Vec<ParquetValue>>),
|
68
|
-
Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
|
69
|
-
}
|
70
|
-
|
71
|
-
impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
|
72
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
73
|
-
match self {
|
74
|
-
RowRecord::Vec(vec) => {
|
75
|
-
let ary = handle.ary_new_capa(vec.len());
|
76
|
-
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
77
|
-
handle.into_value(ary)
|
78
|
-
}
|
79
|
-
RowRecord::Map(map) => {
|
80
|
-
let hash = handle.hash_new_capa(map.len());
|
81
|
-
|
82
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
83
|
-
let mut i = 0;
|
84
|
-
|
85
|
-
for chunk in &map.into_iter().chunks(64) {
|
86
|
-
// Reduced to 64 to ensure space for pairs
|
87
|
-
for (k, v) in chunk {
|
88
|
-
if i + 1 >= values.len() {
|
89
|
-
// Bulk insert current batch if array is full
|
90
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
91
|
-
values[..i].fill(handle.qnil().as_value());
|
92
|
-
i = 0;
|
93
|
-
}
|
94
|
-
values[i] = handle.into_value(k);
|
95
|
-
values[i + 1] = handle.into_value(v);
|
96
|
-
i += 2;
|
97
|
-
}
|
98
|
-
// Insert any remaining pairs
|
99
|
-
if i > 0 {
|
100
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
101
|
-
values[..i].fill(handle.qnil().as_value());
|
102
|
-
i = 0;
|
103
|
-
}
|
104
|
-
}
|
105
|
-
|
106
|
-
hash.into_value_with(handle)
|
107
|
-
}
|
108
|
-
}
|
109
|
-
}
|
110
|
-
}
|
111
|
-
|
112
|
-
impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
|
113
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
114
|
-
match self {
|
115
|
-
ColumnRecord::Vec(vec) => {
|
116
|
-
let ary = handle.ary_new_capa(vec.len());
|
117
|
-
vec.into_iter()
|
118
|
-
.try_for_each(|v| {
|
119
|
-
let nested_ary = handle.ary_new_capa(v.len());
|
120
|
-
v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
|
121
|
-
ary.push(nested_ary.into_value_with(handle))
|
122
|
-
})
|
123
|
-
.unwrap();
|
124
|
-
ary.into_value_with(handle)
|
125
|
-
}
|
126
|
-
ColumnRecord::Map(map) => {
|
127
|
-
let hash = handle.hash_new_capa(map.len());
|
128
|
-
|
129
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
130
|
-
let mut i = 0;
|
131
|
-
|
132
|
-
for chunk in &map.into_iter().chunks(64) {
|
133
|
-
// Reduced to 64 to ensure space for pairs
|
134
|
-
for (k, v) in chunk {
|
135
|
-
if i + 1 >= values.len() {
|
136
|
-
// Bulk insert current batch if array is full
|
137
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
138
|
-
values[..i].fill(handle.qnil().as_value());
|
139
|
-
i = 0;
|
140
|
-
}
|
141
|
-
values[i] = handle.into_value(k);
|
142
|
-
let ary = handle.ary_new_capa(v.len());
|
143
|
-
v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
144
|
-
values[i + 1] = handle.into_value(ary);
|
145
|
-
i += 2;
|
146
|
-
}
|
147
|
-
// Insert any remaining pairs
|
148
|
-
if i > 0 {
|
149
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
150
|
-
values[..i].fill(handle.qnil().as_value());
|
151
|
-
i = 0;
|
152
|
-
}
|
153
|
-
}
|
154
|
-
|
155
|
-
hash.into_value_with(handle)
|
156
|
-
}
|
157
|
-
}
|
158
|
-
}
|
159
|
-
}
|
160
|
-
|
161
|
-
#[derive(Debug, Clone)]
|
162
|
-
pub struct CowValue<'a>(pub Cow<'a, str>);
|
163
|
-
|
164
|
-
impl<'a> IntoValue for CowValue<'a> {
|
165
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
166
|
-
self.0.into_value_with(handle)
|
167
|
-
}
|
168
|
-
}
|
169
|
-
|
170
|
-
#[derive(Debug)]
|
171
|
-
pub struct ParquetField(pub Field);
|
172
|
-
|
173
|
-
impl IntoValue for ParquetField {
|
174
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
175
|
-
match self.0 {
|
176
|
-
Field::Null => handle.qnil().as_value(),
|
177
|
-
Field::Bool(b) => b.into_value_with(handle),
|
178
|
-
Field::Short(s) => s.into_value_with(handle),
|
179
|
-
Field::Int(i) => i.into_value_with(handle),
|
180
|
-
Field::Long(l) => l.into_value_with(handle),
|
181
|
-
Field::UByte(ub) => ub.into_value_with(handle),
|
182
|
-
Field::UShort(us) => us.into_value_with(handle),
|
183
|
-
Field::UInt(ui) => ui.into_value_with(handle),
|
184
|
-
Field::ULong(ul) => ul.into_value_with(handle),
|
185
|
-
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
186
|
-
Field::Float(f) => f.into_value_with(handle),
|
187
|
-
Field::Double(d) => d.into_value_with(handle),
|
188
|
-
Field::Str(s) => s.into_value_with(handle),
|
189
|
-
Field::Byte(b) => b.into_value_with(handle),
|
190
|
-
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
191
|
-
Field::Date(d) => {
|
192
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
193
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
194
|
-
formatted.into_value_with(handle)
|
195
|
-
}
|
196
|
-
Field::TimestampMillis(ts) => {
|
197
|
-
let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
|
198
|
-
let time_class = handle.class_time();
|
199
|
-
time_class
|
200
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
201
|
-
.unwrap()
|
202
|
-
.into_value_with(handle)
|
203
|
-
}
|
204
|
-
Field::TimestampMicros(ts) => {
|
205
|
-
let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
|
206
|
-
let time_class = handle.class_time();
|
207
|
-
time_class
|
208
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
209
|
-
.unwrap()
|
210
|
-
.into_value_with(handle)
|
211
|
-
}
|
212
|
-
Field::ListInternal(list) => {
|
213
|
-
let elements = list.elements();
|
214
|
-
let ary = handle.ary_new_capa(elements.len());
|
215
|
-
elements
|
216
|
-
.iter()
|
217
|
-
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
218
|
-
.unwrap();
|
219
|
-
ary.into_value_with(handle)
|
220
|
-
}
|
221
|
-
Field::MapInternal(map) => {
|
222
|
-
let entries = map.entries();
|
223
|
-
let hash = handle.hash_new_capa(entries.len());
|
224
|
-
entries
|
225
|
-
.iter()
|
226
|
-
.try_for_each(|(k, v)| {
|
227
|
-
hash.aset(
|
228
|
-
ParquetField(k.clone()).into_value_with(handle),
|
229
|
-
ParquetField(v.clone()).into_value_with(handle),
|
230
|
-
)
|
231
|
-
})
|
232
|
-
.unwrap();
|
233
|
-
hash.into_value_with(handle)
|
234
|
-
}
|
235
|
-
Field::Decimal(d) => {
|
236
|
-
let value = match d {
|
237
|
-
Decimal::Int32 { value, scale, .. } => {
|
238
|
-
let unscaled = i32::from_be_bytes(value);
|
239
|
-
format!("{}e-{}", unscaled, scale)
|
240
|
-
}
|
241
|
-
Decimal::Int64 { value, scale, .. } => {
|
242
|
-
let unscaled = i64::from_be_bytes(value);
|
243
|
-
format!("{}e-{}", unscaled, scale)
|
244
|
-
}
|
245
|
-
Decimal::Bytes { value, scale, .. } => {
|
246
|
-
// Convert bytes to string representation of unscaled value
|
247
|
-
let unscaled = String::from_utf8_lossy(value.data());
|
248
|
-
format!("{}e-{}", unscaled, scale)
|
249
|
-
}
|
250
|
-
};
|
251
|
-
handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
|
252
|
-
}
|
253
|
-
Field::Group(row) => {
|
254
|
-
let hash = handle.hash_new();
|
255
|
-
row.get_column_iter()
|
256
|
-
.try_for_each(|(k, v)| {
|
257
|
-
hash.aset(
|
258
|
-
k.clone().into_value_with(handle),
|
259
|
-
ParquetField(v.clone()).into_value_with(handle),
|
260
|
-
)
|
261
|
-
})
|
262
|
-
.unwrap();
|
263
|
-
hash.into_value_with(handle)
|
264
|
-
}
|
265
|
-
}
|
266
|
-
}
|
267
|
-
}
|
268
|
-
|
269
|
-
#[allow(dead_code)]
|
270
|
-
#[derive(Clone, Debug)]
|
271
|
-
pub enum ParquetValue {
|
272
|
-
Int8(i8),
|
273
|
-
Int16(i16),
|
274
|
-
Int32(i32),
|
275
|
-
Int64(i64),
|
276
|
-
UInt8(u8),
|
277
|
-
UInt16(u16),
|
278
|
-
UInt32(u32),
|
279
|
-
UInt64(u64),
|
280
|
-
Float16(f32), // f16 converted to f32
|
281
|
-
Float32(f32),
|
282
|
-
Float64(f64),
|
283
|
-
Boolean(bool),
|
284
|
-
String(String),
|
285
|
-
Bytes(Vec<u8>),
|
286
|
-
Date32(i32),
|
287
|
-
Date64(i64),
|
288
|
-
TimestampSecond(i64, Option<Arc<str>>),
|
289
|
-
TimestampMillis(i64, Option<Arc<str>>),
|
290
|
-
TimestampMicros(i64, Option<Arc<str>>),
|
291
|
-
TimestampNanos(i64, Option<Arc<str>>),
|
292
|
-
List(Vec<ParquetValue>),
|
293
|
-
Map(HashMap<ParquetValue, ParquetValue>),
|
294
|
-
Null,
|
295
|
-
}
|
296
|
-
|
297
|
-
impl PartialEq for ParquetValue {
|
298
|
-
fn eq(&self, other: &Self) -> bool {
|
299
|
-
match (self, other) {
|
300
|
-
(ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
|
301
|
-
(ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
|
302
|
-
(ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
|
303
|
-
(ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
|
304
|
-
(ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
|
305
|
-
(ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
|
306
|
-
(ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
|
307
|
-
(ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
|
308
|
-
(ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
|
309
|
-
(ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
|
310
|
-
(ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
|
311
|
-
(ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
|
312
|
-
(ParquetValue::String(a), ParquetValue::String(b)) => a == b,
|
313
|
-
(ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
|
314
|
-
(ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
|
315
|
-
(ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
|
316
|
-
(ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
|
317
|
-
(ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
|
318
|
-
(ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
|
319
|
-
(ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
|
320
|
-
(ParquetValue::List(a), ParquetValue::List(b)) => a == b,
|
321
|
-
(ParquetValue::Null, ParquetValue::Null) => true,
|
322
|
-
_ => false,
|
323
|
-
}
|
324
|
-
}
|
325
|
-
}
|
326
|
-
|
327
|
-
impl Eq for ParquetValue {}
|
328
|
-
|
329
|
-
#[derive(Debug)]
|
330
|
-
pub struct ParquetValueVec(Vec<ParquetValue>);
|
331
|
-
|
332
|
-
impl ParquetValueVec {
|
333
|
-
pub fn into_inner(self) -> Vec<ParquetValue> {
|
334
|
-
self.0
|
335
|
-
}
|
336
|
-
}
|
337
|
-
|
338
|
-
impl IntoIterator for ParquetValueVec {
|
339
|
-
type Item = ParquetValue;
|
340
|
-
type IntoIter = std::vec::IntoIter<ParquetValue>;
|
341
|
-
|
342
|
-
fn into_iter(self) -> Self::IntoIter {
|
343
|
-
self.0.into_iter()
|
344
|
-
}
|
345
|
-
}
|
346
|
-
|
347
|
-
impl std::cmp::PartialEq for ParquetValueVec {
|
348
|
-
fn eq(&self, other: &Self) -> bool {
|
349
|
-
self.0 == other.0
|
350
|
-
}
|
351
|
-
}
|
352
|
-
|
353
|
-
impl std::cmp::Eq for ParquetValueVec {}
|
354
|
-
|
355
|
-
impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
|
356
|
-
type Error = String;
|
357
|
-
|
358
|
-
fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
|
359
|
-
ParquetValueVec::try_from(&*column)
|
360
|
-
}
|
361
|
-
}
|
362
|
-
|
363
|
-
// Downcasts `$column` to `$array_type` and wraps each raw value in
// `ParquetValue::$variant`. When the array reports itself as nullable, null
// slots become `ParquetValue::Null`.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        let raw = typed.values().iter().copied();
        if !typed.is_nullable() {
            // Not nullable: every slot can be wrapped directly.
            raw.map(ParquetValue::$variant).collect()
        } else {
            raw.enumerate()
                .map(|(idx, value)| match typed.is_null(idx) {
                    true => ParquetValue::Null,
                    false => ParquetValue::$variant(value),
                })
                .collect()
        }
    }};
}
|
389
|
-
|
390
|
-
// Downcasts `$column` to `$array_type` and wraps each boolean in
// `ParquetValue::$variant`. When the array reports itself as nullable, null
// slots become `ParquetValue::Null`.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        let bits = typed.values().iter();
        if !typed.is_nullable() {
            // Not nullable: every slot can be wrapped directly.
            bits.map(ParquetValue::$variant).collect()
        } else {
            bits.enumerate()
                .map(|(idx, flag)| match typed.is_null(idx) {
                    true => ParquetValue::Null,
                    false => ParquetValue::$variant(flag),
                })
                .collect()
        }
    }};
}
|
416
|
-
|
417
|
-
// Downcasts `$column` to `$array_type` and wraps each raw timestamp in
// `ParquetValue::$variant`, attaching a clone of `$tz` to every value. When the
// array reports itself as nullable, null slots become `ParquetValue::Null`.
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let typed = downcast_array::<$array_type>($column);
        let raw = typed.values().iter().copied();
        if !typed.is_nullable() {
            // Not nullable: every slot can be wrapped directly.
            raw.map(|ticks| ParquetValue::$variant(ticks, $tz.clone()))
                .collect()
        } else {
            raw.enumerate()
                .map(|(idx, ticks)| match typed.is_null(idx) {
                    true => ParquetValue::Null,
                    false => ParquetValue::$variant(ticks, $tz.clone()),
                })
                .collect()
        }
    }};
}
|
443
|
-
|
444
|
-
impl TryFrom<&dyn Array> for ParquetValueVec {
|
445
|
-
type Error = String;
|
446
|
-
|
447
|
-
fn try_from(column: &dyn Array) -> Result<Self, Self::Error> {
|
448
|
-
let tmp_vec = match column.data_type() {
|
449
|
-
DataType::Boolean => impl_boolean_array_conversion!(column, BooleanArray, Boolean),
|
450
|
-
DataType::Int8 => impl_numeric_array_conversion!(column, Int8Array, Int8),
|
451
|
-
DataType::Int16 => impl_numeric_array_conversion!(column, Int16Array, Int16),
|
452
|
-
DataType::Int32 => impl_numeric_array_conversion!(column, Int32Array, Int32),
|
453
|
-
DataType::Int64 => impl_numeric_array_conversion!(column, Int64Array, Int64),
|
454
|
-
DataType::UInt8 => impl_numeric_array_conversion!(column, UInt8Array, UInt8),
|
455
|
-
DataType::UInt16 => impl_numeric_array_conversion!(column, UInt16Array, UInt16),
|
456
|
-
DataType::UInt32 => impl_numeric_array_conversion!(column, UInt32Array, UInt32),
|
457
|
-
DataType::UInt64 => impl_numeric_array_conversion!(column, UInt64Array, UInt64),
|
458
|
-
DataType::Float32 => impl_numeric_array_conversion!(column, Float32Array, Float32),
|
459
|
-
DataType::Float64 => impl_numeric_array_conversion!(column, Float64Array, Float64),
|
460
|
-
DataType::Date32 => impl_numeric_array_conversion!(column, Date32Array, Date32),
|
461
|
-
DataType::Date64 => impl_numeric_array_conversion!(column, Date64Array, Date64),
|
462
|
-
DataType::Timestamp(TimeUnit::Second, tz) => {
|
463
|
-
impl_timestamp_array_conversion!(column, TimestampSecondArray, TimestampSecond, tz)
|
464
|
-
}
|
465
|
-
DataType::Timestamp(TimeUnit::Millisecond, tz) => {
|
466
|
-
impl_timestamp_array_conversion!(
|
467
|
-
column,
|
468
|
-
TimestampMillisecondArray,
|
469
|
-
TimestampMillis,
|
470
|
-
tz
|
471
|
-
)
|
472
|
-
}
|
473
|
-
DataType::Timestamp(TimeUnit::Microsecond, tz) => {
|
474
|
-
impl_timestamp_array_conversion!(
|
475
|
-
column,
|
476
|
-
TimestampMicrosecondArray,
|
477
|
-
TimestampMicros,
|
478
|
-
tz
|
479
|
-
)
|
480
|
-
}
|
481
|
-
DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
|
482
|
-
impl_timestamp_array_conversion!(
|
483
|
-
column,
|
484
|
-
TimestampNanosecondArray,
|
485
|
-
TimestampNanos,
|
486
|
-
tz
|
487
|
-
)
|
488
|
-
}
|
489
|
-
// Because f16 is unstable in Rust, we convert it to f32
|
490
|
-
DataType::Float16 => {
|
491
|
-
let array = downcast_array::<Float16Array>(column);
|
492
|
-
if array.is_nullable() {
|
493
|
-
array
|
494
|
-
.values()
|
495
|
-
.iter()
|
496
|
-
.enumerate()
|
497
|
-
.map(|(i, x)| {
|
498
|
-
if array.is_null(i) {
|
499
|
-
ParquetValue::Null
|
500
|
-
} else {
|
501
|
-
ParquetValue::Float16(f32::from(*x))
|
502
|
-
}
|
503
|
-
})
|
504
|
-
.collect()
|
505
|
-
} else {
|
506
|
-
array
|
507
|
-
.values()
|
508
|
-
.iter()
|
509
|
-
.map(|x| ParquetValue::Float16(f32::from(*x)))
|
510
|
-
.collect()
|
511
|
-
}
|
512
|
-
}
|
513
|
-
DataType::Utf8 => {
|
514
|
-
let array = downcast_array::<StringArray>(column);
|
515
|
-
array
|
516
|
-
.iter()
|
517
|
-
.map(|opt_x| match opt_x {
|
518
|
-
Some(x) => ParquetValue::String(x.to_string()),
|
519
|
-
None => ParquetValue::Null,
|
520
|
-
})
|
521
|
-
.collect()
|
522
|
-
}
|
523
|
-
DataType::Binary => {
|
524
|
-
let array = downcast_array::<BinaryArray>(column);
|
525
|
-
array
|
526
|
-
.iter()
|
527
|
-
.map(|opt_x| match opt_x {
|
528
|
-
Some(x) => ParquetValue::Bytes(x.to_vec()),
|
529
|
-
None => ParquetValue::Null,
|
530
|
-
})
|
531
|
-
.collect()
|
532
|
-
}
|
533
|
-
DataType::List(_field) => {
|
534
|
-
let list_array = downcast_array::<ListArray>(column);
|
535
|
-
list_array
|
536
|
-
.iter()
|
537
|
-
.map(|x| match x {
|
538
|
-
Some(values) => match ParquetValueVec::try_from(values) {
|
539
|
-
Ok(vec) => ParquetValue::List(vec.into_inner()),
|
540
|
-
Err(e) => {
|
541
|
-
panic!("Error converting list array to ParquetValueVec: {}", e)
|
542
|
-
}
|
543
|
-
},
|
544
|
-
None => ParquetValue::Null,
|
545
|
-
})
|
546
|
-
.collect()
|
547
|
-
}
|
548
|
-
DataType::Struct(_) => {
|
549
|
-
let struct_array = downcast_array::<StructArray>(column);
|
550
|
-
let mut values = Vec::with_capacity(struct_array.len());
|
551
|
-
for i in 0..struct_array.len() {
|
552
|
-
if struct_array.is_null(i) {
|
553
|
-
values.push(ParquetValue::Null);
|
554
|
-
continue;
|
555
|
-
}
|
556
|
-
|
557
|
-
let mut map = std::collections::HashMap::new();
|
558
|
-
for (field_idx, field) in struct_array.fields().iter().enumerate() {
|
559
|
-
let column = struct_array.column(field_idx);
|
560
|
-
let field_values = match ParquetValueVec::try_from(column.slice(i, 1)) {
|
561
|
-
Ok(vec) => vec.into_inner(),
|
562
|
-
Err(e) => {
|
563
|
-
panic!("Error converting struct field to ParquetValueVec: {}", e)
|
564
|
-
}
|
565
|
-
};
|
566
|
-
map.insert(
|
567
|
-
ParquetValue::String(field.name().to_string()),
|
568
|
-
field_values.into_iter().next().unwrap(),
|
569
|
-
);
|
570
|
-
}
|
571
|
-
values.push(ParquetValue::Map(map));
|
572
|
-
}
|
573
|
-
values
|
574
|
-
}
|
575
|
-
DataType::Null => {
|
576
|
-
let x = downcast_array::<NullArray>(column);
|
577
|
-
vec![ParquetValue::Null; x.len()]
|
578
|
-
}
|
579
|
-
_ => {
|
580
|
-
return Err(format!("Unsupported data type: {:?}", column.data_type()));
|
581
|
-
}
|
582
|
-
};
|
583
|
-
Ok(ParquetValueVec(tmp_vec))
|
584
|
-
}
|
585
|
-
}
|
586
|
-
|
587
|
-
impl std::hash::Hash for ParquetValue {
|
588
|
-
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
589
|
-
match self {
|
590
|
-
ParquetValue::Int8(i) => i.hash(state),
|
591
|
-
ParquetValue::Int16(i) => i.hash(state),
|
592
|
-
ParquetValue::Int32(i) => i.hash(state),
|
593
|
-
ParquetValue::Int64(i) => i.hash(state),
|
594
|
-
ParquetValue::UInt8(i) => i.hash(state),
|
595
|
-
ParquetValue::UInt16(i) => i.hash(state),
|
596
|
-
ParquetValue::UInt32(i) => i.hash(state),
|
597
|
-
ParquetValue::UInt64(i) => i.hash(state),
|
598
|
-
ParquetValue::Float16(f) => f.to_bits().hash(state),
|
599
|
-
ParquetValue::Float32(f) => f.to_bits().hash(state),
|
600
|
-
ParquetValue::Float64(f) => f.to_bits().hash(state),
|
601
|
-
ParquetValue::Boolean(b) => b.hash(state),
|
602
|
-
ParquetValue::String(s) => s.hash(state),
|
603
|
-
ParquetValue::Bytes(b) => b.hash(state),
|
604
|
-
ParquetValue::Date32(d) => d.hash(state),
|
605
|
-
ParquetValue::Date64(d) => d.hash(state),
|
606
|
-
ParquetValue::TimestampSecond(ts, tz) => {
|
607
|
-
ts.hash(state);
|
608
|
-
tz.hash(state);
|
609
|
-
}
|
610
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
611
|
-
ts.hash(state);
|
612
|
-
tz.hash(state);
|
613
|
-
}
|
614
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
615
|
-
ts.hash(state);
|
616
|
-
tz.hash(state);
|
617
|
-
}
|
618
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
619
|
-
ts.hash(state);
|
620
|
-
tz.hash(state);
|
621
|
-
}
|
622
|
-
ParquetValue::List(l) => l.hash(state),
|
623
|
-
ParquetValue::Map(_m) => panic!("Map is not hashable"),
|
624
|
-
ParquetValue::Null => 0_i32.hash(state),
|
625
|
-
}
|
626
|
-
}
|
627
|
-
}
|
628
|
-
|
629
|
-
impl IntoValue for ParquetValue {
|
630
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
631
|
-
match self {
|
632
|
-
ParquetValue::Int8(i) => i.into_value_with(handle),
|
633
|
-
ParquetValue::Int16(i) => i.into_value_with(handle),
|
634
|
-
ParquetValue::Int32(i) => i.into_value_with(handle),
|
635
|
-
ParquetValue::Int64(i) => i.into_value_with(handle),
|
636
|
-
ParquetValue::UInt8(i) => i.into_value_with(handle),
|
637
|
-
ParquetValue::UInt16(i) => i.into_value_with(handle),
|
638
|
-
ParquetValue::UInt32(i) => i.into_value_with(handle),
|
639
|
-
ParquetValue::UInt64(i) => i.into_value_with(handle),
|
640
|
-
ParquetValue::Float16(f) => f.into_value_with(handle),
|
641
|
-
ParquetValue::Float32(f) => f.into_value_with(handle),
|
642
|
-
ParquetValue::Float64(f) => f.into_value_with(handle),
|
643
|
-
ParquetValue::Boolean(b) => b.into_value_with(handle),
|
644
|
-
ParquetValue::String(s) => s.into_value_with(handle),
|
645
|
-
ParquetValue::Bytes(b) => b.into_value_with(handle),
|
646
|
-
ParquetValue::Date32(d) => {
|
647
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
648
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
649
|
-
formatted.into_value_with(handle)
|
650
|
-
}
|
651
|
-
ParquetValue::Date64(d) => {
|
652
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
653
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
654
|
-
formatted.into_value_with(handle)
|
655
|
-
}
|
656
|
-
ParquetValue::TimestampSecond(ts, tz) => {
|
657
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
|
658
|
-
let time_class = handle.class_time();
|
659
|
-
time_class
|
660
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
661
|
-
.unwrap()
|
662
|
-
.into_value_with(handle)
|
663
|
-
}
|
664
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
665
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
|
666
|
-
let time_class = handle.class_time();
|
667
|
-
time_class
|
668
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
669
|
-
.unwrap()
|
670
|
-
.into_value_with(handle)
|
671
|
-
}
|
672
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
673
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
|
674
|
-
let time_class = handle.class_time();
|
675
|
-
time_class
|
676
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
677
|
-
.unwrap()
|
678
|
-
.into_value_with(handle)
|
679
|
-
}
|
680
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
681
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
|
682
|
-
let time_class = handle.class_time();
|
683
|
-
time_class
|
684
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
685
|
-
.unwrap()
|
686
|
-
.into_value_with(handle)
|
687
|
-
}
|
688
|
-
ParquetValue::List(l) => {
|
689
|
-
let ary = handle.ary_new_capa(l.len());
|
690
|
-
l.into_iter()
|
691
|
-
.try_for_each(|v| ary.push(v.into_value_with(handle)))
|
692
|
-
.unwrap();
|
693
|
-
ary.into_value_with(handle)
|
694
|
-
}
|
695
|
-
ParquetValue::Map(m) => {
|
696
|
-
let hash = handle.hash_new_capa(m.len());
|
697
|
-
m.into_iter()
|
698
|
-
.try_for_each(|(k, v)| {
|
699
|
-
hash.aset(k.into_value_with(handle), v.into_value_with(handle))
|
700
|
-
})
|
701
|
-
.unwrap();
|
702
|
-
hash.into_value_with(handle)
|
703
|
-
}
|
704
|
-
ParquetValue::Null => handle.qnil().as_value(),
|
705
|
-
}
|
706
|
-
}
|
707
|
-
}
|
708
|
-
|
709
|
-
fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
|
710
|
-
let (ts, tz) = match value {
|
711
|
-
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
|
712
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
713
|
-
(jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
|
714
|
-
}
|
715
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
716
|
-
(jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
|
717
|
-
}
|
718
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
719
|
-
(jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
|
720
|
-
}
|
721
|
-
_ => panic!("Invalid timestamp value"),
|
722
|
-
};
|
723
|
-
|
724
|
-
// If timezone is provided, convert to zoned timestamp
|
725
|
-
if let Some(tz) = tz {
|
726
|
-
// Handle fixed offset timezones like "+09:00" first
|
727
|
-
if tz.starts_with('+') || tz.starts_with('-') {
|
728
|
-
// Parse the offset string into hours and minutes
|
729
|
-
let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
|
730
|
-
// Format: "+09:00" or "-09:00"
|
731
|
-
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
732
|
-
let m = tz[4..6].parse::<i32>().unwrap_or(0);
|
733
|
-
(h, m)
|
734
|
-
} else if tz.len() >= 3 {
|
735
|
-
// Format: "+09" or "-09"
|
736
|
-
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
737
|
-
(h, 0)
|
738
|
-
} else {
|
739
|
-
(0, 0)
|
740
|
-
};
|
741
|
-
|
742
|
-
// Apply sign
|
743
|
-
let total_minutes = if tz.starts_with('-') {
|
744
|
-
-(hours * 60 + minutes)
|
745
|
-
} else {
|
746
|
-
hours * 60 + minutes
|
747
|
-
};
|
748
|
-
|
749
|
-
// Create fixed timezone
|
750
|
-
let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
|
751
|
-
ts.to_zoned(tz).timestamp()
|
752
|
-
} else {
|
753
|
-
// Try IANA timezone
|
754
|
-
match ts.intz(&tz) {
|
755
|
-
Ok(zoned) => zoned.timestamp(),
|
756
|
-
Err(_) => ts, // Fall back to UTC if timezone is invalid
|
757
|
-
}
|
758
|
-
}
|
759
|
-
} else {
|
760
|
-
// No timezone provided - treat as UTC
|
761
|
-
ts
|
762
|
-
}
|
763
|
-
}
|