parquet 0.0.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
@@ -1,232 +1,8 @@
|
|
1
|
-
use
|
1
|
+
use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
|
2
2
|
|
3
|
-
use
|
4
|
-
use arrow_array::{
|
5
|
-
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
|
6
|
-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
|
7
|
-
StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
8
|
-
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
9
|
-
};
|
10
|
-
use arrow_schema::{DataType, TimeUnit};
|
11
|
-
use itertools::Itertools;
|
12
|
-
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
13
|
-
use parquet::data_type::Decimal;
|
14
|
-
use parquet::record::Field;
|
15
|
-
|
16
|
-
use crate::header_cache::StringCacheKey;
|
17
|
-
|
18
|
-
#[derive(Debug)]
|
19
|
-
pub enum RowRecord<S: BuildHasher + Default> {
|
20
|
-
Vec(Vec<ParquetField>),
|
21
|
-
Map(HashMap<StringCacheKey, ParquetField, S>),
|
22
|
-
}
|
23
|
-
|
24
|
-
#[derive(Debug)]
|
25
|
-
pub enum ColumnRecord<S: BuildHasher + Default> {
|
26
|
-
Vec(Vec<Vec<ParquetValue>>),
|
27
|
-
Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
|
28
|
-
}
|
29
|
-
|
30
|
-
impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
|
31
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
32
|
-
match self {
|
33
|
-
RowRecord::Vec(vec) => {
|
34
|
-
let ary = handle.ary_new_capa(vec.len());
|
35
|
-
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
36
|
-
handle.into_value(ary)
|
37
|
-
}
|
38
|
-
RowRecord::Map(map) => {
|
39
|
-
let hash = handle.hash_new_capa(map.len());
|
40
|
-
|
41
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
42
|
-
let mut i = 0;
|
43
|
-
|
44
|
-
for chunk in &map.into_iter().chunks(64) {
|
45
|
-
// Reduced to 64 to ensure space for pairs
|
46
|
-
for (k, v) in chunk {
|
47
|
-
if i + 1 >= values.len() {
|
48
|
-
// Bulk insert current batch if array is full
|
49
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
50
|
-
values[..i].fill(handle.qnil().as_value());
|
51
|
-
i = 0;
|
52
|
-
}
|
53
|
-
values[i] = handle.into_value(k);
|
54
|
-
values[i + 1] = handle.into_value(v);
|
55
|
-
i += 2;
|
56
|
-
}
|
57
|
-
// Insert any remaining pairs
|
58
|
-
if i > 0 {
|
59
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
60
|
-
values[..i].fill(handle.qnil().as_value());
|
61
|
-
i = 0;
|
62
|
-
}
|
63
|
-
}
|
64
|
-
|
65
|
-
hash.into_value_with(handle)
|
66
|
-
}
|
67
|
-
}
|
68
|
-
}
|
69
|
-
}
|
70
|
-
|
71
|
-
impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
|
72
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
73
|
-
match self {
|
74
|
-
ColumnRecord::Vec(vec) => {
|
75
|
-
let ary = handle.ary_new_capa(vec.len());
|
76
|
-
vec.into_iter()
|
77
|
-
.try_for_each(|v| {
|
78
|
-
let nested_ary = handle.ary_new_capa(v.len());
|
79
|
-
v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
|
80
|
-
ary.push(nested_ary.into_value_with(handle))
|
81
|
-
})
|
82
|
-
.unwrap();
|
83
|
-
ary.into_value_with(handle)
|
84
|
-
}
|
85
|
-
ColumnRecord::Map(map) => {
|
86
|
-
let hash = handle.hash_new_capa(map.len());
|
87
|
-
|
88
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
89
|
-
let mut i = 0;
|
90
|
-
|
91
|
-
for chunk in &map.into_iter().chunks(64) {
|
92
|
-
// Reduced to 64 to ensure space for pairs
|
93
|
-
for (k, v) in chunk {
|
94
|
-
if i + 1 >= values.len() {
|
95
|
-
// Bulk insert current batch if array is full
|
96
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
97
|
-
values[..i].fill(handle.qnil().as_value());
|
98
|
-
i = 0;
|
99
|
-
}
|
100
|
-
values[i] = handle.into_value(k);
|
101
|
-
let ary = handle.ary_new_capa(v.len());
|
102
|
-
v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
103
|
-
values[i + 1] = handle.into_value(ary);
|
104
|
-
i += 2;
|
105
|
-
}
|
106
|
-
// Insert any remaining pairs
|
107
|
-
if i > 0 {
|
108
|
-
hash.bulk_insert(&values[..i]).unwrap();
|
109
|
-
values[..i].fill(handle.qnil().as_value());
|
110
|
-
i = 0;
|
111
|
-
}
|
112
|
-
}
|
113
|
-
|
114
|
-
hash.into_value_with(handle)
|
115
|
-
}
|
116
|
-
}
|
117
|
-
}
|
118
|
-
}
|
3
|
+
use super::*;
|
119
4
|
|
120
5
|
#[derive(Debug, Clone)]
|
121
|
-
pub struct CowValue<'a>(pub Cow<'a, str>);
|
122
|
-
|
123
|
-
impl<'a> IntoValue for CowValue<'a> {
|
124
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
125
|
-
self.0.into_value_with(handle)
|
126
|
-
}
|
127
|
-
}
|
128
|
-
|
129
|
-
#[derive(Debug)]
|
130
|
-
pub struct ParquetField(pub Field);
|
131
|
-
|
132
|
-
impl IntoValue for ParquetField {
|
133
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
134
|
-
match self.0 {
|
135
|
-
Field::Null => handle.qnil().as_value(),
|
136
|
-
Field::Bool(b) => b.into_value_with(handle),
|
137
|
-
Field::Short(s) => s.into_value_with(handle),
|
138
|
-
Field::Int(i) => i.into_value_with(handle),
|
139
|
-
Field::Long(l) => l.into_value_with(handle),
|
140
|
-
Field::UByte(ub) => ub.into_value_with(handle),
|
141
|
-
Field::UShort(us) => us.into_value_with(handle),
|
142
|
-
Field::UInt(ui) => ui.into_value_with(handle),
|
143
|
-
Field::ULong(ul) => ul.into_value_with(handle),
|
144
|
-
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
145
|
-
Field::Float(f) => f.into_value_with(handle),
|
146
|
-
Field::Double(d) => d.into_value_with(handle),
|
147
|
-
Field::Str(s) => s.into_value_with(handle),
|
148
|
-
Field::Byte(b) => b.into_value_with(handle),
|
149
|
-
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
150
|
-
Field::Date(d) => {
|
151
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
152
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
153
|
-
formatted.into_value_with(handle)
|
154
|
-
}
|
155
|
-
Field::TimestampMillis(ts) => {
|
156
|
-
let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
|
157
|
-
let time_class = handle.class_time();
|
158
|
-
time_class
|
159
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
160
|
-
.unwrap()
|
161
|
-
.into_value_with(handle)
|
162
|
-
}
|
163
|
-
Field::TimestampMicros(ts) => {
|
164
|
-
let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
|
165
|
-
let time_class = handle.class_time();
|
166
|
-
time_class
|
167
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
168
|
-
.unwrap()
|
169
|
-
.into_value_with(handle)
|
170
|
-
}
|
171
|
-
Field::ListInternal(list) => {
|
172
|
-
let elements = list.elements();
|
173
|
-
let ary = handle.ary_new_capa(elements.len());
|
174
|
-
elements
|
175
|
-
.iter()
|
176
|
-
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
177
|
-
.unwrap();
|
178
|
-
ary.into_value_with(handle)
|
179
|
-
}
|
180
|
-
Field::MapInternal(map) => {
|
181
|
-
let entries = map.entries();
|
182
|
-
let hash = handle.hash_new_capa(entries.len());
|
183
|
-
entries
|
184
|
-
.iter()
|
185
|
-
.try_for_each(|(k, v)| {
|
186
|
-
hash.aset(
|
187
|
-
ParquetField(k.clone()).into_value_with(handle),
|
188
|
-
ParquetField(v.clone()).into_value_with(handle),
|
189
|
-
)
|
190
|
-
})
|
191
|
-
.unwrap();
|
192
|
-
hash.into_value_with(handle)
|
193
|
-
}
|
194
|
-
Field::Decimal(d) => {
|
195
|
-
let value = match d {
|
196
|
-
Decimal::Int32 { value, scale, .. } => {
|
197
|
-
let unscaled = i32::from_be_bytes(value);
|
198
|
-
format!("{}e-{}", unscaled, scale)
|
199
|
-
}
|
200
|
-
Decimal::Int64 { value, scale, .. } => {
|
201
|
-
let unscaled = i64::from_be_bytes(value);
|
202
|
-
format!("{}e-{}", unscaled, scale)
|
203
|
-
}
|
204
|
-
Decimal::Bytes { value, scale, .. } => {
|
205
|
-
// Convert bytes to string representation of unscaled value
|
206
|
-
let unscaled = String::from_utf8_lossy(value.data());
|
207
|
-
format!("{}e-{}", unscaled, scale)
|
208
|
-
}
|
209
|
-
};
|
210
|
-
handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
|
211
|
-
}
|
212
|
-
Field::Group(row) => {
|
213
|
-
let hash = handle.hash_new();
|
214
|
-
row.get_column_iter()
|
215
|
-
.try_for_each(|(k, v)| {
|
216
|
-
hash.aset(
|
217
|
-
k.clone().into_value_with(handle),
|
218
|
-
ParquetField(v.clone()).into_value_with(handle),
|
219
|
-
)
|
220
|
-
})
|
221
|
-
.unwrap();
|
222
|
-
hash.into_value_with(handle)
|
223
|
-
}
|
224
|
-
}
|
225
|
-
}
|
226
|
-
}
|
227
|
-
|
228
|
-
#[allow(dead_code)]
|
229
|
-
#[derive(Clone, Debug)]
|
230
6
|
pub enum ParquetValue {
|
231
7
|
Int8(i8),
|
232
8
|
Int16(i16),
|
@@ -285,6 +61,175 @@ impl PartialEq for ParquetValue {
|
|
285
61
|
|
286
62
|
// Marker impl: asserts that the hand-written `PartialEq` for `ParquetValue`
// is a full equivalence relation, which `Eq`-bounded APIs require.
// NOTE(review): the enum has float variants; confirm `PartialEq` is reflexive
// for NaN payloads, since that impl is not visible here.
impl Eq for ParquetValue {}
|
287
63
|
|
64
|
+
impl std::hash::Hash for ParquetValue {
|
65
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
66
|
+
match self {
|
67
|
+
ParquetValue::Int8(i) => i.hash(state),
|
68
|
+
ParquetValue::Int16(i) => i.hash(state),
|
69
|
+
ParquetValue::Int32(i) => i.hash(state),
|
70
|
+
ParquetValue::Int64(i) => i.hash(state),
|
71
|
+
ParquetValue::UInt8(i) => i.hash(state),
|
72
|
+
ParquetValue::UInt16(i) => i.hash(state),
|
73
|
+
ParquetValue::UInt32(i) => i.hash(state),
|
74
|
+
ParquetValue::UInt64(i) => i.hash(state),
|
75
|
+
ParquetValue::Float16(f) => f.to_bits().hash(state),
|
76
|
+
ParquetValue::Float32(f) => f.to_bits().hash(state),
|
77
|
+
ParquetValue::Float64(f) => f.to_bits().hash(state),
|
78
|
+
ParquetValue::Boolean(b) => b.hash(state),
|
79
|
+
ParquetValue::String(s) => s.hash(state),
|
80
|
+
ParquetValue::Bytes(b) => b.hash(state),
|
81
|
+
ParquetValue::Date32(d) => d.hash(state),
|
82
|
+
ParquetValue::Date64(d) => d.hash(state),
|
83
|
+
ParquetValue::TimestampSecond(ts, tz) => {
|
84
|
+
ts.hash(state);
|
85
|
+
tz.hash(state);
|
86
|
+
}
|
87
|
+
ParquetValue::TimestampMillis(ts, tz) => {
|
88
|
+
ts.hash(state);
|
89
|
+
tz.hash(state);
|
90
|
+
}
|
91
|
+
ParquetValue::TimestampMicros(ts, tz) => {
|
92
|
+
ts.hash(state);
|
93
|
+
tz.hash(state);
|
94
|
+
}
|
95
|
+
ParquetValue::TimestampNanos(ts, tz) => {
|
96
|
+
ts.hash(state);
|
97
|
+
tz.hash(state);
|
98
|
+
}
|
99
|
+
ParquetValue::List(l) => l.hash(state),
|
100
|
+
ParquetValue::Map(_m) => panic!("Map is not hashable"),
|
101
|
+
ParquetValue::Null => 0_i32.hash(state),
|
102
|
+
}
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
impl IntoValue for ParquetValue {
|
107
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
108
|
+
match self {
|
109
|
+
ParquetValue::Int8(i) => i.into_value_with(handle),
|
110
|
+
ParquetValue::Int16(i) => i.into_value_with(handle),
|
111
|
+
ParquetValue::Int32(i) => i.into_value_with(handle),
|
112
|
+
ParquetValue::Int64(i) => i.into_value_with(handle),
|
113
|
+
ParquetValue::UInt8(i) => i.into_value_with(handle),
|
114
|
+
ParquetValue::UInt16(i) => i.into_value_with(handle),
|
115
|
+
ParquetValue::UInt32(i) => i.into_value_with(handle),
|
116
|
+
ParquetValue::UInt64(i) => i.into_value_with(handle),
|
117
|
+
ParquetValue::Float16(f) => f.into_value_with(handle),
|
118
|
+
ParquetValue::Float32(f) => f.into_value_with(handle),
|
119
|
+
ParquetValue::Float64(f) => f.into_value_with(handle),
|
120
|
+
ParquetValue::Boolean(b) => b.into_value_with(handle),
|
121
|
+
ParquetValue::String(s) => s.into_value_with(handle),
|
122
|
+
ParquetValue::Bytes(b) => handle.str_from_slice(&b).as_value(),
|
123
|
+
ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
|
124
|
+
ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
|
125
|
+
timestamp @ ParquetValue::TimestampSecond(_, _) => {
|
126
|
+
impl_timestamp_conversion!(timestamp, TimestampSecond, handle)
|
127
|
+
}
|
128
|
+
timestamp @ ParquetValue::TimestampMillis(_, _) => {
|
129
|
+
impl_timestamp_conversion!(timestamp, TimestampMillis, handle)
|
130
|
+
}
|
131
|
+
timestamp @ ParquetValue::TimestampMicros(_, _) => {
|
132
|
+
impl_timestamp_conversion!(timestamp, TimestampMicros, handle)
|
133
|
+
}
|
134
|
+
timestamp @ ParquetValue::TimestampNanos(_, _) => {
|
135
|
+
impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
|
136
|
+
}
|
137
|
+
ParquetValue::List(l) => {
|
138
|
+
let ary = handle.ary_new_capa(l.len());
|
139
|
+
l.into_iter()
|
140
|
+
.try_for_each(|v| ary.push(v.into_value_with(handle)))
|
141
|
+
.unwrap();
|
142
|
+
ary.into_value_with(handle)
|
143
|
+
}
|
144
|
+
ParquetValue::Map(m) => {
|
145
|
+
let hash = handle.hash_new_capa(m.len());
|
146
|
+
m.into_iter()
|
147
|
+
.try_for_each(|(k, v)| {
|
148
|
+
hash.aset(k.into_value_with(handle), v.into_value_with(handle))
|
149
|
+
})
|
150
|
+
.unwrap();
|
151
|
+
hash.into_value_with(handle)
|
152
|
+
}
|
153
|
+
ParquetValue::Null => handle.qnil().as_value(),
|
154
|
+
}
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
impl ParquetValue {
|
159
|
+
pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
|
160
|
+
match type_ {
|
161
|
+
ParquetSchemaType::Int8 => {
|
162
|
+
let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
|
163
|
+
Ok(ParquetValue::Int8(v))
|
164
|
+
}
|
165
|
+
ParquetSchemaType::Int16 => {
|
166
|
+
let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
|
167
|
+
Ok(ParquetValue::Int16(v))
|
168
|
+
}
|
169
|
+
ParquetSchemaType::Int32 => {
|
170
|
+
let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
|
171
|
+
Ok(ParquetValue::Int32(v))
|
172
|
+
}
|
173
|
+
ParquetSchemaType::Int64 => {
|
174
|
+
let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
|
175
|
+
Ok(ParquetValue::Int64(v))
|
176
|
+
}
|
177
|
+
ParquetSchemaType::UInt8 => {
|
178
|
+
let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
|
179
|
+
Ok(ParquetValue::UInt8(v))
|
180
|
+
}
|
181
|
+
ParquetSchemaType::UInt16 => {
|
182
|
+
let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
|
183
|
+
Ok(ParquetValue::UInt16(v))
|
184
|
+
}
|
185
|
+
ParquetSchemaType::UInt32 => {
|
186
|
+
let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
|
187
|
+
Ok(ParquetValue::UInt32(v))
|
188
|
+
}
|
189
|
+
ParquetSchemaType::UInt64 => {
|
190
|
+
let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
|
191
|
+
Ok(ParquetValue::UInt64(v))
|
192
|
+
}
|
193
|
+
ParquetSchemaType::Float => {
|
194
|
+
let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
|
195
|
+
Ok(ParquetValue::Float32(v))
|
196
|
+
}
|
197
|
+
ParquetSchemaType::Double => {
|
198
|
+
let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
|
199
|
+
Ok(ParquetValue::Float64(v))
|
200
|
+
}
|
201
|
+
ParquetSchemaType::String => {
|
202
|
+
let v = String::try_convert(value)?;
|
203
|
+
Ok(ParquetValue::String(v))
|
204
|
+
}
|
205
|
+
ParquetSchemaType::Binary => {
|
206
|
+
let v = convert_to_binary(value)?;
|
207
|
+
Ok(ParquetValue::Bytes(v))
|
208
|
+
}
|
209
|
+
ParquetSchemaType::Boolean => {
|
210
|
+
let v = convert_to_boolean(value)?;
|
211
|
+
Ok(ParquetValue::Boolean(v))
|
212
|
+
}
|
213
|
+
ParquetSchemaType::Date32 => {
|
214
|
+
let v = convert_to_date32(value)?;
|
215
|
+
Ok(ParquetValue::Date32(v))
|
216
|
+
}
|
217
|
+
ParquetSchemaType::TimestampMillis => {
|
218
|
+
let v = convert_to_timestamp_millis(value)?;
|
219
|
+
Ok(ParquetValue::TimestampMillis(v, None))
|
220
|
+
}
|
221
|
+
ParquetSchemaType::TimestampMicros => {
|
222
|
+
let v = convert_to_timestamp_micros(value)?;
|
223
|
+
Ok(ParquetValue::TimestampMicros(v, None))
|
224
|
+
}
|
225
|
+
ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
|
226
|
+
magnus::exception::type_error(),
|
227
|
+
"Nested lists and maps are not supported",
|
228
|
+
)),
|
229
|
+
}
|
230
|
+
}
|
231
|
+
}
|
232
|
+
|
288
233
|
#[derive(Debug)]
|
289
234
|
/// Newtype around `Vec<ParquetValue>`; it is the target of the
/// `TryFrom<Arc<dyn Array>>` and `TryFrom<&dyn Array>` impls in this file.
pub struct ParquetValueVec(Vec<ParquetValue>);
|
290
235
|
|
@@ -319,7 +264,6 @@ impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
|
|
319
264
|
}
|
320
265
|
}
|
321
266
|
|
322
|
-
// Add macro for handling numeric array conversions
|
323
267
|
macro_rules! impl_numeric_array_conversion {
|
324
268
|
($column:expr, $array_type:ty, $variant:ident) => {{
|
325
269
|
let array = downcast_array::<$array_type>($column);
|
@@ -345,8 +289,6 @@ macro_rules! impl_numeric_array_conversion {
|
|
345
289
|
}
|
346
290
|
}};
|
347
291
|
}
|
348
|
-
|
349
|
-
// Add macro for handling boolean array conversions
|
350
292
|
macro_rules! impl_boolean_array_conversion {
|
351
293
|
($column:expr, $array_type:ty, $variant:ident) => {{
|
352
294
|
let array = downcast_array::<$array_type>($column);
|
@@ -373,33 +315,6 @@ macro_rules! impl_boolean_array_conversion {
|
|
373
315
|
}};
|
374
316
|
}
|
375
317
|
|
376
|
-
// Converts a timestamp column into a collection of `ParquetValue::$variant`
// entries, each paired with a clone of `$tz`. When the array reports itself
// as nullable, null slots are emitted as `ParquetValue::Null` instead.
macro_rules! impl_timestamp_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
        let array = downcast_array::<$array_type>($column);
        let raw = array.values();
        if !array.is_nullable() {
            // No null slots to check: map the raw values directly.
            raw.iter()
                .map(|ts| ParquetValue::$variant(*ts, $tz.clone()))
                .collect()
        } else {
            raw.iter()
                .enumerate()
                .map(|(idx, ts)| {
                    if !array.is_null(idx) {
                        ParquetValue::$variant(*ts, $tz.clone())
                    } else {
                        ParquetValue::Null
                    }
                })
                .collect()
        }
    }};
}
|
402
|
-
|
403
318
|
impl TryFrom<&dyn Array> for ParquetValueVec {
|
404
319
|
type Error = String;
|
405
320
|
|
@@ -445,7 +360,6 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
|
|
445
360
|
tz
|
446
361
|
)
|
447
362
|
}
|
448
|
-
// Because f16 is unstable in Rust, we convert it to f32
|
449
363
|
DataType::Float16 => {
|
450
364
|
let array = downcast_array::<Float16Array>(column);
|
451
365
|
if array.is_nullable() {
|
@@ -542,181 +456,3 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
|
|
542
456
|
Ok(ParquetValueVec(tmp_vec))
|
543
457
|
}
|
544
458
|
}
|
545
|
-
|
546
|
-
impl std::hash::Hash for ParquetValue {
|
547
|
-
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
548
|
-
match self {
|
549
|
-
ParquetValue::Int8(i) => i.hash(state),
|
550
|
-
ParquetValue::Int16(i) => i.hash(state),
|
551
|
-
ParquetValue::Int32(i) => i.hash(state),
|
552
|
-
ParquetValue::Int64(i) => i.hash(state),
|
553
|
-
ParquetValue::UInt8(i) => i.hash(state),
|
554
|
-
ParquetValue::UInt16(i) => i.hash(state),
|
555
|
-
ParquetValue::UInt32(i) => i.hash(state),
|
556
|
-
ParquetValue::UInt64(i) => i.hash(state),
|
557
|
-
ParquetValue::Float16(f) => f.to_bits().hash(state),
|
558
|
-
ParquetValue::Float32(f) => f.to_bits().hash(state),
|
559
|
-
ParquetValue::Float64(f) => f.to_bits().hash(state),
|
560
|
-
ParquetValue::Boolean(b) => b.hash(state),
|
561
|
-
ParquetValue::String(s) => s.hash(state),
|
562
|
-
ParquetValue::Bytes(b) => b.hash(state),
|
563
|
-
ParquetValue::Date32(d) => d.hash(state),
|
564
|
-
ParquetValue::Date64(d) => d.hash(state),
|
565
|
-
ParquetValue::TimestampSecond(ts, tz) => {
|
566
|
-
ts.hash(state);
|
567
|
-
tz.hash(state);
|
568
|
-
}
|
569
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
570
|
-
ts.hash(state);
|
571
|
-
tz.hash(state);
|
572
|
-
}
|
573
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
574
|
-
ts.hash(state);
|
575
|
-
tz.hash(state);
|
576
|
-
}
|
577
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
578
|
-
ts.hash(state);
|
579
|
-
tz.hash(state);
|
580
|
-
}
|
581
|
-
ParquetValue::List(l) => l.hash(state),
|
582
|
-
ParquetValue::Map(_m) => panic!("Map is not hashable"),
|
583
|
-
ParquetValue::Null => 0_i32.hash(state),
|
584
|
-
}
|
585
|
-
}
|
586
|
-
}
|
587
|
-
|
588
|
-
impl IntoValue for ParquetValue {
|
589
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
590
|
-
match self {
|
591
|
-
ParquetValue::Int8(i) => i.into_value_with(handle),
|
592
|
-
ParquetValue::Int16(i) => i.into_value_with(handle),
|
593
|
-
ParquetValue::Int32(i) => i.into_value_with(handle),
|
594
|
-
ParquetValue::Int64(i) => i.into_value_with(handle),
|
595
|
-
ParquetValue::UInt8(i) => i.into_value_with(handle),
|
596
|
-
ParquetValue::UInt16(i) => i.into_value_with(handle),
|
597
|
-
ParquetValue::UInt32(i) => i.into_value_with(handle),
|
598
|
-
ParquetValue::UInt64(i) => i.into_value_with(handle),
|
599
|
-
ParquetValue::Float16(f) => f.into_value_with(handle),
|
600
|
-
ParquetValue::Float32(f) => f.into_value_with(handle),
|
601
|
-
ParquetValue::Float64(f) => f.into_value_with(handle),
|
602
|
-
ParquetValue::Boolean(b) => b.into_value_with(handle),
|
603
|
-
ParquetValue::String(s) => s.into_value_with(handle),
|
604
|
-
ParquetValue::Bytes(b) => b.into_value_with(handle),
|
605
|
-
ParquetValue::Date32(d) => {
|
606
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
607
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
608
|
-
formatted.into_value_with(handle)
|
609
|
-
}
|
610
|
-
ParquetValue::Date64(d) => {
|
611
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
612
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
613
|
-
formatted.into_value_with(handle)
|
614
|
-
}
|
615
|
-
ParquetValue::TimestampSecond(ts, tz) => {
|
616
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
|
617
|
-
let time_class = handle.class_time();
|
618
|
-
time_class
|
619
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
620
|
-
.unwrap()
|
621
|
-
.into_value_with(handle)
|
622
|
-
}
|
623
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
624
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
|
625
|
-
let time_class = handle.class_time();
|
626
|
-
time_class
|
627
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
628
|
-
.unwrap()
|
629
|
-
.into_value_with(handle)
|
630
|
-
}
|
631
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
632
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
|
633
|
-
let time_class = handle.class_time();
|
634
|
-
time_class
|
635
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
636
|
-
.unwrap()
|
637
|
-
.into_value_with(handle)
|
638
|
-
}
|
639
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
640
|
-
let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
|
641
|
-
let time_class = handle.class_time();
|
642
|
-
time_class
|
643
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
644
|
-
.unwrap()
|
645
|
-
.into_value_with(handle)
|
646
|
-
}
|
647
|
-
ParquetValue::List(l) => {
|
648
|
-
let ary = handle.ary_new_capa(l.len());
|
649
|
-
l.into_iter()
|
650
|
-
.try_for_each(|v| ary.push(v.into_value_with(handle)))
|
651
|
-
.unwrap();
|
652
|
-
ary.into_value_with(handle)
|
653
|
-
}
|
654
|
-
ParquetValue::Map(m) => {
|
655
|
-
let hash = handle.hash_new_capa(m.len());
|
656
|
-
m.into_iter()
|
657
|
-
.try_for_each(|(k, v)| {
|
658
|
-
hash.aset(k.into_value_with(handle), v.into_value_with(handle))
|
659
|
-
})
|
660
|
-
.unwrap();
|
661
|
-
hash.into_value_with(handle)
|
662
|
-
}
|
663
|
-
ParquetValue::Null => handle.qnil().as_value(),
|
664
|
-
}
|
665
|
-
}
|
666
|
-
}
|
667
|
-
|
668
|
-
fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
|
669
|
-
let (ts, tz) = match value {
|
670
|
-
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
|
671
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
672
|
-
(jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
|
673
|
-
}
|
674
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
675
|
-
(jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
|
676
|
-
}
|
677
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
678
|
-
(jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
|
679
|
-
}
|
680
|
-
_ => panic!("Invalid timestamp value"),
|
681
|
-
};
|
682
|
-
|
683
|
-
// If timezone is provided, convert to zoned timestamp
|
684
|
-
if let Some(tz) = tz {
|
685
|
-
// Handle fixed offset timezones like "+09:00" first
|
686
|
-
if tz.starts_with('+') || tz.starts_with('-') {
|
687
|
-
// Parse the offset string into hours and minutes
|
688
|
-
let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
|
689
|
-
// Format: "+09:00" or "-09:00"
|
690
|
-
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
691
|
-
let m = tz[4..6].parse::<i32>().unwrap_or(0);
|
692
|
-
(h, m)
|
693
|
-
} else if tz.len() >= 3 {
|
694
|
-
// Format: "+09" or "-09"
|
695
|
-
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
696
|
-
(h, 0)
|
697
|
-
} else {
|
698
|
-
(0, 0)
|
699
|
-
};
|
700
|
-
|
701
|
-
// Apply sign
|
702
|
-
let total_minutes = if tz.starts_with('-') {
|
703
|
-
-(hours * 60 + minutes)
|
704
|
-
} else {
|
705
|
-
hours * 60 + minutes
|
706
|
-
};
|
707
|
-
|
708
|
-
// Create fixed timezone
|
709
|
-
let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
|
710
|
-
ts.to_zoned(tz).timestamp()
|
711
|
-
} else {
|
712
|
-
// Try IANA timezone
|
713
|
-
match ts.intz(&tz) {
|
714
|
-
Ok(zoned) => zoned.timestamp(),
|
715
|
-
Err(_) => ts, // Fall back to UTC if timezone is invalid
|
716
|
-
}
|
717
|
-
}
|
718
|
-
} else {
|
719
|
-
// No timezone provided - treat as UTC
|
720
|
-
ts
|
721
|
-
}
|
722
|
-
}
|