parquet 0.0.4 → 0.2.5
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
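The file list above shows the old single-file `types.rs` being split into a `types/` module tree (alongside new `reader/` and `writer/` modules). The sketch below illustrates what the new `data/ext/parquet/src/types/mod.rs` plausibly looks like; it is an assumption inferred from the file list and from the `use super::*;` import that appears in `parquet_value.rs`, not the actual contents of the 30-line file, so names and visibility may differ.

```rust
// Hypothetical layout of data/ext/parquet/src/types/mod.rs (an assumption
// inferred from the file list above; the real file may differ).
mod core_types;
mod parquet_value;
mod record_types;
mod timestamp;
mod type_conversion;
mod writer_types;

// Re-export everything so submodules (and the rest of the crate) can keep
// using a flat namespace via `use super::*;` or `use crate::types::*;`.
pub use core_types::*;
pub use parquet_value::*;
pub use record_types::*;
pub use timestamp::*;
pub use type_conversion::*;
pub use writer_types::*;
```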
data/ext/parquet/src/{types.rs → types/parquet_value.rs}:

```diff
@@ -1,232 +1,8 @@
-use
+use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
 
-use
-use arrow_array::{
-    Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
-    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
-    StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
-    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
-};
-use arrow_schema::{DataType, TimeUnit};
-use itertools::Itertools;
-use magnus::{value::ReprValue, IntoValue, Ruby, Value};
-use parquet::data_type::Decimal;
-use parquet::record::Field;
-
-use crate::header_cache::StringCacheKey;
-
-#[derive(Debug)]
-pub enum RowRecord<S: BuildHasher + Default> {
-    Vec(Vec<ParquetField>),
-    Map(HashMap<StringCacheKey, ParquetField, S>),
-}
-
-#[derive(Debug)]
-pub enum ColumnRecord<S: BuildHasher + Default> {
-    Vec(Vec<Vec<ParquetValue>>),
-    Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
-}
-
-impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        match self {
-            RowRecord::Vec(vec) => {
-                let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
-                handle.into_value(ary)
-            }
-            RowRecord::Map(map) => {
-                let hash = handle.hash_new_capa(map.len());
-
-                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
-                let mut i = 0;
-
-                for chunk in &map.into_iter().chunks(64) {
-                    // Reduced to 64 to ensure space for pairs
-                    for (k, v) in chunk {
-                        if i + 1 >= values.len() {
-                            // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
-                            values[..i].fill(handle.qnil().as_value());
-                            i = 0;
-                        }
-                        values[i] = handle.into_value(k);
-                        values[i + 1] = handle.into_value(v);
-                        i += 2;
-                    }
-                    // Insert any remaining pairs
-                    if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
-                        values[..i].fill(handle.qnil().as_value());
-                        i = 0;
-                    }
-                }
-
-                hash.into_value_with(handle)
-            }
-        }
-    }
-}
-
-impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        match self {
-            ColumnRecord::Vec(vec) => {
-                let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .try_for_each(|v| {
-                        let nested_ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
-                        ary.push(nested_ary.into_value_with(handle))
-                    })
-                    .unwrap();
-                ary.into_value_with(handle)
-            }
-            ColumnRecord::Map(map) => {
-                let hash = handle.hash_new_capa(map.len());
-
-                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
-                let mut i = 0;
-
-                for chunk in &map.into_iter().chunks(64) {
-                    // Reduced to 64 to ensure space for pairs
-                    for (k, v) in chunk {
-                        if i + 1 >= values.len() {
-                            // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
-                            values[..i].fill(handle.qnil().as_value());
-                            i = 0;
-                        }
-                        values[i] = handle.into_value(k);
-                        let ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
-                        values[i + 1] = handle.into_value(ary);
-                        i += 2;
-                    }
-                    // Insert any remaining pairs
-                    if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
-                        values[..i].fill(handle.qnil().as_value());
-                        i = 0;
-                    }
-                }
-
-                hash.into_value_with(handle)
-            }
-        }
-    }
-}
+use super::*;
 
 #[derive(Debug, Clone)]
-pub struct CowValue<'a>(pub Cow<'a, str>);
-
-impl<'a> IntoValue for CowValue<'a> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        self.0.into_value_with(handle)
-    }
-}
-
-#[derive(Debug)]
-pub struct ParquetField(pub Field);
-
-impl IntoValue for ParquetField {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        match self.0 {
-            Field::Null => handle.qnil().as_value(),
-            Field::Bool(b) => b.into_value_with(handle),
-            Field::Short(s) => s.into_value_with(handle),
-            Field::Int(i) => i.into_value_with(handle),
-            Field::Long(l) => l.into_value_with(handle),
-            Field::UByte(ub) => ub.into_value_with(handle),
-            Field::UShort(us) => us.into_value_with(handle),
-            Field::UInt(ui) => ui.into_value_with(handle),
-            Field::ULong(ul) => ul.into_value_with(handle),
-            Field::Float16(f) => f32::from(f).into_value_with(handle),
-            Field::Float(f) => f.into_value_with(handle),
-            Field::Double(d) => d.into_value_with(handle),
-            Field::Str(s) => s.into_value_with(handle),
-            Field::Byte(b) => b.into_value_with(handle),
-            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
-            Field::Date(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
-                let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
-            }
-            Field::TimestampMillis(ts) => {
-                let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            Field::TimestampMicros(ts) => {
-                let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            Field::ListInternal(list) => {
-                let elements = list.elements();
-                let ary = handle.ary_new_capa(elements.len());
-                elements
-                    .iter()
-                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
-                    .unwrap();
-                ary.into_value_with(handle)
-            }
-            Field::MapInternal(map) => {
-                let entries = map.entries();
-                let hash = handle.hash_new_capa(entries.len());
-                entries
-                    .iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            ParquetField(k.clone()).into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
-            }
-            Field::Decimal(d) => {
-                let value = match d {
-                    Decimal::Int32 { value, scale, .. } => {
-                        let unscaled = i32::from_be_bytes(value);
-                        format!("{}e-{}", unscaled, scale)
-                    }
-                    Decimal::Int64 { value, scale, .. } => {
-                        let unscaled = i64::from_be_bytes(value);
-                        format!("{}e-{}", unscaled, scale)
-                    }
-                    Decimal::Bytes { value, scale, .. } => {
-                        // Convert bytes to string representation of unscaled value
-                        let unscaled = String::from_utf8_lossy(value.data());
-                        format!("{}e-{}", unscaled, scale)
-                    }
-                };
-                handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
-            }
-            Field::Group(row) => {
-                let hash = handle.hash_new();
-                row.get_column_iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            k.clone().into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
-            }
-        }
-    }
-}
-
-#[allow(dead_code)]
-#[derive(Clone, Debug)]
 pub enum ParquetValue {
     Int8(i8),
     Int16(i16),
@@ -285,6 +61,175 @@ impl PartialEq for ParquetValue {
 
 impl Eq for ParquetValue {}
 
+impl std::hash::Hash for ParquetValue {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        match self {
+            ParquetValue::Int8(i) => i.hash(state),
+            ParquetValue::Int16(i) => i.hash(state),
+            ParquetValue::Int32(i) => i.hash(state),
+            ParquetValue::Int64(i) => i.hash(state),
+            ParquetValue::UInt8(i) => i.hash(state),
+            ParquetValue::UInt16(i) => i.hash(state),
+            ParquetValue::UInt32(i) => i.hash(state),
+            ParquetValue::UInt64(i) => i.hash(state),
+            ParquetValue::Float16(f) => f.to_bits().hash(state),
+            ParquetValue::Float32(f) => f.to_bits().hash(state),
+            ParquetValue::Float64(f) => f.to_bits().hash(state),
+            ParquetValue::Boolean(b) => b.hash(state),
+            ParquetValue::String(s) => s.hash(state),
+            ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Date32(d) => d.hash(state),
+            ParquetValue::Date64(d) => d.hash(state),
+            ParquetValue::TimestampSecond(ts, tz) => {
+                ts.hash(state);
+                tz.hash(state);
+            }
+            ParquetValue::TimestampMillis(ts, tz) => {
+                ts.hash(state);
+                tz.hash(state);
+            }
+            ParquetValue::TimestampMicros(ts, tz) => {
+                ts.hash(state);
+                tz.hash(state);
+            }
+            ParquetValue::TimestampNanos(ts, tz) => {
+                ts.hash(state);
+                tz.hash(state);
+            }
+            ParquetValue::List(l) => l.hash(state),
+            ParquetValue::Map(_m) => panic!("Map is not hashable"),
+            ParquetValue::Null => 0_i32.hash(state),
+        }
+    }
+}
+
+impl IntoValue for ParquetValue {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self {
+            ParquetValue::Int8(i) => i.into_value_with(handle),
+            ParquetValue::Int16(i) => i.into_value_with(handle),
+            ParquetValue::Int32(i) => i.into_value_with(handle),
+            ParquetValue::Int64(i) => i.into_value_with(handle),
+            ParquetValue::UInt8(i) => i.into_value_with(handle),
+            ParquetValue::UInt16(i) => i.into_value_with(handle),
+            ParquetValue::UInt32(i) => i.into_value_with(handle),
+            ParquetValue::UInt64(i) => i.into_value_with(handle),
+            ParquetValue::Float16(f) => f.into_value_with(handle),
+            ParquetValue::Float32(f) => f.into_value_with(handle),
+            ParquetValue::Float64(f) => f.into_value_with(handle),
+            ParquetValue::Boolean(b) => b.into_value_with(handle),
+            ParquetValue::String(s) => s.into_value_with(handle),
+            ParquetValue::Bytes(b) => handle.str_from_slice(&b).as_value(),
+            ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
+            ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
+            timestamp @ ParquetValue::TimestampSecond(_, _) => {
+                impl_timestamp_conversion!(timestamp, TimestampSecond, handle)
+            }
+            timestamp @ ParquetValue::TimestampMillis(_, _) => {
+                impl_timestamp_conversion!(timestamp, TimestampMillis, handle)
+            }
+            timestamp @ ParquetValue::TimestampMicros(_, _) => {
+                impl_timestamp_conversion!(timestamp, TimestampMicros, handle)
+            }
+            timestamp @ ParquetValue::TimestampNanos(_, _) => {
+                impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
+            }
+            ParquetValue::List(l) => {
+                let ary = handle.ary_new_capa(l.len());
+                l.into_iter()
+                    .try_for_each(|v| ary.push(v.into_value_with(handle)))
+                    .unwrap();
+                ary.into_value_with(handle)
+            }
+            ParquetValue::Map(m) => {
+                let hash = handle.hash_new_capa(m.len());
+                m.into_iter()
+                    .try_for_each(|(k, v)| {
+                        hash.aset(k.into_value_with(handle), v.into_value_with(handle))
+                    })
+                    .unwrap();
+                hash.into_value_with(handle)
+            }
+            ParquetValue::Null => handle.qnil().as_value(),
+        }
+    }
+}
+
+impl ParquetValue {
+    pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
+        match type_ {
+            ParquetSchemaType::Int8 => {
+                let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Int8(v))
+            }
+            ParquetSchemaType::Int16 => {
+                let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Int16(v))
+            }
+            ParquetSchemaType::Int32 => {
+                let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Int32(v))
+            }
+            ParquetSchemaType::Int64 => {
+                let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Int64(v))
+            }
+            ParquetSchemaType::UInt8 => {
+                let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::UInt8(v))
+            }
+            ParquetSchemaType::UInt16 => {
+                let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::UInt16(v))
+            }
+            ParquetSchemaType::UInt32 => {
+                let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::UInt32(v))
+            }
+            ParquetSchemaType::UInt64 => {
+                let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::UInt64(v))
+            }
+            ParquetSchemaType::Float => {
+                let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Float32(v))
+            }
+            ParquetSchemaType::Double => {
+                let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
+                Ok(ParquetValue::Float64(v))
+            }
+            ParquetSchemaType::String => {
+                let v = String::try_convert(value)?;
+                Ok(ParquetValue::String(v))
+            }
+            ParquetSchemaType::Binary => {
+                let v = convert_to_binary(value)?;
+                Ok(ParquetValue::Bytes(v))
+            }
+            ParquetSchemaType::Boolean => {
+                let v = convert_to_boolean(value)?;
+                Ok(ParquetValue::Boolean(v))
+            }
+            ParquetSchemaType::Date32 => {
+                let v = convert_to_date32(value)?;
+                Ok(ParquetValue::Date32(v))
+            }
+            ParquetSchemaType::TimestampMillis => {
+                let v = convert_to_timestamp_millis(value)?;
+                Ok(ParquetValue::TimestampMillis(v, None))
+            }
+            ParquetSchemaType::TimestampMicros => {
+                let v = convert_to_timestamp_micros(value)?;
+                Ok(ParquetValue::TimestampMicros(v, None))
+            }
+            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
+                magnus::exception::type_error(),
+                "Nested lists and maps are not supported",
+            )),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub struct ParquetValueVec(Vec<ParquetValue>);
 
@@ -319,7 +264,6 @@ impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
     }
 }
 
-// Add macro for handling numeric array conversions
 macro_rules! impl_numeric_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident) => {{
         let array = downcast_array::<$array_type>($column);
@@ -345,8 +289,6 @@ macro_rules! impl_numeric_array_conversion {
         }
     }};
 }
-
-// Add macro for handling boolean array conversions
 macro_rules! impl_boolean_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident) => {{
         let array = downcast_array::<$array_type>($column);
@@ -373,33 +315,6 @@ macro_rules! impl_boolean_array_conversion {
     }};
 }
 
-// Add macro for handling timestamp array conversions
-macro_rules! impl_timestamp_array_conversion {
-    ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
-        let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
-            array
-                .values()
-                .iter()
-                .enumerate()
-                .map(|(i, x)| {
-                    if array.is_null(i) {
-                        ParquetValue::Null
-                    } else {
-                        ParquetValue::$variant(*x, $tz.clone())
-                    }
-                })
-                .collect()
-        } else {
-            array
-                .values()
-                .iter()
-                .map(|x| ParquetValue::$variant(*x, $tz.clone()))
-                .collect()
-        }
-    }};
-}
-
 impl TryFrom<&dyn Array> for ParquetValueVec {
     type Error = String;
 
@@ -445,7 +360,6 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
                     tz
                 )
             }
-            // Because f16 is unstable in Rust, we convert it to f32
            DataType::Float16 => {
                 let array = downcast_array::<Float16Array>(column);
                 if array.is_nullable() {
@@ -542,181 +456,3 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
         Ok(ParquetValueVec(tmp_vec))
     }
 }
-
-impl std::hash::Hash for ParquetValue {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        match self {
-            ParquetValue::Int8(i) => i.hash(state),
-            ParquetValue::Int16(i) => i.hash(state),
-            ParquetValue::Int32(i) => i.hash(state),
-            ParquetValue::Int64(i) => i.hash(state),
-            ParquetValue::UInt8(i) => i.hash(state),
-            ParquetValue::UInt16(i) => i.hash(state),
-            ParquetValue::UInt32(i) => i.hash(state),
-            ParquetValue::UInt64(i) => i.hash(state),
-            ParquetValue::Float16(f) => f.to_bits().hash(state),
-            ParquetValue::Float32(f) => f.to_bits().hash(state),
-            ParquetValue::Float64(f) => f.to_bits().hash(state),
-            ParquetValue::Boolean(b) => b.hash(state),
-            ParquetValue::String(s) => s.hash(state),
-            ParquetValue::Bytes(b) => b.hash(state),
-            ParquetValue::Date32(d) => d.hash(state),
-            ParquetValue::Date64(d) => d.hash(state),
-            ParquetValue::TimestampSecond(ts, tz) => {
-                ts.hash(state);
-                tz.hash(state);
-            }
-            ParquetValue::TimestampMillis(ts, tz) => {
-                ts.hash(state);
-                tz.hash(state);
-            }
-            ParquetValue::TimestampMicros(ts, tz) => {
-                ts.hash(state);
-                tz.hash(state);
-            }
-            ParquetValue::TimestampNanos(ts, tz) => {
-                ts.hash(state);
-                tz.hash(state);
-            }
-            ParquetValue::List(l) => l.hash(state),
-            ParquetValue::Map(_m) => panic!("Map is not hashable"),
-            ParquetValue::Null => 0_i32.hash(state),
-        }
-    }
-}
-
-impl IntoValue for ParquetValue {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        match self {
-            ParquetValue::Int8(i) => i.into_value_with(handle),
-            ParquetValue::Int16(i) => i.into_value_with(handle),
-            ParquetValue::Int32(i) => i.into_value_with(handle),
-            ParquetValue::Int64(i) => i.into_value_with(handle),
-            ParquetValue::UInt8(i) => i.into_value_with(handle),
-            ParquetValue::UInt16(i) => i.into_value_with(handle),
-            ParquetValue::UInt32(i) => i.into_value_with(handle),
-            ParquetValue::UInt64(i) => i.into_value_with(handle),
-            ParquetValue::Float16(f) => f.into_value_with(handle),
-            ParquetValue::Float32(f) => f.into_value_with(handle),
-            ParquetValue::Float64(f) => f.into_value_with(handle),
-            ParquetValue::Boolean(b) => b.into_value_with(handle),
-            ParquetValue::String(s) => s.into_value_with(handle),
-            ParquetValue::Bytes(b) => b.into_value_with(handle),
-            ParquetValue::Date32(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
-                let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
-            }
-            ParquetValue::Date64(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
-                let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
-            }
-            ParquetValue::TimestampSecond(ts, tz) => {
-                let ts = parse_zoned_timestamp(&ParquetValue::TimestampSecond(ts, tz));
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            ParquetValue::TimestampMillis(ts, tz) => {
-                let ts = parse_zoned_timestamp(&ParquetValue::TimestampMillis(ts, tz));
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            ParquetValue::TimestampMicros(ts, tz) => {
-                let ts = parse_zoned_timestamp(&ParquetValue::TimestampMicros(ts, tz));
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            ParquetValue::TimestampNanos(ts, tz) => {
-                let ts = parse_zoned_timestamp(&ParquetValue::TimestampNanos(ts, tz));
-                let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
-            }
-            ParquetValue::List(l) => {
-                let ary = handle.ary_new_capa(l.len());
-                l.into_iter()
-                    .try_for_each(|v| ary.push(v.into_value_with(handle)))
-                    .unwrap();
-                ary.into_value_with(handle)
-            }
-            ParquetValue::Map(m) => {
-                let hash = handle.hash_new_capa(m.len());
-                m.into_iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(k.into_value_with(handle), v.into_value_with(handle))
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
-            }
-            ParquetValue::Null => handle.qnil().as_value(),
-        }
-    }
-}
-
-fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
-    let (ts, tz) = match value {
-        ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
-        ParquetValue::TimestampMillis(ts, tz) => {
-            (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
-        }
-        ParquetValue::TimestampMicros(ts, tz) => {
-            (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
-        }
-        ParquetValue::TimestampNanos(ts, tz) => {
-            (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
-        }
-        _ => panic!("Invalid timestamp value"),
-    };
-
-    // If timezone is provided, convert to zoned timestamp
-    if let Some(tz) = tz {
-        // Handle fixed offset timezones like "+09:00" first
-        if tz.starts_with('+') || tz.starts_with('-') {
-            // Parse the offset string into hours and minutes
-            let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
-                // Format: "+09:00" or "-09:00"
-                let h = tz[1..3].parse::<i32>().unwrap_or(0);
-                let m = tz[4..6].parse::<i32>().unwrap_or(0);
-                (h, m)
-            } else if tz.len() >= 3 {
-                // Format: "+09" or "-09"
-                let h = tz[1..3].parse::<i32>().unwrap_or(0);
-                (h, 0)
-            } else {
-                (0, 0)
-            };
-
-            // Apply sign
-            let total_minutes = if tz.starts_with('-') {
-                -(hours * 60 + minutes)
-            } else {
-                hours * 60 + minutes
-            };
-
-            // Create fixed timezone
-            let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
-            ts.to_zoned(tz).timestamp()
-        } else {
-            // Try IANA timezone
-            match ts.intz(&tz) {
-                Ok(zoned) => zoned.timestamp(),
-                Err(_) => ts, // Fall back to UTC if timezone is invalid
-            }
-        }
-    } else {
-        // No timezone provided - treat as UTC
-        ts
-    }
-}
```