parquet 0.0.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
@@ -0,0 +1,204 @@
|
|
1
|
+
use itertools::Itertools;
|
2
|
+
|
3
|
+
use super::*;
|
4
|
+
|
5
|
+
#[derive(Debug)]
|
6
|
+
pub enum RowRecord<S: BuildHasher + Default> {
|
7
|
+
Vec(Vec<ParquetField>),
|
8
|
+
Map(HashMap<StringCacheKey, ParquetField, S>),
|
9
|
+
}
|
10
|
+
|
11
|
+
#[derive(Debug)]
|
12
|
+
pub enum ColumnRecord<S: BuildHasher + Default> {
|
13
|
+
Vec(Vec<Vec<ParquetValue>>),
|
14
|
+
Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
|
15
|
+
}
|
16
|
+
|
17
|
+
#[derive(Debug)]
|
18
|
+
pub struct ParquetField(pub Field);
|
19
|
+
|
20
|
+
impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
|
21
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
22
|
+
match self {
|
23
|
+
RowRecord::Vec(vec) => {
|
24
|
+
let ary = handle.ary_new_capa(vec.len());
|
25
|
+
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
26
|
+
handle.into_value(ary)
|
27
|
+
}
|
28
|
+
RowRecord::Map(map) => {
|
29
|
+
let hash = handle.hash_new_capa(map.len());
|
30
|
+
|
31
|
+
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
32
|
+
let mut i = 0;
|
33
|
+
|
34
|
+
for chunk in &map.into_iter().chunks(64) {
|
35
|
+
// Reduced to 64 to ensure space for pairs
|
36
|
+
for (k, v) in chunk {
|
37
|
+
if i + 1 >= values.len() {
|
38
|
+
// Bulk insert current batch if array is full
|
39
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
40
|
+
values[..i].fill(handle.qnil().as_value());
|
41
|
+
i = 0;
|
42
|
+
}
|
43
|
+
values[i] = handle.into_value(k);
|
44
|
+
values[i + 1] = handle.into_value(v);
|
45
|
+
i += 2;
|
46
|
+
}
|
47
|
+
// Insert any remaining pairs
|
48
|
+
if i > 0 {
|
49
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
50
|
+
values[..i].fill(handle.qnil().as_value());
|
51
|
+
i = 0;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
hash.into_value_with(handle)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
|
62
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
63
|
+
match self {
|
64
|
+
ColumnRecord::Vec(vec) => {
|
65
|
+
let ary = handle.ary_new_capa(vec.len());
|
66
|
+
vec.into_iter()
|
67
|
+
.try_for_each(|v| {
|
68
|
+
let nested_ary = handle.ary_new_capa(v.len());
|
69
|
+
v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
|
70
|
+
ary.push(nested_ary.into_value_with(handle))
|
71
|
+
})
|
72
|
+
.unwrap();
|
73
|
+
ary.into_value_with(handle)
|
74
|
+
}
|
75
|
+
ColumnRecord::Map(map) => {
|
76
|
+
let hash = handle.hash_new_capa(map.len());
|
77
|
+
|
78
|
+
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
79
|
+
let mut i = 0;
|
80
|
+
|
81
|
+
for chunk in &map.into_iter().chunks(64) {
|
82
|
+
// Reduced to 64 to ensure space for pairs
|
83
|
+
for (k, v) in chunk {
|
84
|
+
if i + 1 >= values.len() {
|
85
|
+
// Bulk insert current batch if array is full
|
86
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
87
|
+
values[..i].fill(handle.qnil().as_value());
|
88
|
+
i = 0;
|
89
|
+
}
|
90
|
+
values[i] = handle.into_value(k);
|
91
|
+
let ary = handle.ary_new_capa(v.len());
|
92
|
+
v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
93
|
+
values[i + 1] = handle.into_value(ary);
|
94
|
+
i += 2;
|
95
|
+
}
|
96
|
+
// Insert any remaining pairs
|
97
|
+
if i > 0 {
|
98
|
+
hash.bulk_insert(&values[..i]).unwrap();
|
99
|
+
values[..i].fill(handle.qnil().as_value());
|
100
|
+
i = 0;
|
101
|
+
}
|
102
|
+
}
|
103
|
+
|
104
|
+
hash.into_value_with(handle)
|
105
|
+
}
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
impl IntoValue for ParquetField {
|
111
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
112
|
+
match self.0 {
|
113
|
+
Field::Null => handle.qnil().as_value(),
|
114
|
+
Field::Bool(b) => b.into_value_with(handle),
|
115
|
+
Field::Short(s) => s.into_value_with(handle),
|
116
|
+
Field::Int(i) => i.into_value_with(handle),
|
117
|
+
Field::Long(l) => l.into_value_with(handle),
|
118
|
+
Field::UByte(ub) => ub.into_value_with(handle),
|
119
|
+
Field::UShort(us) => us.into_value_with(handle),
|
120
|
+
Field::UInt(ui) => ui.into_value_with(handle),
|
121
|
+
Field::ULong(ul) => ul.into_value_with(handle),
|
122
|
+
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
123
|
+
Field::Float(f) => f.into_value_with(handle),
|
124
|
+
Field::Double(d) => d.into_value_with(handle),
|
125
|
+
Field::Str(s) => s.into_value_with(handle),
|
126
|
+
Field::Byte(b) => b.into_value_with(handle),
|
127
|
+
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
128
|
+
Field::Date(d) => {
|
129
|
+
let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
|
130
|
+
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
131
|
+
formatted.into_value_with(handle)
|
132
|
+
}
|
133
|
+
Field::TimestampMillis(ts) => {
|
134
|
+
let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
|
135
|
+
let time_class = handle.class_time();
|
136
|
+
time_class
|
137
|
+
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
138
|
+
.unwrap()
|
139
|
+
.into_value_with(handle)
|
140
|
+
}
|
141
|
+
Field::TimestampMicros(ts) => {
|
142
|
+
let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
|
143
|
+
let time_class = handle.class_time();
|
144
|
+
time_class
|
145
|
+
.funcall::<_, _, Value>("parse", (ts.to_string(),))
|
146
|
+
.unwrap()
|
147
|
+
.into_value_with(handle)
|
148
|
+
}
|
149
|
+
Field::ListInternal(list) => {
|
150
|
+
let elements = list.elements();
|
151
|
+
let ary = handle.ary_new_capa(elements.len());
|
152
|
+
elements
|
153
|
+
.iter()
|
154
|
+
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
155
|
+
.unwrap();
|
156
|
+
ary.into_value_with(handle)
|
157
|
+
}
|
158
|
+
Field::MapInternal(map) => {
|
159
|
+
let entries = map.entries();
|
160
|
+
let hash = handle.hash_new_capa(entries.len());
|
161
|
+
entries
|
162
|
+
.iter()
|
163
|
+
.try_for_each(|(k, v)| {
|
164
|
+
hash.aset(
|
165
|
+
ParquetField(k.clone()).into_value_with(handle),
|
166
|
+
ParquetField(v.clone()).into_value_with(handle),
|
167
|
+
)
|
168
|
+
})
|
169
|
+
.unwrap();
|
170
|
+
hash.into_value_with(handle)
|
171
|
+
}
|
172
|
+
Field::Decimal(d) => {
|
173
|
+
let value = match d {
|
174
|
+
Decimal::Int32 { value, scale, .. } => {
|
175
|
+
let unscaled = i32::from_be_bytes(value);
|
176
|
+
format!("{}e-{}", unscaled, scale)
|
177
|
+
}
|
178
|
+
Decimal::Int64 { value, scale, .. } => {
|
179
|
+
let unscaled = i64::from_be_bytes(value);
|
180
|
+
format!("{}e-{}", unscaled, scale)
|
181
|
+
}
|
182
|
+
Decimal::Bytes { value, scale, .. } => {
|
183
|
+
// Convert bytes to string representation of unscaled value
|
184
|
+
let unscaled = String::from_utf8_lossy(value.data());
|
185
|
+
format!("{}e-{}", unscaled, scale)
|
186
|
+
}
|
187
|
+
};
|
188
|
+
handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
|
189
|
+
}
|
190
|
+
Field::Group(row) => {
|
191
|
+
let hash = handle.hash_new();
|
192
|
+
row.get_column_iter()
|
193
|
+
.try_for_each(|(k, v)| {
|
194
|
+
hash.aset(
|
195
|
+
k.clone().into_value_with(handle),
|
196
|
+
ParquetField(v.clone()).into_value_with(handle),
|
197
|
+
)
|
198
|
+
})
|
199
|
+
.unwrap();
|
200
|
+
hash.into_value_with(handle)
|
201
|
+
}
|
202
|
+
}
|
203
|
+
}
|
204
|
+
}
|
@@ -0,0 +1,85 @@
|
|
1
|
+
use super::*;
|
2
|
+
|
3
|
+
pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
|
4
|
+
let (ts, tz) = match value {
|
5
|
+
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
|
6
|
+
ParquetValue::TimestampMillis(ts, tz) => {
|
7
|
+
(jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
|
8
|
+
}
|
9
|
+
ParquetValue::TimestampMicros(ts, tz) => {
|
10
|
+
(jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
|
11
|
+
}
|
12
|
+
ParquetValue::TimestampNanos(ts, tz) => {
|
13
|
+
(jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
|
14
|
+
}
|
15
|
+
_ => panic!("Invalid timestamp value"),
|
16
|
+
};
|
17
|
+
|
18
|
+
// If timezone is provided, convert to zoned timestamp
|
19
|
+
if let Some(tz) = tz {
|
20
|
+
// Handle fixed offset timezones like "+09:00" first
|
21
|
+
if tz.starts_with('+') || tz.starts_with('-') {
|
22
|
+
// Parse the offset string into hours and minutes
|
23
|
+
let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
|
24
|
+
// Format: "+09:00" or "-09:00"
|
25
|
+
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
26
|
+
let m = tz[4..6].parse::<i32>().unwrap_or(0);
|
27
|
+
(h, m)
|
28
|
+
} else if tz.len() >= 3 {
|
29
|
+
// Format: "+09" or "-09"
|
30
|
+
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
31
|
+
(h, 0)
|
32
|
+
} else {
|
33
|
+
(0, 0)
|
34
|
+
};
|
35
|
+
|
36
|
+
// Apply sign
|
37
|
+
let total_minutes = if tz.starts_with('-') {
|
38
|
+
-(hours * 60 + minutes)
|
39
|
+
} else {
|
40
|
+
hours * 60 + minutes
|
41
|
+
};
|
42
|
+
|
43
|
+
// Create fixed timezone
|
44
|
+
let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
|
45
|
+
ts.to_zoned(tz).timestamp()
|
46
|
+
} else {
|
47
|
+
// Try IANA timezone
|
48
|
+
match ts.intz(&tz) {
|
49
|
+
Ok(zoned) => zoned.timestamp(),
|
50
|
+
Err(_) => ts, // Fall back to UTC if timezone is invalid
|
51
|
+
}
|
52
|
+
}
|
53
|
+
} else {
|
54
|
+
// No timezone provided - treat as UTC
|
55
|
+
ts
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
/// Converts a `ParquetValue::$unit(ts, tz)` into a Ruby `Time` by
/// normalizing through `parse_zoned_timestamp` and letting Ruby's
/// `Time.parse` construct the object. Panics on any other variant.
#[macro_export]
macro_rules! impl_timestamp_conversion {
    ($value:expr, $unit:ident, $handle:expr) => {{
        match $value {
            ParquetValue::$unit(ts, tz) => {
                // Rebuild the variant so the shared parser can destructure it.
                let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
                let time_class = $handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with($handle)
            }
            _ => panic!("Invalid timestamp type"),
        }
    }};
}
|
76
|
+
|
77
|
+
/// Converts a parquet date (days since the Unix epoch) into a Ruby
/// "YYYY-MM-DD" string value.
#[macro_export]
macro_rules! impl_date_conversion {
    ($days:expr, $handle:expr) => {{
        let ts = jiff::Timestamp::from_second(($days as i64) * 86400).unwrap();
        let formatted = ts.strftime("%Y-%m-%d").to_string();
        formatted.into_value_with($handle)
    }};
}
|