parquet 0.0.4 → 0.2.5
This diff compares publicly available package versions as released to their public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
data/ext/parquet/src/types/record_types.rs (new file, +204 lines):

@@ -0,0 +1,204 @@

```rust
use itertools::Itertools;

use super::*;

#[derive(Debug)]
pub enum RowRecord<S: BuildHasher + Default> {
    Vec(Vec<ParquetField>),
    Map(HashMap<StringCacheKey, ParquetField, S>),
}

#[derive(Debug)]
pub enum ColumnRecord<S: BuildHasher + Default> {
    Vec(Vec<Vec<ParquetValue>>),
    Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
}

#[derive(Debug)]
pub struct ParquetField(pub Field);

impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            RowRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                handle.into_value(ary)
            }
            RowRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        values[i] = handle.into_value(k);
                        values[i + 1] = handle.into_value(v);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}

impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self {
            ColumnRecord::Vec(vec) => {
                let ary = handle.ary_new_capa(vec.len());
                vec.into_iter()
                    .try_for_each(|v| {
                        let nested_ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
                        ary.push(nested_ary.into_value_with(handle))
                    })
                    .unwrap();
                ary.into_value_with(handle)
            }
            ColumnRecord::Map(map) => {
                let hash = handle.hash_new_capa(map.len());

                let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
                let mut i = 0;

                for chunk in &map.into_iter().chunks(64) {
                    // Reduced to 64 to ensure space for pairs
                    for (k, v) in chunk {
                        if i + 1 >= values.len() {
                            // Bulk insert current batch if array is full
                            hash.bulk_insert(&values[..i]).unwrap();
                            values[..i].fill(handle.qnil().as_value());
                            i = 0;
                        }
                        values[i] = handle.into_value(k);
                        let ary = handle.ary_new_capa(v.len());
                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
                        values[i + 1] = handle.into_value(ary);
                        i += 2;
                    }
                    // Insert any remaining pairs
                    if i > 0 {
                        hash.bulk_insert(&values[..i]).unwrap();
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }

                hash.into_value_with(handle)
            }
        }
    }
}

impl IntoValue for ParquetField {
    fn into_value_with(self, handle: &Ruby) -> Value {
        match self.0 {
            Field::Null => handle.qnil().as_value(),
            Field::Bool(b) => b.into_value_with(handle),
            Field::Short(s) => s.into_value_with(handle),
            Field::Int(i) => i.into_value_with(handle),
            Field::Long(l) => l.into_value_with(handle),
            Field::UByte(ub) => ub.into_value_with(handle),
            Field::UShort(us) => us.into_value_with(handle),
            Field::UInt(ui) => ui.into_value_with(handle),
            Field::ULong(ul) => ul.into_value_with(handle),
            Field::Float16(f) => f32::from(f).into_value_with(handle),
            Field::Float(f) => f.into_value_with(handle),
            Field::Double(d) => d.into_value_with(handle),
            Field::Str(s) => s.into_value_with(handle),
            Field::Byte(b) => b.into_value_with(handle),
            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
            Field::Date(d) => {
                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
                let formatted = ts.strftime("%Y-%m-%d").to_string();
                formatted.into_value_with(handle)
            }
            Field::TimestampMillis(ts) => {
                let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
                let time_class = handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with(handle)
            }
            Field::TimestampMicros(ts) => {
                let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
                let time_class = handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with(handle)
            }
            Field::ListInternal(list) => {
                let elements = list.elements();
                let ary = handle.ary_new_capa(elements.len());
                elements
                    .iter()
                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
                    .unwrap();
                ary.into_value_with(handle)
            }
            Field::MapInternal(map) => {
                let entries = map.entries();
                let hash = handle.hash_new_capa(entries.len());
                entries
                    .iter()
                    .try_for_each(|(k, v)| {
                        hash.aset(
                            ParquetField(k.clone()).into_value_with(handle),
                            ParquetField(v.clone()).into_value_with(handle),
                        )
                    })
                    .unwrap();
                hash.into_value_with(handle)
            }
            Field::Decimal(d) => {
                let value = match d {
                    Decimal::Int32 { value, scale, .. } => {
                        let unscaled = i32::from_be_bytes(value);
                        format!("{}e-{}", unscaled, scale)
                    }
                    Decimal::Int64 { value, scale, .. } => {
                        let unscaled = i64::from_be_bytes(value);
                        format!("{}e-{}", unscaled, scale)
                    }
                    Decimal::Bytes { value, scale, .. } => {
                        // Convert bytes to string representation of unscaled value
                        let unscaled = String::from_utf8_lossy(value.data());
                        format!("{}e-{}", unscaled, scale)
                    }
                };
                handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
            }
            Field::Group(row) => {
                let hash = handle.hash_new();
                row.get_column_iter()
                    .try_for_each(|(k, v)| {
                        hash.aset(
                            k.clone().into_value_with(handle),
                            ParquetField(v.clone()).into_value_with(handle),
                        )
                    })
                    .unwrap();
                hash.into_value_with(handle)
            }
        }
    }
}
```
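Both `Map` arms above stage key/value pairs in a fixed 128-slot `Value` buffer and flush them with `bulk_insert`, iterating in chunks of 64 entries so a full chunk of pairs always fits, presumably to cut the number of calls across the Ruby boundary versus one `aset` per pair. A minimal, Ruby-free sketch of that staging pattern follows; `stage_and_flush` and the plain `Vec` sink are hypothetical stand-ins for magnus's `RHash` and its `bulk_insert`:

```rust
use std::collections::HashMap;

// Hypothetical stand-in for RHash::bulk_insert: consumes a staged batch at once.
fn bulk_insert(sink: &mut Vec<(String, i64)>, staged: &[(String, i64)]) {
    sink.extend_from_slice(staged);
}

// Stage pairs in a fixed-size buffer and flush in batches of 64.
fn stage_and_flush(map: HashMap<String, i64>) -> Vec<(String, i64)> {
    let mut sink = Vec::with_capacity(map.len());
    let mut staged: Vec<(String, i64)> = Vec::with_capacity(64);
    for (k, v) in map {
        staged.push((k, v));
        if staged.len() == 64 {
            bulk_insert(&mut sink, &staged); // flush a full batch
            staged.clear();
        }
    }
    bulk_insert(&mut sink, &staged); // flush any remaining pairs
    sink
}

fn main() {
    let map: HashMap<String, i64> = (0..150).map(|i| (format!("k{i}"), i)).collect();
    assert_eq!(stage_and_flush(map).len(), 150);
}
```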
data/ext/parquet/src/types/timestamp.rs (new file, +85 lines):

@@ -0,0 +1,85 @@

```rust
use super::*;

pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
    let (ts, tz) = match value {
        ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
        ParquetValue::TimestampMillis(ts, tz) => {
            (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
        }
        ParquetValue::TimestampMicros(ts, tz) => {
            (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
        }
        ParquetValue::TimestampNanos(ts, tz) => {
            (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
        }
        _ => panic!("Invalid timestamp value"),
    };

    // If timezone is provided, convert to zoned timestamp
    if let Some(tz) = tz {
        // Handle fixed offset timezones like "+09:00" first
        if tz.starts_with('+') || tz.starts_with('-') {
            // Parse the offset string into hours and minutes
            let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
                // Format: "+09:00" or "-09:00"
                let h = tz[1..3].parse::<i32>().unwrap_or(0);
                let m = tz[4..6].parse::<i32>().unwrap_or(0);
                (h, m)
            } else if tz.len() >= 3 {
                // Format: "+09" or "-09"
                let h = tz[1..3].parse::<i32>().unwrap_or(0);
                (h, 0)
            } else {
                (0, 0)
            };

            // Apply sign
            let total_minutes = if tz.starts_with('-') {
                -(hours * 60 + minutes)
            } else {
                hours * 60 + minutes
            };

            // Create fixed timezone
            let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
            ts.to_zoned(tz).timestamp()
        } else {
            // Try IANA timezone
            match ts.intz(&tz) {
                Ok(zoned) => zoned.timestamp(),
                Err(_) => ts, // Fall back to UTC if timezone is invalid
            }
        }
    } else {
        // No timezone provided - treat as UTC
        ts
    }
}

// Macro for handling timestamp conversions
#[macro_export]
macro_rules! impl_timestamp_conversion {
    ($value:expr, $unit:ident, $handle:expr) => {{
        match $value {
            ParquetValue::$unit(ts, tz) => {
                let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
                let time_class = $handle.class_time();
                time_class
                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
                    .unwrap()
                    .into_value_with($handle)
            }
            _ => panic!("Invalid timestamp type"),
        }
    }};
}

// Macro for handling date conversions
#[macro_export]
macro_rules! impl_date_conversion {
    ($value:expr, $handle:expr) => {{
        let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
        let formatted = ts.strftime("%Y-%m-%d").to_string();
        formatted.into_value_with($handle)
    }};
}
```
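The fixed-offset branch parses strings such as "+09:00" or "-05" into signed minutes before building a `jiff` timezone; note that `jiff::tz::offset` takes whole hours, so the `(total_minutes / 60) as i8` cast discards any sub-hour component (e.g. "+05:30" collapses to +05:00). Below is a standalone sketch of just the string-to-minutes step, with `offset_minutes` as a hypothetical helper name (the gem inlines this logic):

```rust
// Parse a fixed-offset string ("+09:00", "-05") into signed minutes,
// mirroring the branching in parse_zoned_timestamp above.
fn offset_minutes(tz: &str) -> i32 {
    let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
        // Format: "+09:00" or "-09:00"
        (
            tz[1..3].parse::<i32>().unwrap_or(0),
            tz[4..6].parse::<i32>().unwrap_or(0),
        )
    } else if tz.len() >= 3 {
        // Format: "+09" or "-09"
        (tz[1..3].parse::<i32>().unwrap_or(0), 0)
    } else {
        (0, 0)
    };
    if tz.starts_with('-') {
        -(hours * 60 + minutes)
    } else {
        hours * 60 + minutes
    }
}

fn main() {
    assert_eq!(offset_minutes("+09:00"), 540);
    assert_eq!(offset_minutes("-05"), -300);
    // Sub-hour offsets parse fully here, but the hour-only cast in the
    // gem code above would truncate this to +05:00.
    assert_eq!(offset_minutes("+05:30"), 330);
}
```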