parquet 0.0.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,204 @@
1
+ use itertools::Itertools;
2
+
3
+ use super::*;
4
+
5
+ #[derive(Debug)]
6
+ pub enum RowRecord<S: BuildHasher + Default> {
7
+ Vec(Vec<ParquetField>),
8
+ Map(HashMap<StringCacheKey, ParquetField, S>),
9
+ }
10
+
11
+ #[derive(Debug)]
12
+ pub enum ColumnRecord<S: BuildHasher + Default> {
13
+ Vec(Vec<Vec<ParquetValue>>),
14
+ Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
15
+ }
16
+
17
+ #[derive(Debug)]
18
+ pub struct ParquetField(pub Field);
19
+
20
+ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
21
+ fn into_value_with(self, handle: &Ruby) -> Value {
22
+ match self {
23
+ RowRecord::Vec(vec) => {
24
+ let ary = handle.ary_new_capa(vec.len());
25
+ vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
26
+ handle.into_value(ary)
27
+ }
28
+ RowRecord::Map(map) => {
29
+ let hash = handle.hash_new_capa(map.len());
30
+
31
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
32
+ let mut i = 0;
33
+
34
+ for chunk in &map.into_iter().chunks(64) {
35
+ // Reduced to 64 to ensure space for pairs
36
+ for (k, v) in chunk {
37
+ if i + 1 >= values.len() {
38
+ // Bulk insert current batch if array is full
39
+ hash.bulk_insert(&values[..i]).unwrap();
40
+ values[..i].fill(handle.qnil().as_value());
41
+ i = 0;
42
+ }
43
+ values[i] = handle.into_value(k);
44
+ values[i + 1] = handle.into_value(v);
45
+ i += 2;
46
+ }
47
+ // Insert any remaining pairs
48
+ if i > 0 {
49
+ hash.bulk_insert(&values[..i]).unwrap();
50
+ values[..i].fill(handle.qnil().as_value());
51
+ i = 0;
52
+ }
53
+ }
54
+
55
+ hash.into_value_with(handle)
56
+ }
57
+ }
58
+ }
59
+ }
60
+
61
+ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
62
+ fn into_value_with(self, handle: &Ruby) -> Value {
63
+ match self {
64
+ ColumnRecord::Vec(vec) => {
65
+ let ary = handle.ary_new_capa(vec.len());
66
+ vec.into_iter()
67
+ .try_for_each(|v| {
68
+ let nested_ary = handle.ary_new_capa(v.len());
69
+ v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
70
+ ary.push(nested_ary.into_value_with(handle))
71
+ })
72
+ .unwrap();
73
+ ary.into_value_with(handle)
74
+ }
75
+ ColumnRecord::Map(map) => {
76
+ let hash = handle.hash_new_capa(map.len());
77
+
78
+ let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
79
+ let mut i = 0;
80
+
81
+ for chunk in &map.into_iter().chunks(64) {
82
+ // Reduced to 64 to ensure space for pairs
83
+ for (k, v) in chunk {
84
+ if i + 1 >= values.len() {
85
+ // Bulk insert current batch if array is full
86
+ hash.bulk_insert(&values[..i]).unwrap();
87
+ values[..i].fill(handle.qnil().as_value());
88
+ i = 0;
89
+ }
90
+ values[i] = handle.into_value(k);
91
+ let ary = handle.ary_new_capa(v.len());
92
+ v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
93
+ values[i + 1] = handle.into_value(ary);
94
+ i += 2;
95
+ }
96
+ // Insert any remaining pairs
97
+ if i > 0 {
98
+ hash.bulk_insert(&values[..i]).unwrap();
99
+ values[..i].fill(handle.qnil().as_value());
100
+ i = 0;
101
+ }
102
+ }
103
+
104
+ hash.into_value_with(handle)
105
+ }
106
+ }
107
+ }
108
+ }
109
+
110
+ impl IntoValue for ParquetField {
111
+ fn into_value_with(self, handle: &Ruby) -> Value {
112
+ match self.0 {
113
+ Field::Null => handle.qnil().as_value(),
114
+ Field::Bool(b) => b.into_value_with(handle),
115
+ Field::Short(s) => s.into_value_with(handle),
116
+ Field::Int(i) => i.into_value_with(handle),
117
+ Field::Long(l) => l.into_value_with(handle),
118
+ Field::UByte(ub) => ub.into_value_with(handle),
119
+ Field::UShort(us) => us.into_value_with(handle),
120
+ Field::UInt(ui) => ui.into_value_with(handle),
121
+ Field::ULong(ul) => ul.into_value_with(handle),
122
+ Field::Float16(f) => f32::from(f).into_value_with(handle),
123
+ Field::Float(f) => f.into_value_with(handle),
124
+ Field::Double(d) => d.into_value_with(handle),
125
+ Field::Str(s) => s.into_value_with(handle),
126
+ Field::Byte(b) => b.into_value_with(handle),
127
+ Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
128
+ Field::Date(d) => {
129
+ let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
130
+ let formatted = ts.strftime("%Y-%m-%d").to_string();
131
+ formatted.into_value_with(handle)
132
+ }
133
+ Field::TimestampMillis(ts) => {
134
+ let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
135
+ let time_class = handle.class_time();
136
+ time_class
137
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
138
+ .unwrap()
139
+ .into_value_with(handle)
140
+ }
141
+ Field::TimestampMicros(ts) => {
142
+ let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
143
+ let time_class = handle.class_time();
144
+ time_class
145
+ .funcall::<_, _, Value>("parse", (ts.to_string(),))
146
+ .unwrap()
147
+ .into_value_with(handle)
148
+ }
149
+ Field::ListInternal(list) => {
150
+ let elements = list.elements();
151
+ let ary = handle.ary_new_capa(elements.len());
152
+ elements
153
+ .iter()
154
+ .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
155
+ .unwrap();
156
+ ary.into_value_with(handle)
157
+ }
158
+ Field::MapInternal(map) => {
159
+ let entries = map.entries();
160
+ let hash = handle.hash_new_capa(entries.len());
161
+ entries
162
+ .iter()
163
+ .try_for_each(|(k, v)| {
164
+ hash.aset(
165
+ ParquetField(k.clone()).into_value_with(handle),
166
+ ParquetField(v.clone()).into_value_with(handle),
167
+ )
168
+ })
169
+ .unwrap();
170
+ hash.into_value_with(handle)
171
+ }
172
+ Field::Decimal(d) => {
173
+ let value = match d {
174
+ Decimal::Int32 { value, scale, .. } => {
175
+ let unscaled = i32::from_be_bytes(value);
176
+ format!("{}e-{}", unscaled, scale)
177
+ }
178
+ Decimal::Int64 { value, scale, .. } => {
179
+ let unscaled = i64::from_be_bytes(value);
180
+ format!("{}e-{}", unscaled, scale)
181
+ }
182
+ Decimal::Bytes { value, scale, .. } => {
183
+ // Convert bytes to string representation of unscaled value
184
+ let unscaled = String::from_utf8_lossy(value.data());
185
+ format!("{}e-{}", unscaled, scale)
186
+ }
187
+ };
188
+ handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
189
+ }
190
+ Field::Group(row) => {
191
+ let hash = handle.hash_new();
192
+ row.get_column_iter()
193
+ .try_for_each(|(k, v)| {
194
+ hash.aset(
195
+ k.clone().into_value_with(handle),
196
+ ParquetField(v.clone()).into_value_with(handle),
197
+ )
198
+ })
199
+ .unwrap();
200
+ hash.into_value_with(handle)
201
+ }
202
+ }
203
+ }
204
+ }
@@ -0,0 +1,85 @@
1
+ use super::*;
2
+
3
+ pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
4
+ let (ts, tz) = match value {
5
+ ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
6
+ ParquetValue::TimestampMillis(ts, tz) => {
7
+ (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
8
+ }
9
+ ParquetValue::TimestampMicros(ts, tz) => {
10
+ (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
11
+ }
12
+ ParquetValue::TimestampNanos(ts, tz) => {
13
+ (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
14
+ }
15
+ _ => panic!("Invalid timestamp value"),
16
+ };
17
+
18
+ // If timezone is provided, convert to zoned timestamp
19
+ if let Some(tz) = tz {
20
+ // Handle fixed offset timezones like "+09:00" first
21
+ if tz.starts_with('+') || tz.starts_with('-') {
22
+ // Parse the offset string into hours and minutes
23
+ let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
24
+ // Format: "+09:00" or "-09:00"
25
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
26
+ let m = tz[4..6].parse::<i32>().unwrap_or(0);
27
+ (h, m)
28
+ } else if tz.len() >= 3 {
29
+ // Format: "+09" or "-09"
30
+ let h = tz[1..3].parse::<i32>().unwrap_or(0);
31
+ (h, 0)
32
+ } else {
33
+ (0, 0)
34
+ };
35
+
36
+ // Apply sign
37
+ let total_minutes = if tz.starts_with('-') {
38
+ -(hours * 60 + minutes)
39
+ } else {
40
+ hours * 60 + minutes
41
+ };
42
+
43
+ // Create fixed timezone
44
+ let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
45
+ ts.to_zoned(tz).timestamp()
46
+ } else {
47
+ // Try IANA timezone
48
+ match ts.intz(&tz) {
49
+ Ok(zoned) => zoned.timestamp(),
50
+ Err(_) => ts, // Fall back to UTC if timezone is invalid
51
+ }
52
+ }
53
+ } else {
54
+ // No timezone provided - treat as UTC
55
+ ts
56
+ }
57
+ }
58
+
59
/// Converts a `ParquetValue::$unit(ts, tz)` timestamp into a Ruby `Time`.
///
/// Arguments: the value expression, the timestamp variant name (`$unit`), and
/// the Ruby handle expression. The timestamp is normalised through
/// `parse_zoned_timestamp`, rendered as a string, and handed to `Time.parse`.
///
/// Panics if `$value` is not the `$unit` variant or if `Time.parse` fails.
/// Note that `$handle` is expanded twice.
#[macro_export]
macro_rules! impl_timestamp_conversion {
    ($value:expr, $unit:ident, $handle:expr) => {{
        match $value {
            ParquetValue::$unit(raw, zone) => {
                let instant = parse_zoned_timestamp(&ParquetValue::$unit(raw, zone));
                let ruby_time_class = $handle.class_time();
                let parsed = ruby_time_class
                    .funcall::<_, _, Value>("parse", (instant.to_string(),))
                    .unwrap();
                parsed.into_value_with($handle)
            }
            _ => panic!("Invalid timestamp type"),
        }
    }};
}
76
+
77
/// Converts a day count (`$value`, cast to `i64` and multiplied by 86400 to
/// obtain seconds) into a `"%Y-%m-%d"` string and then into a Ruby value via
/// `$handle`.
///
/// Panics if the resulting second count is rejected by
/// `jiff::Timestamp::from_second`.
#[macro_export]
macro_rules! impl_date_conversion {
    ($value:expr, $handle:expr) => {{
        let seconds = ($value as i64) * 86400;
        let instant = jiff::Timestamp::from_second(seconds).unwrap();
        let rendered = instant.strftime("%Y-%m-%d").to_string();
        rendered.into_value_with($handle)
    }};
}