parquet 0.0.4 → 0.2.5

This diff represents the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,204 @@
+ use itertools::Itertools;
+
+ use super::*;
+
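+ // A single decoded row: positional (Vec) or keyed by column name (Map).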
+ #[derive(Debug)]
+ pub enum RowRecord<S: BuildHasher + Default> {
+     Vec(Vec<ParquetField>),
+     Map(HashMap<StringCacheKey, ParquetField, S>),
+ }
+
+ #[derive(Debug)]
+ pub enum ColumnRecord<S: BuildHasher + Default> {
+     Vec(Vec<Vec<ParquetValue>>),
+     Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
+ }
+
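+ // Newtype over parquet's Field so an IntoValue impl can be written locally
+ // (the orphan rule forbids implementing a foreign trait for a foreign type).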
+ #[derive(Debug)]
+ pub struct ParquetField(pub Field);
+
+ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self {
+             RowRecord::Vec(vec) => {
+                 let ary = handle.ary_new_capa(vec.len());
+                 vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                 handle.into_value(ary)
+             }
+             RowRecord::Map(map) => {
+                 let hash = handle.hash_new_capa(map.len());
+
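+                 // Scratch buffer of alternating slots in the flat
+                 // [k1, v1, k2, v2, ...] layout that bulk_insert expects.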
+                 let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
+                 let mut i = 0;
+
+                 // 64 pairs per chunk (128 values) exactly fills the buffer
+                 for chunk in &map.into_iter().chunks(64) {
+                     for (k, v) in chunk {
+                         if i + 1 >= values.len() {
+                             // Bulk insert current batch if array is full
+                             hash.bulk_insert(&values[..i]).unwrap();
+                             values[..i].fill(handle.qnil().as_value());
+                             i = 0;
+                         }
+                         values[i] = handle.into_value(k);
+                         values[i + 1] = handle.into_value(v);
+                         i += 2;
+                     }
+                     // Insert any remaining pairs
+                     if i > 0 {
+                         hash.bulk_insert(&values[..i]).unwrap();
+                         values[..i].fill(handle.qnil().as_value());
+                         i = 0;
+                     }
+                 }
+
+                 hash.into_value_with(handle)
+             }
+         }
+     }
+ }
+
+ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self {
+             ColumnRecord::Vec(vec) => {
+                 let ary = handle.ary_new_capa(vec.len());
+                 vec.into_iter()
+                     .try_for_each(|v| {
+                         let nested_ary = handle.ary_new_capa(v.len());
+                         v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
+                         ary.push(nested_ary.into_value_with(handle))
+                     })
+                     .unwrap();
+                 ary.into_value_with(handle)
+             }
+             ColumnRecord::Map(map) => {
+                 let hash = handle.hash_new_capa(map.len());
+
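+                 // Same stack-buffer batching as RowRecord::Map above, except each
+                 // hash value is first converted into a Ruby Array of column values.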
+                 let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
+                 let mut i = 0;
+
+                 // 64 pairs per chunk (128 values) exactly fills the buffer
+                 for chunk in &map.into_iter().chunks(64) {
+                     for (k, v) in chunk {
+                         if i + 1 >= values.len() {
+                             // Bulk insert current batch if array is full
+                             hash.bulk_insert(&values[..i]).unwrap();
+                             values[..i].fill(handle.qnil().as_value());
+                             i = 0;
+                         }
+                         values[i] = handle.into_value(k);
+                         let ary = handle.ary_new_capa(v.len());
+                         v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                         values[i + 1] = handle.into_value(ary);
+                         i += 2;
+                     }
+                     // Insert any remaining pairs
+                     if i > 0 {
+                         hash.bulk_insert(&values[..i]).unwrap();
+                         values[..i].fill(handle.qnil().as_value());
+                         i = 0;
+                     }
+                 }
+
+                 hash.into_value_with(handle)
+             }
+         }
+     }
+ }
+
+ impl IntoValue for ParquetField {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self.0 {
+             Field::Null => handle.qnil().as_value(),
+             Field::Bool(b) => b.into_value_with(handle),
+             Field::Short(s) => s.into_value_with(handle),
+             Field::Int(i) => i.into_value_with(handle),
+             Field::Long(l) => l.into_value_with(handle),
+             Field::UByte(ub) => ub.into_value_with(handle),
+             Field::UShort(us) => us.into_value_with(handle),
+             Field::UInt(ui) => ui.into_value_with(handle),
+             Field::ULong(ul) => ul.into_value_with(handle),
+             Field::Float16(f) => f32::from(f).into_value_with(handle),
+             Field::Float(f) => f.into_value_with(handle),
+             Field::Double(d) => d.into_value_with(handle),
+             Field::Str(s) => s.into_value_with(handle),
+             Field::Byte(b) => b.into_value_with(handle),
+             Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+             Field::Date(d) => {
+                 // Days since the Unix epoch, rendered as "YYYY-MM-DD"
+                 let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
+                 let formatted = ts.strftime("%Y-%m-%d").to_string();
+                 formatted.into_value_with(handle)
+             }
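+             // The timestamp arms below format an RFC 3339 string and round-trip
+             // it through Time.parse (assumes Ruby's "time" stdlib has been required).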
+             Field::TimestampMillis(ts) => {
+                 let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
+                 let time_class = handle.class_time();
+                 time_class
+                     .funcall::<_, _, Value>("parse", (ts.to_string(),))
+                     .unwrap()
+                     .into_value_with(handle)
+             }
+             Field::TimestampMicros(ts) => {
+                 let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
+                 let time_class = handle.class_time();
+                 time_class
+                     .funcall::<_, _, Value>("parse", (ts.to_string(),))
+                     .unwrap()
+                     .into_value_with(handle)
+             }
+             Field::ListInternal(list) => {
+                 let elements = list.elements();
+                 let ary = handle.ary_new_capa(elements.len());
+                 elements
+                     .iter()
+                     .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
+                     .unwrap();
+                 ary.into_value_with(handle)
+             }
+             Field::MapInternal(map) => {
+                 let entries = map.entries();
+                 let hash = handle.hash_new_capa(entries.len());
+                 entries
+                     .iter()
+                     .try_for_each(|(k, v)| {
+                         hash.aset(
+                             ParquetField(k.clone()).into_value_with(handle),
+                             ParquetField(v.clone()).into_value_with(handle),
+                         )
+                     })
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+             Field::Decimal(d) => {
+                 let value = match d {
+                     Decimal::Int32 { value, scale, .. } => {
+                         let unscaled = i32::from_be_bytes(value);
+                         format!("{}e-{}", unscaled, scale)
+                     }
+                     Decimal::Int64 { value, scale, .. } => {
+                         let unscaled = i64::from_be_bytes(value);
+                         format!("{}e-{}", unscaled, scale)
+                     }
+                     Decimal::Bytes { value, scale, .. } => {
+                         // The bytes are a big-endian two's-complement integer, not
+                         // UTF-8 text; sign-extend into an i128 (assumes <= 16 bytes)
+                         let bytes = value.data();
+                         let mut unscaled: i128 =
+                             if bytes.first().map_or(false, |&b| b & 0x80 != 0) { -1 } else { 0 };
+                         for &b in bytes {
+                             unscaled = (unscaled << 8) | i128::from(b);
+                         }
+                         format!("{}e-{}", unscaled, scale)
+                     }
+                 };
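+                 // e.g. unscaled 12345 with scale 2 becomes "12345e-2", which
+                 // BigDecimal("12345e-2") reads as 123.45 (bigdecimal must be required)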
+                 handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
+             }
+             Field::Group(row) => {
+                 let hash = handle.hash_new();
+                 row.get_column_iter()
+                     .try_for_each(|(k, v)| {
+                         hash.aset(
+                             k.clone().into_value_with(handle),
+                             ParquetField(v.clone()).into_value_with(handle),
+                         )
+                     })
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+         }
+     }
+ }
@@ -0,0 +1,85 @@
+ use super::*;
+
+ pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
+     let (ts, tz) = match value {
+         ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts).unwrap(), tz),
+         ParquetValue::TimestampMillis(ts, tz) => {
+             (jiff::Timestamp::from_millisecond(*ts).unwrap(), tz)
+         }
+         ParquetValue::TimestampMicros(ts, tz) => {
+             (jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
+         }
+         ParquetValue::TimestampNanos(ts, tz) => {
+             (jiff::Timestamp::from_nanosecond(*ts as i128).unwrap(), tz)
+         }
+         _ => panic!("Invalid timestamp value"),
+     };
+
+     // If a timezone is provided, convert to a zoned timestamp
+     if let Some(tz) = tz {
+         // Handle fixed-offset timezones like "+09:00" first
+         if tz.starts_with('+') || tz.starts_with('-') {
+             // Parse the offset string into hours and minutes
+             let (hours, minutes) = if tz.len() >= 6 && tz.contains(':') {
+                 // Format: "+09:00" or "-09:00" (len >= 6, so tz[4..6] cannot slice out of bounds)
+                 let h = tz[1..3].parse::<i32>().unwrap_or(0);
+                 let m = tz[4..6].parse::<i32>().unwrap_or(0);
+                 (h, m)
+             } else if tz.len() >= 3 {
+                 // Format: "+09" or "-09"
+                 let h = tz[1..3].parse::<i32>().unwrap_or(0);
+                 (h, 0)
+             } else {
+                 (0, 0)
+             };
+
+             // Apply the sign
+             let total_minutes = if tz.starts_with('-') {
+                 -(hours * 60 + minutes)
+             } else {
+                 hours * 60 + minutes
+             };
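+             // Worked example: "+09:30" parses to (9, 30) => +570 total minutes,
+             // while "-05:00" parses to (5, 0) => -300.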
+
+             // Build a fixed-offset timezone; Offset::from_seconds keeps sub-hour
+             // offsets such as "+09:30" intact instead of rounding to whole hours
+             let offset = jiff::tz::Offset::from_seconds(total_minutes * 60)
+                 .unwrap_or(jiff::tz::Offset::UTC);
+             let tz = jiff::tz::TimeZone::fixed(offset);
+             ts.to_zoned(tz).timestamp()
+         } else {
+             // Try an IANA timezone name
+             match ts.intz(&tz) {
+                 Ok(zoned) => zoned.timestamp(),
+                 Err(_) => ts, // Fall back to UTC if the timezone is invalid
+             }
+         }
+     } else {
+         // No timezone provided - treat as UTC
+         ts
+     }
+ }
+
+ // Macro for handling timestamp conversions
+ #[macro_export]
+ macro_rules! impl_timestamp_conversion {
+     ($value:expr, $unit:ident, $handle:expr) => {{
+         match $value {
+             ParquetValue::$unit(ts, tz) => {
+                 let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
+                 let time_class = $handle.class_time();
+                 time_class
+                     .funcall::<_, _, Value>("parse", (ts.to_string(),))
+                     .unwrap()
+                     .into_value_with($handle)
+             }
+             _ => panic!("Invalid timestamp type"),
+         }
+     }};
+ }
+
+ // Macro for handling date conversions (days since the Unix epoch)
+ #[macro_export]
+ macro_rules! impl_date_conversion {
+     ($value:expr, $handle:expr) => {{
+         let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
+         let formatted = ts.strftime("%Y-%m-%d").to_string();
+         formatted.into_value_with($handle)
+     }};
+ }
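+
+ // Hypothetical call sites, shown for illustration only (these identifiers are
+ // assumptions, not part of this diff):
+ //   impl_timestamp_conversion!(value, TimestampMillis, handle)
+ //   impl_date_conversion!(days_since_epoch, handle)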