parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,1275 +0,0 @@
|
|
1
|
-
use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
|
2
|
-
|
3
|
-
use super::record_types::{format_decimal_with_i8_scale, format_i256_decimal_with_scale};
|
4
|
-
use super::*;
|
5
|
-
use arrow_array::MapArray;
|
6
|
-
use magnus::{RArray, RString};
|
7
|
-
|
8
|
-
#[derive(Debug, Clone)]
|
9
|
-
pub enum ParquetValue {
|
10
|
-
Int8(i8),
|
11
|
-
Int16(i16),
|
12
|
-
Int32(i32),
|
13
|
-
Int64(i64),
|
14
|
-
UInt8(u8),
|
15
|
-
UInt16(u16),
|
16
|
-
UInt32(u32),
|
17
|
-
UInt64(u64),
|
18
|
-
Float16(f32), // f16 converted to f32
|
19
|
-
Float32(f32),
|
20
|
-
Float64(f64),
|
21
|
-
Boolean(bool),
|
22
|
-
String(String),
|
23
|
-
Bytes(Vec<u8>),
|
24
|
-
Date32(i32),
|
25
|
-
Date64(i64),
|
26
|
-
Decimal128(i128, i8),
|
27
|
-
Decimal256(arrow_buffer::i256, i8),
|
28
|
-
TimestampSecond(i64, Option<Arc<str>>),
|
29
|
-
TimestampMillis(i64, Option<Arc<str>>),
|
30
|
-
TimestampMicros(i64, Option<Arc<str>>),
|
31
|
-
TimestampNanos(i64, Option<Arc<str>>),
|
32
|
-
TimeMillis(i32), // Time of day in milliseconds since midnight
|
33
|
-
TimeMicros(i64), // Time of day in microseconds since midnight
|
34
|
-
List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
|
35
|
-
// We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
|
36
|
-
Map(HashMap<ParquetValue, ParquetValue>),
|
37
|
-
Null,
|
38
|
-
}
|
39
|
-
|
40
|
-
impl PartialEq for ParquetValue {
|
41
|
-
fn eq(&self, other: &Self) -> bool {
|
42
|
-
match (self, other) {
|
43
|
-
(ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
|
44
|
-
(ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
|
45
|
-
(ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
|
46
|
-
(ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
|
47
|
-
(ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
|
48
|
-
(ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
|
49
|
-
(ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
|
50
|
-
(ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
|
51
|
-
(ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
|
52
|
-
(ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
|
53
|
-
(ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
|
54
|
-
(ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
|
55
|
-
(ParquetValue::String(a), ParquetValue::String(b)) => a == b,
|
56
|
-
(ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
|
57
|
-
(ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
|
58
|
-
(ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
|
59
|
-
(ParquetValue::Decimal128(a, scale_a), ParquetValue::Decimal128(b, scale_b)) => {
|
60
|
-
if scale_a == scale_b {
|
61
|
-
// Same scale, compare directly
|
62
|
-
a == b
|
63
|
-
} else {
|
64
|
-
// Different scales, need to adjust for proper comparison
|
65
|
-
let mut a_val = *a;
|
66
|
-
let mut b_val = *b;
|
67
|
-
|
68
|
-
// Adjust to the same scale for proper comparison
|
69
|
-
if scale_a < scale_b {
|
70
|
-
// Scale up a to match b's scale
|
71
|
-
let scale_diff = (*scale_b - *scale_a) as u32;
|
72
|
-
if scale_diff <= 38 {
|
73
|
-
// Limit to avoid overflow
|
74
|
-
a_val *= 10_i128.pow(scale_diff);
|
75
|
-
} else {
|
76
|
-
// For large scale differences, use BigInt for the comparison
|
77
|
-
let a_big = num::BigInt::from(*a)
|
78
|
-
* num::BigInt::from(10_i128.pow(scale_diff.min(38)));
|
79
|
-
let b_big = num::BigInt::from(*b);
|
80
|
-
return a_big == b_big;
|
81
|
-
}
|
82
|
-
} else {
|
83
|
-
// Scale up b to match a's scale
|
84
|
-
let scale_diff = (*scale_a - *scale_b) as u32;
|
85
|
-
if scale_diff <= 38 {
|
86
|
-
// Limit to avoid overflow
|
87
|
-
b_val *= 10_i128.pow(scale_diff);
|
88
|
-
} else {
|
89
|
-
// For large scale differences, use BigInt for the comparison
|
90
|
-
let a_big = num::BigInt::from(*a);
|
91
|
-
let b_big = num::BigInt::from(*b)
|
92
|
-
* num::BigInt::from(10_i128.pow(scale_diff.min(38)));
|
93
|
-
return a_big == b_big;
|
94
|
-
}
|
95
|
-
}
|
96
|
-
|
97
|
-
a_val == b_val
|
98
|
-
}
|
99
|
-
}
|
100
|
-
(ParquetValue::Decimal256(a, scale_a), ParquetValue::Decimal256(b, scale_b)) => {
|
101
|
-
if scale_a == scale_b {
|
102
|
-
// Same scale, compare directly
|
103
|
-
a == b
|
104
|
-
} else {
|
105
|
-
// TODO: Implement decimal256 comparison
|
106
|
-
todo!("decimal256 comparison");
|
107
|
-
}
|
108
|
-
}
|
109
|
-
(ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
|
110
|
-
(ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
|
111
|
-
(ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
|
112
|
-
(ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
|
113
|
-
(ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
|
114
|
-
(ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
|
115
|
-
(ParquetValue::List(a), ParquetValue::List(b)) => a == b,
|
116
|
-
(ParquetValue::Null, ParquetValue::Null) => true,
|
117
|
-
_ => false,
|
118
|
-
}
|
119
|
-
}
|
120
|
-
}
|
121
|
-
|
122
|
-
impl Eq for ParquetValue {}
|
123
|
-
|
124
|
-
impl std::hash::Hash for ParquetValue {
|
125
|
-
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
126
|
-
match self {
|
127
|
-
ParquetValue::Int8(i) => i.hash(state),
|
128
|
-
ParquetValue::Int16(i) => i.hash(state),
|
129
|
-
ParquetValue::Int32(i) => i.hash(state),
|
130
|
-
ParquetValue::Int64(i) => i.hash(state),
|
131
|
-
ParquetValue::UInt8(i) => i.hash(state),
|
132
|
-
ParquetValue::UInt16(i) => i.hash(state),
|
133
|
-
ParquetValue::UInt32(i) => i.hash(state),
|
134
|
-
ParquetValue::UInt64(i) => i.hash(state),
|
135
|
-
ParquetValue::Float16(f) => f.to_bits().hash(state),
|
136
|
-
ParquetValue::Float32(f) => f.to_bits().hash(state),
|
137
|
-
ParquetValue::Float64(f) => f.to_bits().hash(state),
|
138
|
-
ParquetValue::Boolean(b) => b.hash(state),
|
139
|
-
ParquetValue::String(s) => s.hash(state),
|
140
|
-
ParquetValue::Bytes(b) => b.hash(state),
|
141
|
-
ParquetValue::Date32(d) => d.hash(state),
|
142
|
-
ParquetValue::Date64(d) => d.hash(state),
|
143
|
-
ParquetValue::Decimal128(d, scale) => {
|
144
|
-
d.hash(state);
|
145
|
-
scale.hash(state);
|
146
|
-
}
|
147
|
-
ParquetValue::Decimal256(d, scale) => {
|
148
|
-
d.hash(state);
|
149
|
-
scale.hash(state);
|
150
|
-
}
|
151
|
-
ParquetValue::TimestampSecond(ts, tz) => {
|
152
|
-
ts.hash(state);
|
153
|
-
tz.hash(state);
|
154
|
-
}
|
155
|
-
ParquetValue::TimestampMillis(ts, tz) => {
|
156
|
-
ts.hash(state);
|
157
|
-
tz.hash(state);
|
158
|
-
}
|
159
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
160
|
-
ts.hash(state);
|
161
|
-
tz.hash(state);
|
162
|
-
}
|
163
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
164
|
-
ts.hash(state);
|
165
|
-
tz.hash(state);
|
166
|
-
}
|
167
|
-
ParquetValue::TimeMillis(t) => t.hash(state),
|
168
|
-
ParquetValue::TimeMicros(t) => t.hash(state),
|
169
|
-
ParquetValue::List(l) => l.hash(state),
|
170
|
-
ParquetValue::Map(m) => {
|
171
|
-
for (k, v) in m {
|
172
|
-
k.hash(state);
|
173
|
-
v.hash(state);
|
174
|
-
}
|
175
|
-
}
|
176
|
-
ParquetValue::Null => 0_i32.hash(state),
|
177
|
-
}
|
178
|
-
}
|
179
|
-
}
|
180
|
-
|
181
|
-
impl TryIntoValue for ParquetValue {
|
182
|
-
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
183
|
-
match self {
|
184
|
-
ParquetValue::Int8(i) => Ok(i.into_value_with(handle)),
|
185
|
-
ParquetValue::Int16(i) => Ok(i.into_value_with(handle)),
|
186
|
-
ParquetValue::Int32(i) => Ok(i.into_value_with(handle)),
|
187
|
-
ParquetValue::Int64(i) => Ok(i.into_value_with(handle)),
|
188
|
-
ParquetValue::UInt8(i) => Ok(i.into_value_with(handle)),
|
189
|
-
ParquetValue::UInt16(i) => Ok(i.into_value_with(handle)),
|
190
|
-
ParquetValue::UInt32(i) => Ok(i.into_value_with(handle)),
|
191
|
-
ParquetValue::UInt64(i) => Ok(i.into_value_with(handle)),
|
192
|
-
ParquetValue::Float16(f) => Ok(f.into_value_with(handle)),
|
193
|
-
ParquetValue::Float32(f) => Ok(f.into_value_with(handle)),
|
194
|
-
ParquetValue::Float64(f) => Ok(f.into_value_with(handle)),
|
195
|
-
ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
|
196
|
-
ParquetValue::String(s) => Ok(s.into_value_with(handle)),
|
197
|
-
ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
|
198
|
-
ParquetValue::Decimal128(d, scale) => {
|
199
|
-
// Load the bigdecimal gem if it's not already loaded
|
200
|
-
LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
|
201
|
-
|
202
|
-
// Format with proper scaling based on the sign of scale
|
203
|
-
let value = format_decimal_with_i8_scale(d, scale);
|
204
|
-
|
205
|
-
let kernel = handle.module_kernel();
|
206
|
-
Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
|
207
|
-
}
|
208
|
-
ParquetValue::Decimal256(d, scale) => {
|
209
|
-
// Load the bigdecimal gem if it's not already loaded
|
210
|
-
LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
|
211
|
-
|
212
|
-
// Format with proper scaling based on the sign of scale
|
213
|
-
// Use specialized function to preserve full precision
|
214
|
-
let value = format_i256_decimal_with_scale(d, scale)?;
|
215
|
-
|
216
|
-
let kernel = handle.module_kernel();
|
217
|
-
Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
|
218
|
-
}
|
219
|
-
ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
|
220
|
-
ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
|
221
|
-
timestamp @ ParquetValue::TimestampSecond(_, _) => {
|
222
|
-
impl_timestamp_conversion!(timestamp, TimestampSecond, handle)
|
223
|
-
}
|
224
|
-
timestamp @ ParquetValue::TimestampMillis(_, _) => {
|
225
|
-
impl_timestamp_conversion!(timestamp, TimestampMillis, handle)
|
226
|
-
}
|
227
|
-
timestamp @ ParquetValue::TimestampMicros(_, _) => {
|
228
|
-
impl_timestamp_conversion!(timestamp, TimestampMicros, handle)
|
229
|
-
}
|
230
|
-
timestamp @ ParquetValue::TimestampNanos(_, _) => {
|
231
|
-
impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
|
232
|
-
}
|
233
|
-
ParquetValue::TimeMillis(millis) => {
|
234
|
-
// Convert time of day in milliseconds to a Ruby Time object
|
235
|
-
// Use epoch date (1970-01-01) with the given time
|
236
|
-
let total_seconds = millis / 1000;
|
237
|
-
let ms = millis % 1000;
|
238
|
-
let hours = total_seconds / 3600;
|
239
|
-
let minutes = (total_seconds % 3600) / 60;
|
240
|
-
let seconds = total_seconds % 60;
|
241
|
-
|
242
|
-
// Create a Time object for 1970-01-01 with the given time
|
243
|
-
let time_class = handle.class_time();
|
244
|
-
let time = time_class.funcall::<_, _, Value>(
|
245
|
-
"new",
|
246
|
-
(1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
|
247
|
-
)?;
|
248
|
-
Ok(time.into_value_with(handle))
|
249
|
-
}
|
250
|
-
ParquetValue::TimeMicros(micros) => {
|
251
|
-
// Convert time of day in microseconds to a Ruby Time object
|
252
|
-
// Use epoch date (1970-01-01) with the given time
|
253
|
-
let total_seconds = micros / 1_000_000;
|
254
|
-
let us = micros % 1_000_000;
|
255
|
-
let hours = total_seconds / 3600;
|
256
|
-
let minutes = (total_seconds % 3600) / 60;
|
257
|
-
let seconds = total_seconds % 60;
|
258
|
-
|
259
|
-
// Create a Time object for 1970-01-01 with the given time
|
260
|
-
let time_class = handle.class_time();
|
261
|
-
let time = time_class
|
262
|
-
.funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
|
263
|
-
Ok(time.into_value_with(handle))
|
264
|
-
}
|
265
|
-
ParquetValue::List(l) => {
|
266
|
-
// For lists, convert to Ruby array and check for specific cases
|
267
|
-
// when we might need to return nil instead of an empty array
|
268
|
-
|
269
|
-
// Normal case - convert list elements to a Ruby array
|
270
|
-
let ary = handle.ary_new_capa(l.len());
|
271
|
-
l.into_iter().try_for_each(|v| {
|
272
|
-
ary.push(v.try_into_value_with(handle)?)?;
|
273
|
-
Ok::<_, ParquetGemError>(())
|
274
|
-
})?;
|
275
|
-
|
276
|
-
// The complex_types test expects double_list to be nil when empty,
|
277
|
-
// but it needs the context which we don't have directly.
|
278
|
-
// We'll let List stay as an empty array, and in each_row.rs it can
|
279
|
-
// be handled there with field name context.
|
280
|
-
Ok(ary.into_value_with(handle))
|
281
|
-
}
|
282
|
-
ParquetValue::Map(m) => {
|
283
|
-
#[cfg(ruby_lt_3_2)]
|
284
|
-
let hash = handle.hash_new_capa(m.len());
|
285
|
-
|
286
|
-
#[cfg(not(ruby_lt_3_2))]
|
287
|
-
let hash = handle.hash_new();
|
288
|
-
|
289
|
-
m.into_iter().try_for_each(|(k, v)| {
|
290
|
-
hash.aset(
|
291
|
-
k.try_into_value_with(handle)?,
|
292
|
-
v.try_into_value_with(handle)?,
|
293
|
-
)?;
|
294
|
-
Ok::<_, ParquetGemError>(())
|
295
|
-
})?;
|
296
|
-
Ok(hash.into_value_with(handle))
|
297
|
-
}
|
298
|
-
ParquetValue::Null => Ok(handle.qnil().as_value()),
|
299
|
-
}
|
300
|
-
}
|
301
|
-
}
|
302
|
-
|
303
|
-
impl ParquetValue {
|
304
|
-
pub fn from_value(
|
305
|
-
ruby: &Ruby,
|
306
|
-
value: Value,
|
307
|
-
type_: &ParquetSchemaType,
|
308
|
-
format: Option<&str>,
|
309
|
-
) -> Result<Self, MagnusError> {
|
310
|
-
if value.is_nil() {
|
311
|
-
return Ok(ParquetValue::Null);
|
312
|
-
}
|
313
|
-
|
314
|
-
match type_ {
|
315
|
-
ParquetSchemaType::Primitive(primative) => match primative {
|
316
|
-
PrimitiveType::Int8 => {
|
317
|
-
let v = NumericConverter::<i8>::convert_with_string_fallback(ruby, value)?;
|
318
|
-
Ok(ParquetValue::Int8(v))
|
319
|
-
}
|
320
|
-
PrimitiveType::Int16 => {
|
321
|
-
let v = NumericConverter::<i16>::convert_with_string_fallback(ruby, value)?;
|
322
|
-
Ok(ParquetValue::Int16(v))
|
323
|
-
}
|
324
|
-
PrimitiveType::Int32 => {
|
325
|
-
let v = NumericConverter::<i32>::convert_with_string_fallback(ruby, value)?;
|
326
|
-
Ok(ParquetValue::Int32(v))
|
327
|
-
}
|
328
|
-
PrimitiveType::Int64 => {
|
329
|
-
let v = NumericConverter::<i64>::convert_with_string_fallback(ruby, value)?;
|
330
|
-
Ok(ParquetValue::Int64(v))
|
331
|
-
}
|
332
|
-
PrimitiveType::UInt8 => {
|
333
|
-
let v = NumericConverter::<u8>::convert_with_string_fallback(ruby, value)?;
|
334
|
-
Ok(ParquetValue::UInt8(v))
|
335
|
-
}
|
336
|
-
PrimitiveType::UInt16 => {
|
337
|
-
let v = NumericConverter::<u16>::convert_with_string_fallback(ruby, value)?;
|
338
|
-
Ok(ParquetValue::UInt16(v))
|
339
|
-
}
|
340
|
-
PrimitiveType::UInt32 => {
|
341
|
-
let v = NumericConverter::<u32>::convert_with_string_fallback(ruby, value)?;
|
342
|
-
Ok(ParquetValue::UInt32(v))
|
343
|
-
}
|
344
|
-
PrimitiveType::UInt64 => {
|
345
|
-
let v = NumericConverter::<u64>::convert_with_string_fallback(ruby, value)?;
|
346
|
-
Ok(ParquetValue::UInt64(v))
|
347
|
-
}
|
348
|
-
PrimitiveType::Float32 => {
|
349
|
-
let v = NumericConverter::<f32>::convert_with_string_fallback(ruby, value)?;
|
350
|
-
Ok(ParquetValue::Float32(v))
|
351
|
-
}
|
352
|
-
PrimitiveType::Float64 => {
|
353
|
-
let v = NumericConverter::<f64>::convert_with_string_fallback(ruby, value)?;
|
354
|
-
Ok(ParquetValue::Float64(v))
|
355
|
-
}
|
356
|
-
PrimitiveType::Decimal128(_precision, scale) => {
|
357
|
-
if value.is_kind_of(ruby.class_string()) {
|
358
|
-
convert_to_decimal(value, *scale)
|
359
|
-
} else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
|
360
|
-
convert_to_decimal(s.as_value(), *scale)
|
361
|
-
} else {
|
362
|
-
Err(MagnusError::new(
|
363
|
-
magnus::exception::type_error(),
|
364
|
-
"Expected a string for a decimal type",
|
365
|
-
))
|
366
|
-
}
|
367
|
-
}
|
368
|
-
PrimitiveType::Decimal256(_precision, scale) => {
|
369
|
-
if value.is_kind_of(ruby.class_string()) {
|
370
|
-
convert_to_decimal(value, *scale)
|
371
|
-
} else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
|
372
|
-
convert_to_decimal(s.as_value(), *scale)
|
373
|
-
} else {
|
374
|
-
Err(MagnusError::new(
|
375
|
-
magnus::exception::type_error(),
|
376
|
-
"Expected a string for a decimal type",
|
377
|
-
))
|
378
|
-
}
|
379
|
-
}
|
380
|
-
PrimitiveType::String => {
|
381
|
-
let v = convert_to_string(value)?;
|
382
|
-
Ok(ParquetValue::String(v))
|
383
|
-
}
|
384
|
-
PrimitiveType::Binary => {
|
385
|
-
let v = convert_to_binary(value)?;
|
386
|
-
Ok(ParquetValue::Bytes(v))
|
387
|
-
}
|
388
|
-
PrimitiveType::Boolean => {
|
389
|
-
let v = convert_to_boolean(ruby, value)?;
|
390
|
-
Ok(ParquetValue::Boolean(v))
|
391
|
-
}
|
392
|
-
PrimitiveType::Date32 => {
|
393
|
-
let v = convert_to_date32(ruby, value, format)?;
|
394
|
-
Ok(ParquetValue::Date32(v))
|
395
|
-
}
|
396
|
-
PrimitiveType::TimestampMillis => {
|
397
|
-
if value.is_kind_of(ruby.class_time()) {
|
398
|
-
use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
|
399
|
-
let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
|
400
|
-
Ok(ParquetValue::TimestampMillis(v, tz))
|
401
|
-
} else {
|
402
|
-
let v = convert_to_timestamp_millis(ruby, value, format)?;
|
403
|
-
Ok(ParquetValue::TimestampMillis(v, None))
|
404
|
-
}
|
405
|
-
}
|
406
|
-
PrimitiveType::TimestampMicros => {
|
407
|
-
if value.is_kind_of(ruby.class_time()) {
|
408
|
-
use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
|
409
|
-
let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
|
410
|
-
Ok(ParquetValue::TimestampMicros(v, tz))
|
411
|
-
} else {
|
412
|
-
let v = convert_to_timestamp_micros(ruby, value, format)?;
|
413
|
-
Ok(ParquetValue::TimestampMicros(v, None))
|
414
|
-
}
|
415
|
-
}
|
416
|
-
PrimitiveType::TimeMillis => {
|
417
|
-
let v = convert_to_time_millis(ruby, value, format)?;
|
418
|
-
Ok(ParquetValue::TimeMillis(v))
|
419
|
-
}
|
420
|
-
PrimitiveType::TimeMicros => {
|
421
|
-
let v = convert_to_time_micros(ruby, value, format)?;
|
422
|
-
Ok(ParquetValue::TimeMicros(v))
|
423
|
-
}
|
424
|
-
},
|
425
|
-
ParquetSchemaType::List(list_field) => {
|
426
|
-
// We expect the Ruby object to be an Array, each item converting
|
427
|
-
// to the item_type. We gather them into ParquetValue::List(...)
|
428
|
-
let array = RArray::from_value(value).ok_or_else(|| {
|
429
|
-
// Just get a simple string representation of the class
|
430
|
-
let type_info = format!("{:?}", value.class());
|
431
|
-
|
432
|
-
MagnusError::new(
|
433
|
-
magnus::exception::type_error(),
|
434
|
-
format!(
|
435
|
-
"Value must be an Array for a list type, got {} instead",
|
436
|
-
type_info
|
437
|
-
),
|
438
|
-
)
|
439
|
-
})?;
|
440
|
-
let mut items = Vec::with_capacity(array.len());
|
441
|
-
for (index, item_val) in array.into_iter().enumerate() {
|
442
|
-
match ParquetValue::from_value(
|
443
|
-
ruby,
|
444
|
-
item_val,
|
445
|
-
&list_field.item_type,
|
446
|
-
list_field.format,
|
447
|
-
) {
|
448
|
-
Ok(child_val) => items.push(child_val),
|
449
|
-
Err(e) => {
|
450
|
-
// Enhance the error with the item index
|
451
|
-
return Err(MagnusError::new(
|
452
|
-
magnus::exception::type_error(),
|
453
|
-
format!("Failed to convert item at index {} of list: {}", index, e),
|
454
|
-
));
|
455
|
-
}
|
456
|
-
}
|
457
|
-
}
|
458
|
-
Ok(ParquetValue::List(items))
|
459
|
-
}
|
460
|
-
ParquetSchemaType::Map(map_field) => {
|
461
|
-
// We expect the Ruby object to be a Hash
|
462
|
-
let hash_pairs: Vec<(Value, Value)> = value.funcall("to_a", ())?;
|
463
|
-
let mut result = HashMap::with_capacity(hash_pairs.len());
|
464
|
-
for (k, v) in hash_pairs {
|
465
|
-
let key_val = ParquetValue::from_value(
|
466
|
-
ruby,
|
467
|
-
k,
|
468
|
-
&map_field.key_type,
|
469
|
-
map_field.key_format,
|
470
|
-
)?;
|
471
|
-
let val_val = ParquetValue::from_value(
|
472
|
-
ruby,
|
473
|
-
v,
|
474
|
-
&map_field.value_type,
|
475
|
-
map_field.value_format,
|
476
|
-
)?;
|
477
|
-
result.insert(key_val, val_val);
|
478
|
-
}
|
479
|
-
Ok(ParquetValue::Map(result))
|
480
|
-
}
|
481
|
-
ParquetSchemaType::Struct(struct_field) => {
|
482
|
-
// We expect a Ruby hash or object that responds to to_h
|
483
|
-
let hash_obj = if value.respond_to("to_h", false)? {
|
484
|
-
value.funcall::<_, _, Value>("to_h", ())?
|
485
|
-
} else {
|
486
|
-
return Err(MagnusError::new(
|
487
|
-
magnus::exception::type_error(),
|
488
|
-
"Value must be a Hash or respond to to_h for a struct type",
|
489
|
-
));
|
490
|
-
};
|
491
|
-
|
492
|
-
let mut result = HashMap::new();
|
493
|
-
|
494
|
-
// For each field in the struct definition, try to find a matching key in the hash
|
495
|
-
for field in &struct_field.fields {
|
496
|
-
let field_name = ParquetValue::String(field.name.clone());
|
497
|
-
let ruby_field_name = ruby.str_new(&field.name).as_value();
|
498
|
-
|
499
|
-
// Try to get the field value using Ruby's [] method
|
500
|
-
let field_value_obj =
|
501
|
-
hash_obj.funcall::<_, _, Value>("[]", (ruby_field_name,))?;
|
502
|
-
|
503
|
-
let field_value = if field_value_obj.is_nil() {
|
504
|
-
ParquetValue::Null // Field not provided or nil, treat as null
|
505
|
-
} else {
|
506
|
-
ParquetValue::from_value(
|
507
|
-
ruby,
|
508
|
-
field_value_obj,
|
509
|
-
&field.type_,
|
510
|
-
field.format.as_deref(),
|
511
|
-
)?
|
512
|
-
};
|
513
|
-
|
514
|
-
result.insert(field_name, field_value);
|
515
|
-
}
|
516
|
-
|
517
|
-
// Use Map to represent a struct since it's a collection of named values
|
518
|
-
Ok(ParquetValue::Map(result))
|
519
|
-
}
|
520
|
-
}
|
521
|
-
}
|
522
|
-
}
|
523
|
-
|
524
|
-
enum ParsedDecimal {
|
525
|
-
Int128(i128),
|
526
|
-
Int256(arrow_buffer::i256),
|
527
|
-
}
|
528
|
-
|
529
|
-
/// Unified helper to parse a decimal string and apply scaling
|
530
|
-
fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
|
531
|
-
let s = input_str.trim();
|
532
|
-
|
533
|
-
// 1. Handle scientific notation case (e.g., "0.12345e3")
|
534
|
-
if let Some(e_pos) = s.to_lowercase().find('e') {
|
535
|
-
let base = &s[0..e_pos];
|
536
|
-
let exp = &s[e_pos + 1..];
|
537
|
-
|
538
|
-
// Parse the exponent with detailed error message
|
539
|
-
let exp_val = exp.parse::<i32>().map_err(|e| {
|
540
|
-
MagnusError::new(
|
541
|
-
magnus::exception::type_error(),
|
542
|
-
format!(
|
543
|
-
"Failed to parse exponent '{}' in decimal string '{}': {}",
|
544
|
-
exp, s, e
|
545
|
-
),
|
546
|
-
)
|
547
|
-
})?;
|
548
|
-
|
549
|
-
// For very large exponents, we'll need to use BigInt
|
550
|
-
if exp_val.abs() > 38 {
|
551
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
552
|
-
}
|
553
|
-
|
554
|
-
// Handle the base part which might contain a decimal point
|
555
|
-
let (base_val, base_scale) = if let Some(decimal_pos) = base.find('.') {
|
556
|
-
let mut base_without_point = base.to_string();
|
557
|
-
base_without_point.remove(decimal_pos);
|
558
|
-
|
559
|
-
let base_scale = base.len() - decimal_pos - 1;
|
560
|
-
|
561
|
-
// Try to parse as i128 first
|
562
|
-
match base_without_point.parse::<i128>() {
|
563
|
-
Ok(v) => (v, base_scale as i32),
|
564
|
-
Err(_) => {
|
565
|
-
// Value too large for i128, use BigInt
|
566
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
567
|
-
}
|
568
|
-
}
|
569
|
-
} else {
|
570
|
-
// No decimal point in base
|
571
|
-
match base.parse::<i128>() {
|
572
|
-
Ok(v) => (v, 0),
|
573
|
-
Err(_) => {
|
574
|
-
// Value too large for i128, use BigInt
|
575
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
576
|
-
}
|
577
|
-
}
|
578
|
-
};
|
579
|
-
|
580
|
-
// Calculate the effective scale: base_scale - exp_val
|
581
|
-
let effective_scale = base_scale - exp_val;
|
582
|
-
|
583
|
-
// Adjust the value based on the difference between effective scale and requested scale
|
584
|
-
match effective_scale.cmp(&(input_scale as i32)) {
|
585
|
-
std::cmp::Ordering::Less => {
|
586
|
-
// Need to multiply to increase scale
|
587
|
-
let scale_diff = (input_scale as i32 - effective_scale) as u32;
|
588
|
-
if scale_diff > 38 {
|
589
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
590
|
-
}
|
591
|
-
|
592
|
-
// Check for overflow
|
593
|
-
match base_val.checked_mul(10_i128.pow(scale_diff)) {
|
594
|
-
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
595
|
-
None => parse_large_decimal_with_bigint(s, input_scale),
|
596
|
-
}
|
597
|
-
}
|
598
|
-
std::cmp::Ordering::Greater => {
|
599
|
-
// Need to divide to decrease scale
|
600
|
-
let scale_diff = (effective_scale - input_scale as i32) as u32;
|
601
|
-
if scale_diff > 38 {
|
602
|
-
return Err(MagnusError::new(
|
603
|
-
magnus::exception::range_error(),
|
604
|
-
format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
|
605
|
-
));
|
606
|
-
}
|
607
|
-
Ok(ParsedDecimal::Int128(base_val / 10_i128.pow(scale_diff)))
|
608
|
-
}
|
609
|
-
std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(base_val)),
|
610
|
-
}
|
611
|
-
}
|
612
|
-
// 2. Handle decimal point in the string (e.g., "123.456")
|
613
|
-
else if let Some(decimal_pos) = s.find('.') {
|
614
|
-
let mut s_without_point = s.to_string();
|
615
|
-
s_without_point.remove(decimal_pos);
|
616
|
-
|
617
|
-
// Calculate the actual scale from the decimal position
|
618
|
-
let actual_scale = s.len() - decimal_pos - 1;
|
619
|
-
|
620
|
-
// Try to parse as i128 first
|
621
|
-
let v = match s_without_point.parse::<i128>() {
|
622
|
-
Ok(v) => v,
|
623
|
-
Err(_) => {
|
624
|
-
// Value too large for i128, use BigInt
|
625
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
626
|
-
}
|
627
|
-
};
|
628
|
-
|
629
|
-
// Scale the value if needed based on the difference between
|
630
|
-
// the actual scale and the requested scale
|
631
|
-
match actual_scale.cmp(&(input_scale as usize)) {
|
632
|
-
std::cmp::Ordering::Less => {
|
633
|
-
// Need to multiply to increase scale
|
634
|
-
let scale_diff = (input_scale - actual_scale as i8) as u32;
|
635
|
-
if scale_diff > 38 {
|
636
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
637
|
-
}
|
638
|
-
|
639
|
-
// Check for overflow
|
640
|
-
match v.checked_mul(10_i128.pow(scale_diff)) {
|
641
|
-
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
642
|
-
None => parse_large_decimal_with_bigint(s, input_scale),
|
643
|
-
}
|
644
|
-
}
|
645
|
-
std::cmp::Ordering::Greater => {
|
646
|
-
// Need to divide to decrease scale
|
647
|
-
let scale_diff = (actual_scale as i8 - input_scale) as u32;
|
648
|
-
if scale_diff > 38 {
|
649
|
-
return Err(MagnusError::new(
|
650
|
-
magnus::exception::range_error(),
|
651
|
-
format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
|
652
|
-
));
|
653
|
-
}
|
654
|
-
Ok(ParsedDecimal::Int128(v / 10_i128.pow(scale_diff)))
|
655
|
-
}
|
656
|
-
std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(v)),
|
657
|
-
}
|
658
|
-
}
|
659
|
-
// 3. Plain integer value (e.g., "12345")
|
660
|
-
else {
|
661
|
-
// No decimal point, try to parse as i128 first
|
662
|
-
let v = match s.parse::<i128>() {
|
663
|
-
Ok(v) => v,
|
664
|
-
Err(_) => {
|
665
|
-
// Value too large for i128, use BigInt
|
666
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
667
|
-
}
|
668
|
-
};
|
669
|
-
|
670
|
-
// Apply scale - make sure it's reasonable
|
671
|
-
if input_scale > 38 {
|
672
|
-
return parse_large_decimal_with_bigint(s, input_scale);
|
673
|
-
} else if input_scale < -38 {
|
674
|
-
return Err(MagnusError::new(
|
675
|
-
magnus::exception::range_error(),
|
676
|
-
format!(
|
677
|
-
"Scale {} is too small for decimal value '{}'. Must be ≥ -38.",
|
678
|
-
input_scale, s
|
679
|
-
),
|
680
|
-
));
|
681
|
-
}
|
682
|
-
|
683
|
-
// Apply positive scale (multiply)
|
684
|
-
if input_scale >= 0 {
|
685
|
-
match v.checked_mul(10_i128.pow(input_scale as u32)) {
|
686
|
-
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
687
|
-
None => parse_large_decimal_with_bigint(s, input_scale),
|
688
|
-
}
|
689
|
-
} else {
|
690
|
-
// Apply negative scale (divide)
|
691
|
-
Ok(ParsedDecimal::Int128(
|
692
|
-
v / 10_i128.pow((-input_scale) as u32),
|
693
|
-
))
|
694
|
-
}
|
695
|
-
}
|
696
|
-
}
|
697
|
-
|
698
|
-
/// Parse large decimal values using BigInt when they would overflow i128
|
699
|
-
fn parse_large_decimal_with_bigint(s: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
|
700
|
-
use num::BigInt;
|
701
|
-
use std::str::FromStr;
|
702
|
-
|
703
|
-
// Parse the input string as a BigInt
|
704
|
-
let bigint = if let Some(e_pos) = s.to_lowercase().find('e') {
|
705
|
-
// Handle scientific notation
|
706
|
-
let base = &s[0..e_pos];
|
707
|
-
let exp = &s[e_pos + 1..];
|
708
|
-
|
709
|
-
let exp_val = exp.parse::<i32>().map_err(|e| {
|
710
|
-
MagnusError::new(
|
711
|
-
magnus::exception::type_error(),
|
712
|
-
format!("Failed to parse exponent '{}': {}", exp, e),
|
713
|
-
)
|
714
|
-
})?;
|
715
|
-
|
716
|
-
// Parse base as BigInt
|
717
|
-
let base_bigint = if let Some(decimal_pos) = base.find('.') {
|
718
|
-
let mut base_without_point = base.to_string();
|
719
|
-
base_without_point.remove(decimal_pos);
|
720
|
-
let base_scale = base.len() - decimal_pos - 1;
|
721
|
-
|
722
|
-
let bigint = BigInt::from_str(&base_without_point).map_err(|e| {
|
723
|
-
MagnusError::new(
|
724
|
-
magnus::exception::type_error(),
|
725
|
-
format!("Failed to parse decimal base '{}': {}", base, e),
|
726
|
-
)
|
727
|
-
})?;
|
728
|
-
|
729
|
-
// Adjust for the decimal point
|
730
|
-
let effective_exp = exp_val - base_scale as i32;
|
731
|
-
|
732
|
-
if effective_exp > 0 {
|
733
|
-
bigint * BigInt::from(10).pow(effective_exp as u32)
|
734
|
-
} else if effective_exp < 0 {
|
735
|
-
bigint / BigInt::from(10).pow((-effective_exp) as u32)
|
736
|
-
} else {
|
737
|
-
bigint
|
738
|
-
}
|
739
|
-
} else {
|
740
|
-
let bigint = BigInt::from_str(base).map_err(|e| {
|
741
|
-
MagnusError::new(
|
742
|
-
magnus::exception::type_error(),
|
743
|
-
format!("Failed to parse decimal base '{}': {}", base, e),
|
744
|
-
)
|
745
|
-
})?;
|
746
|
-
|
747
|
-
if exp_val > 0 {
|
748
|
-
bigint * BigInt::from(10).pow(exp_val as u32)
|
749
|
-
} else if exp_val < 0 {
|
750
|
-
bigint / BigInt::from(10).pow((-exp_val) as u32)
|
751
|
-
} else {
|
752
|
-
bigint
|
753
|
-
}
|
754
|
-
};
|
755
|
-
|
756
|
-
base_bigint
|
757
|
-
} else if let Some(decimal_pos) = s.find('.') {
|
758
|
-
// Handle decimal point
|
759
|
-
let mut s_without_point = s.to_string();
|
760
|
-
s_without_point.remove(decimal_pos);
|
761
|
-
|
762
|
-
let actual_scale = s.len() - decimal_pos - 1;
|
763
|
-
let bigint = BigInt::from_str(&s_without_point).map_err(|e| {
|
764
|
-
MagnusError::new(
|
765
|
-
magnus::exception::type_error(),
|
766
|
-
format!("Failed to parse decimal string '{}': {}", s, e),
|
767
|
-
)
|
768
|
-
})?;
|
769
|
-
|
770
|
-
// Adjust for scale difference
|
771
|
-
let scale_diff = actual_scale as i8 - input_scale;
|
772
|
-
|
773
|
-
if scale_diff > 0 {
|
774
|
-
bigint / BigInt::from(10).pow(scale_diff as u32)
|
775
|
-
} else if scale_diff < 0 {
|
776
|
-
bigint * BigInt::from(10).pow((-scale_diff) as u32)
|
777
|
-
} else {
|
778
|
-
bigint
|
779
|
-
}
|
780
|
-
} else {
|
781
|
-
// Plain integer
|
782
|
-
let bigint = BigInt::from_str(s).map_err(|e| {
|
783
|
-
MagnusError::new(
|
784
|
-
magnus::exception::type_error(),
|
785
|
-
format!("Failed to parse integer string '{}': {}", s, e),
|
786
|
-
)
|
787
|
-
})?;
|
788
|
-
|
789
|
-
if input_scale > 0 {
|
790
|
-
bigint * BigInt::from(10).pow(input_scale as u32)
|
791
|
-
} else if input_scale < 0 {
|
792
|
-
bigint / BigInt::from(10).pow((-input_scale) as u32)
|
793
|
-
} else {
|
794
|
-
bigint
|
795
|
-
}
|
796
|
-
};
|
797
|
-
|
798
|
-
// Convert BigInt to bytes and then to i256
|
799
|
-
let bytes = bigint.to_signed_bytes_le();
|
800
|
-
|
801
|
-
if bytes.len() <= 16 {
|
802
|
-
// Fits in i128
|
803
|
-
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
804
|
-
[0xff; 16]
|
805
|
-
} else {
|
806
|
-
[0; 16]
|
807
|
-
};
|
808
|
-
buf[..bytes.len()].copy_from_slice(&bytes);
|
809
|
-
|
810
|
-
Ok(ParsedDecimal::Int128(i128::from_le_bytes(buf)))
|
811
|
-
} else if bytes.len() <= 32 {
|
812
|
-
// Fits in i256
|
813
|
-
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
814
|
-
[0xff; 32]
|
815
|
-
} else {
|
816
|
-
[0; 32]
|
817
|
-
};
|
818
|
-
buf[..bytes.len()].copy_from_slice(&bytes);
|
819
|
-
|
820
|
-
Ok(ParsedDecimal::Int256(arrow_buffer::i256::from_le_bytes(
|
821
|
-
buf,
|
822
|
-
)))
|
823
|
-
} else {
|
824
|
-
Err(MagnusError::new(
|
825
|
-
magnus::exception::range_error(),
|
826
|
-
format!("Decimal value '{}' is too large to fit in 256 bits", s),
|
827
|
-
))
|
828
|
-
}
|
829
|
-
}
|
830
|
-
|
831
|
-
fn convert_to_decimal(value: Value, scale: i8) -> Result<ParquetValue, MagnusError> {
|
832
|
-
// Get the decimal string based on the type of value
|
833
|
-
let s = if unsafe { value.classname() } == "BigDecimal" {
|
834
|
-
value
|
835
|
-
.funcall::<_, _, RString>("to_s", ("F",))?
|
836
|
-
.to_string()?
|
837
|
-
} else {
|
838
|
-
value.to_r_string()?.to_string()?
|
839
|
-
};
|
840
|
-
|
841
|
-
// Use our unified parser to convert the string to a decimal value with scaling
|
842
|
-
match parse_decimal_string(&s, scale) {
|
843
|
-
Ok(decimal_value) => match decimal_value {
|
844
|
-
ParsedDecimal::Int128(v) => Ok(ParquetValue::Decimal128(v, scale)),
|
845
|
-
ParsedDecimal::Int256(v) => Ok(ParquetValue::Decimal256(v, scale)),
|
846
|
-
},
|
847
|
-
Err(e) => Err(MagnusError::new(
|
848
|
-
magnus::exception::type_error(),
|
849
|
-
format!(
|
850
|
-
"Failed to convert '{}' to decimal with scale {}: {}",
|
851
|
-
s, scale, e
|
852
|
-
),
|
853
|
-
)),
|
854
|
-
}
|
855
|
-
}
|
856
|
-
|
857
|
-
#[derive(Debug)]
|
858
|
-
pub struct ParquetValueVec(Vec<ParquetValue>);
|
859
|
-
|
860
|
-
impl ParquetValueVec {
|
861
|
-
pub fn into_inner(self) -> Vec<ParquetValue> {
|
862
|
-
self.0
|
863
|
-
}
|
864
|
-
}
|
865
|
-
|
866
|
-
impl IntoIterator for ParquetValueVec {
|
867
|
-
type Item = ParquetValue;
|
868
|
-
type IntoIter = std::vec::IntoIter<ParquetValue>;
|
869
|
-
|
870
|
-
fn into_iter(self) -> Self::IntoIter {
|
871
|
-
self.0.into_iter()
|
872
|
-
}
|
873
|
-
}
|
874
|
-
|
875
|
-
impl std::cmp::PartialEq for ParquetValueVec {
|
876
|
-
fn eq(&self, other: &Self) -> bool {
|
877
|
-
self.0 == other.0
|
878
|
-
}
|
879
|
-
}
|
880
|
-
|
881
|
-
impl std::cmp::Eq for ParquetValueVec {}
|
882
|
-
|
883
|
-
/// Downcast `$column` to `$array_type` and wrap every element in
/// `ParquetValue::$variant`, producing `Ok(ParquetValueVec)`.
///
/// The per-row null check is only performed when the array reports that it is
/// nullable; null rows become `ParquetValue::Null`.
macro_rules! impl_numeric_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        let converted: Vec<ParquetValue> = if typed.is_nullable() {
            let mut out = Vec::with_capacity(typed.values().len());
            for (row, raw) in typed.values().iter().enumerate() {
                out.push(if typed.is_null(row) {
                    ParquetValue::Null
                } else {
                    ParquetValue::$variant(*raw)
                });
            }
            out
        } else {
            // No nulls reported: every slot holds a real value.
            typed
                .values()
                .iter()
                .map(|raw| ParquetValue::$variant(*raw))
                .collect()
        };
        Ok(ParquetValueVec(converted))
    }};
}
|
908
|
-
/// Downcast `$column` to `$array_type` and wrap every element in
/// `ParquetValue::$variant`, producing `Ok(ParquetValueVec)`.
///
/// Unlike the numeric variant, the value iterator yields items by value, so no
/// dereference is applied. Null rows (checked only when the array reports that it
/// is nullable) become `ParquetValue::Null`.
macro_rules! impl_boolean_array_conversion {
    ($column:expr, $array_type:ty, $variant:ident) => {{
        let typed = downcast_array::<$array_type>($column);
        let converted: Vec<ParquetValue> = if typed.is_nullable() {
            let mut out = Vec::with_capacity(typed.values().len());
            for (row, flag) in typed.values().iter().enumerate() {
                out.push(if typed.is_null(row) {
                    ParquetValue::Null
                } else {
                    ParquetValue::$variant(flag)
                });
            }
            out
        } else {
            // No nulls reported: every slot holds a real value.
            typed
                .values()
                .iter()
                .map(|flag| ParquetValue::$variant(flag))
                .collect()
        };
        Ok(ParquetValueVec(converted))
    }};
}
|
933
|
-
|
934
|
-
pub struct ArrayWrapper<'a> {
|
935
|
-
pub array: &'a dyn Array,
|
936
|
-
pub strict: bool,
|
937
|
-
}
|
938
|
-
|
939
|
-
impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
|
940
|
-
type Error = ParquetGemError;
|
941
|
-
|
942
|
-
fn try_from(column: ArrayWrapper<'a>) -> Result<Self, Self::Error> {
|
943
|
-
match column.array.data_type() {
|
944
|
-
DataType::Boolean => {
|
945
|
-
impl_boolean_array_conversion!(column.array, BooleanArray, Boolean)
|
946
|
-
}
|
947
|
-
DataType::Int8 => impl_numeric_array_conversion!(column.array, Int8Array, Int8),
|
948
|
-
DataType::Int16 => impl_numeric_array_conversion!(column.array, Int16Array, Int16),
|
949
|
-
DataType::Int32 => impl_numeric_array_conversion!(column.array, Int32Array, Int32),
|
950
|
-
DataType::Int64 => impl_numeric_array_conversion!(column.array, Int64Array, Int64),
|
951
|
-
DataType::UInt8 => impl_numeric_array_conversion!(column.array, UInt8Array, UInt8),
|
952
|
-
DataType::UInt16 => impl_numeric_array_conversion!(column.array, UInt16Array, UInt16),
|
953
|
-
DataType::UInt32 => impl_numeric_array_conversion!(column.array, UInt32Array, UInt32),
|
954
|
-
DataType::UInt64 => impl_numeric_array_conversion!(column.array, UInt64Array, UInt64),
|
955
|
-
DataType::Float32 => {
|
956
|
-
impl_numeric_array_conversion!(column.array, Float32Array, Float32)
|
957
|
-
}
|
958
|
-
DataType::Float64 => {
|
959
|
-
impl_numeric_array_conversion!(column.array, Float64Array, Float64)
|
960
|
-
}
|
961
|
-
DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
|
962
|
-
DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
|
963
|
-
DataType::Decimal128(_precision, scale) => {
|
964
|
-
let array = downcast_array::<Decimal128Array>(column.array);
|
965
|
-
Ok(ParquetValueVec(if array.is_nullable() {
|
966
|
-
array
|
967
|
-
.values()
|
968
|
-
.iter()
|
969
|
-
.enumerate()
|
970
|
-
.map(|(i, x)| {
|
971
|
-
if array.is_null(i) {
|
972
|
-
ParquetValue::Null
|
973
|
-
} else {
|
974
|
-
ParquetValue::Decimal128(*x, *scale)
|
975
|
-
}
|
976
|
-
})
|
977
|
-
.collect()
|
978
|
-
} else {
|
979
|
-
array
|
980
|
-
.values()
|
981
|
-
.iter()
|
982
|
-
.map(|x| ParquetValue::Decimal128(*x, *scale))
|
983
|
-
.collect()
|
984
|
-
}))
|
985
|
-
}
|
986
|
-
DataType::Decimal256(_precision, scale) => {
|
987
|
-
let array = downcast_array::<Decimal256Array>(column.array);
|
988
|
-
Ok(ParquetValueVec(if array.is_nullable() {
|
989
|
-
array
|
990
|
-
.values()
|
991
|
-
.iter()
|
992
|
-
.enumerate()
|
993
|
-
.map(|(i, x)| {
|
994
|
-
if array.is_null(i) {
|
995
|
-
ParquetValue::Null
|
996
|
-
} else {
|
997
|
-
ParquetValue::Decimal256(*x, *scale)
|
998
|
-
}
|
999
|
-
})
|
1000
|
-
.collect()
|
1001
|
-
} else {
|
1002
|
-
array
|
1003
|
-
.values()
|
1004
|
-
.iter()
|
1005
|
-
.map(|x| ParquetValue::Decimal256(*x, *scale))
|
1006
|
-
.collect()
|
1007
|
-
}))
|
1008
|
-
}
|
1009
|
-
DataType::Timestamp(TimeUnit::Second, tz) => {
|
1010
|
-
impl_timestamp_array_conversion!(
|
1011
|
-
column.array,
|
1012
|
-
TimestampSecondArray,
|
1013
|
-
TimestampSecond,
|
1014
|
-
tz
|
1015
|
-
)
|
1016
|
-
}
|
1017
|
-
DataType::Timestamp(TimeUnit::Millisecond, tz) => {
|
1018
|
-
impl_timestamp_array_conversion!(
|
1019
|
-
column.array,
|
1020
|
-
TimestampMillisecondArray,
|
1021
|
-
TimestampMillis,
|
1022
|
-
tz
|
1023
|
-
)
|
1024
|
-
}
|
1025
|
-
DataType::Timestamp(TimeUnit::Microsecond, tz) => {
|
1026
|
-
impl_timestamp_array_conversion!(
|
1027
|
-
column.array,
|
1028
|
-
TimestampMicrosecondArray,
|
1029
|
-
TimestampMicros,
|
1030
|
-
tz
|
1031
|
-
)
|
1032
|
-
}
|
1033
|
-
DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
|
1034
|
-
impl_timestamp_array_conversion!(
|
1035
|
-
column.array,
|
1036
|
-
TimestampNanosecondArray,
|
1037
|
-
TimestampNanos,
|
1038
|
-
tz
|
1039
|
-
)
|
1040
|
-
}
|
1041
|
-
DataType::Time32(TimeUnit::Millisecond) => {
|
1042
|
-
let array = downcast_array::<Time32MillisecondArray>(column.array);
|
1043
|
-
Ok(ParquetValueVec(if array.is_nullable() {
|
1044
|
-
array
|
1045
|
-
.values()
|
1046
|
-
.iter()
|
1047
|
-
.enumerate()
|
1048
|
-
.map(|(i, x)| {
|
1049
|
-
if array.is_null(i) {
|
1050
|
-
ParquetValue::Null
|
1051
|
-
} else {
|
1052
|
-
ParquetValue::TimeMillis(*x)
|
1053
|
-
}
|
1054
|
-
})
|
1055
|
-
.collect()
|
1056
|
-
} else {
|
1057
|
-
array
|
1058
|
-
.values()
|
1059
|
-
.iter()
|
1060
|
-
.map(|x| ParquetValue::TimeMillis(*x))
|
1061
|
-
.collect()
|
1062
|
-
}))
|
1063
|
-
}
|
1064
|
-
DataType::Time64(TimeUnit::Microsecond) => {
|
1065
|
-
let array = downcast_array::<Time64MicrosecondArray>(column.array);
|
1066
|
-
Ok(ParquetValueVec(if array.is_nullable() {
|
1067
|
-
array
|
1068
|
-
.values()
|
1069
|
-
.iter()
|
1070
|
-
.enumerate()
|
1071
|
-
.map(|(i, x)| {
|
1072
|
-
if array.is_null(i) {
|
1073
|
-
ParquetValue::Null
|
1074
|
-
} else {
|
1075
|
-
ParquetValue::TimeMicros(*x)
|
1076
|
-
}
|
1077
|
-
})
|
1078
|
-
.collect()
|
1079
|
-
} else {
|
1080
|
-
array
|
1081
|
-
.values()
|
1082
|
-
.iter()
|
1083
|
-
.map(|x| ParquetValue::TimeMicros(*x))
|
1084
|
-
.collect()
|
1085
|
-
}))
|
1086
|
-
}
|
1087
|
-
DataType::Float16 => {
|
1088
|
-
let array = downcast_array::<Float16Array>(column.array);
|
1089
|
-
if array.is_nullable() {
|
1090
|
-
Ok(ParquetValueVec(
|
1091
|
-
array
|
1092
|
-
.values()
|
1093
|
-
.iter()
|
1094
|
-
.enumerate()
|
1095
|
-
.map(|(i, x)| {
|
1096
|
-
if array.is_null(i) {
|
1097
|
-
ParquetValue::Null
|
1098
|
-
} else {
|
1099
|
-
ParquetValue::Float16(f32::from(*x))
|
1100
|
-
}
|
1101
|
-
})
|
1102
|
-
.collect(),
|
1103
|
-
))
|
1104
|
-
} else {
|
1105
|
-
Ok(ParquetValueVec(
|
1106
|
-
array
|
1107
|
-
.values()
|
1108
|
-
.iter()
|
1109
|
-
.map(|x| ParquetValue::Float16(f32::from(*x)))
|
1110
|
-
.collect(),
|
1111
|
-
))
|
1112
|
-
}
|
1113
|
-
}
|
1114
|
-
DataType::Utf8 => {
|
1115
|
-
let array = downcast_array::<StringArray>(column.array);
|
1116
|
-
let mut tmp_vec = Vec::with_capacity(array.len());
|
1117
|
-
let iter = array.iter().map(|opt_x| match opt_x {
|
1118
|
-
Some(x) => {
|
1119
|
-
if column.strict {
|
1120
|
-
Ok::<_, ParquetGemError>(ParquetValue::String(
|
1121
|
-
simdutf8::basic::from_utf8(x.as_bytes())?.to_string(),
|
1122
|
-
))
|
1123
|
-
} else {
|
1124
|
-
Ok::<_, ParquetGemError>(ParquetValue::String(x.to_string()))
|
1125
|
-
}
|
1126
|
-
}
|
1127
|
-
None => Ok(ParquetValue::Null),
|
1128
|
-
});
|
1129
|
-
for x in iter {
|
1130
|
-
tmp_vec.push(x?);
|
1131
|
-
}
|
1132
|
-
Ok(ParquetValueVec(tmp_vec))
|
1133
|
-
}
|
1134
|
-
DataType::Binary => {
|
1135
|
-
let array = downcast_array::<BinaryArray>(column.array);
|
1136
|
-
Ok(ParquetValueVec(
|
1137
|
-
array
|
1138
|
-
.iter()
|
1139
|
-
.map(|opt_x| match opt_x {
|
1140
|
-
Some(x) => ParquetValue::Bytes(x.to_vec()),
|
1141
|
-
None => ParquetValue::Null,
|
1142
|
-
})
|
1143
|
-
.collect(),
|
1144
|
-
))
|
1145
|
-
}
|
1146
|
-
DataType::List(_field) => {
|
1147
|
-
let list_array = downcast_array::<ListArray>(column.array);
|
1148
|
-
let sub_list = list_array
|
1149
|
-
.iter()
|
1150
|
-
.map(|x| match x {
|
1151
|
-
Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
|
1152
|
-
array: &*values,
|
1153
|
-
strict: column.strict,
|
1154
|
-
}) {
|
1155
|
-
Ok(vec) => Ok(ParquetValue::List(vec.into_inner())),
|
1156
|
-
Err(e) => Err(MagnusError::new(
|
1157
|
-
magnus::exception::type_error(),
|
1158
|
-
format!("Error converting list array to ParquetValueVec: {}", e),
|
1159
|
-
))?,
|
1160
|
-
},
|
1161
|
-
None => Ok(ParquetValue::Null),
|
1162
|
-
})
|
1163
|
-
.collect::<Result<Vec<ParquetValue>, Self::Error>>()?;
|
1164
|
-
Ok(ParquetValueVec(sub_list))
|
1165
|
-
}
|
1166
|
-
DataType::Struct(_) => {
|
1167
|
-
let struct_array = downcast_array::<StructArray>(column.array);
|
1168
|
-
let mut values = Vec::with_capacity(struct_array.len());
|
1169
|
-
for i in 0..struct_array.len() {
|
1170
|
-
if struct_array.is_null(i) {
|
1171
|
-
values.push(ParquetValue::Null);
|
1172
|
-
continue;
|
1173
|
-
}
|
1174
|
-
|
1175
|
-
let mut map = std::collections::HashMap::new();
|
1176
|
-
for (field_idx, field) in struct_array.fields().iter().enumerate() {
|
1177
|
-
let c = struct_array.column(field_idx);
|
1178
|
-
let field_values = match ParquetValueVec::try_from(ArrayWrapper {
|
1179
|
-
array: &*c.slice(i, 1),
|
1180
|
-
strict: column.strict,
|
1181
|
-
}) {
|
1182
|
-
Ok(vec) => vec.into_inner(),
|
1183
|
-
Err(e) => {
|
1184
|
-
return Err(MagnusError::new(
|
1185
|
-
magnus::exception::type_error(),
|
1186
|
-
format!(
|
1187
|
-
"Error converting struct field to ParquetValueVec: {}",
|
1188
|
-
e
|
1189
|
-
),
|
1190
|
-
))?;
|
1191
|
-
}
|
1192
|
-
};
|
1193
|
-
map.insert(
|
1194
|
-
ParquetValue::String(field.name().to_string()),
|
1195
|
-
field_values.into_iter().next().ok_or_else(|| {
|
1196
|
-
MagnusError::new(
|
1197
|
-
magnus::exception::type_error(),
|
1198
|
-
"Expected a single value for struct field".to_string(),
|
1199
|
-
)
|
1200
|
-
})?,
|
1201
|
-
);
|
1202
|
-
}
|
1203
|
-
values.push(ParquetValue::Map(map));
|
1204
|
-
}
|
1205
|
-
Ok(ParquetValueVec(values))
|
1206
|
-
}
|
1207
|
-
DataType::Map(_field, _keys_sorted) => {
|
1208
|
-
let map_array = downcast_array::<MapArray>(column.array);
|
1209
|
-
|
1210
|
-
let mut result = Vec::with_capacity(map_array.len());
|
1211
|
-
|
1212
|
-
let offsets = map_array.offsets();
|
1213
|
-
let struct_array = map_array.entries();
|
1214
|
-
|
1215
|
-
for i in 0..map_array.len() {
|
1216
|
-
if map_array.is_null(i) {
|
1217
|
-
result.push(ParquetValue::Null);
|
1218
|
-
continue;
|
1219
|
-
}
|
1220
|
-
|
1221
|
-
let start = offsets[i] as usize;
|
1222
|
-
let end = offsets[i + 1] as usize;
|
1223
|
-
|
1224
|
-
let mut map_data =
|
1225
|
-
HashMap::with_capacity_and_hasher(end - start, Default::default());
|
1226
|
-
|
1227
|
-
// In Arrow's MapArray, the entries are a struct with fields named "keys" and "values"
|
1228
|
-
// Get the columns directly by index since we know the structure
|
1229
|
-
let key_array = struct_array.column(0); // First field is always keys
|
1230
|
-
let val_array = struct_array.column(1); // Second field is always values
|
1231
|
-
|
1232
|
-
for entry_index in start..end {
|
1233
|
-
let key_value = if key_array.is_null(entry_index) {
|
1234
|
-
ParquetValue::Null
|
1235
|
-
} else {
|
1236
|
-
let subarray = key_array.slice(entry_index, 1);
|
1237
|
-
let subwrapper = ArrayWrapper {
|
1238
|
-
array: &*subarray,
|
1239
|
-
strict: column.strict,
|
1240
|
-
};
|
1241
|
-
let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
|
1242
|
-
converted.pop().unwrap_or(ParquetValue::Null)
|
1243
|
-
};
|
1244
|
-
|
1245
|
-
let val_value = if val_array.is_null(entry_index) {
|
1246
|
-
ParquetValue::Null
|
1247
|
-
} else {
|
1248
|
-
let subarray = val_array.slice(entry_index, 1);
|
1249
|
-
let subwrapper = ArrayWrapper {
|
1250
|
-
array: &*subarray,
|
1251
|
-
strict: column.strict,
|
1252
|
-
};
|
1253
|
-
let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
|
1254
|
-
converted.pop().unwrap_or(ParquetValue::Null)
|
1255
|
-
};
|
1256
|
-
|
1257
|
-
map_data.insert(key_value, val_value);
|
1258
|
-
}
|
1259
|
-
|
1260
|
-
result.push(ParquetValue::Map(map_data));
|
1261
|
-
}
|
1262
|
-
|
1263
|
-
Ok(ParquetValueVec(result))
|
1264
|
-
}
|
1265
|
-
DataType::Null => {
|
1266
|
-
let x = downcast_array::<NullArray>(column.array);
|
1267
|
-
Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
|
1268
|
-
}
|
1269
|
-
_ => Err(MagnusError::new(
|
1270
|
-
magnus::exception::type_error(),
|
1271
|
-
format!("Unsupported data type: {:?}", column.array.data_type()),
|
1272
|
-
))?,
|
1273
|
-
}
|
1274
|
-
}
|
1275
|
-
}
|