parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,603 +0,0 @@
|
|
1
|
-
use std::sync::OnceLock;
|
2
|
-
|
3
|
-
use itertools::Itertools;
|
4
|
-
use parquet::{
|
5
|
-
basic::{ConvertedType, LogicalType},
|
6
|
-
data_type::AsBytes,
|
7
|
-
};
|
8
|
-
|
9
|
-
use super::*;
|
10
|
-
|
11
|
-
/// Process-wide, initialise-once cell recording the outcome of requiring Ruby's
/// `bigdecimal` library. It is filled by the decimal conversion path with the
/// value returned by `require("bigdecimal")`, or `false` if that call failed.
pub static LOADED_BIGDECIMAL: OnceLock<bool> = OnceLock::new();
|
12
|
-
|
13
|
-
/// Formats an unscaled decimal `value` in scientific notation (`<value>e<exp>`)
/// suitable for passing to Ruby's `BigDecimal()`.
///
/// * `value` - the unscaled integer (anything that implements `Display`).
/// * `scale` - the decimal scale. A non-negative scale moves the decimal point
///   left (`123`, scale `2` -> `"123e-2"`); a negative scale moves it right
///   (`123`, scale `-2` -> `"123e2"`).
///
/// A scale of `0` yields an explicit `e-0` exponent.
pub fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -> String {
    // `unsigned_abs` avoids the overflow that `-scale` hits for `i8::MIN`
    // (a panic in debug builds, a silently wrong exponent in release builds).
    let exponent_sign = if scale >= 0 { "-" } else { "" };
    format!("{}e{}{}", value, exponent_sign, scale.unsigned_abs())
}
|
24
|
-
|
25
|
-
/// Format i256 decimal value with appropriate scale for BigDecimal conversion
|
26
|
-
/// Uses bytes conversion to preserve full precision
|
27
|
-
pub fn format_i256_decimal_with_scale(
|
28
|
-
value: arrow_buffer::i256,
|
29
|
-
scale: i8,
|
30
|
-
) -> Result<String, ParquetGemError> {
|
31
|
-
// Convert i256 to big-endian bytes
|
32
|
-
let bytes = value.to_be_bytes();
|
33
|
-
|
34
|
-
// Use the existing bytes_to_decimal function which handles full precision
|
35
|
-
bytes_to_decimal(&bytes, scale as i32)
|
36
|
-
}
|
37
|
-
|
38
|
-
/// Formats an unscaled decimal `value` in scientific notation (`<value>e<exp>`)
/// suitable for passing to Ruby's `BigDecimal()`.
///
/// * `value` - the unscaled integer, or its decimal digit string.
/// * `scale` - the decimal scale. A non-negative scale moves the decimal point
///   left (`123`, scale `2` -> `"123e-2"`); a negative scale moves it right
///   (`123`, scale `-2` -> `"123e2"`).
///
/// A scale of `0` yields an explicit `e-0` exponent.
pub fn format_decimal_with_i32_scale<T: std::fmt::Display>(value: T, scale: i32) -> String {
    // `unsigned_abs` avoids the overflow that `-scale` hits for `i32::MIN`
    // (a panic in debug builds, a silently wrong exponent in release builds).
    let exponent_sign = if scale >= 0 { "-" } else { "" };
    format!("{}e{}{}", value, exponent_sign, scale.unsigned_abs())
}
|
49
|
-
|
50
|
-
/// Convert arbitrary-length big-endian byte array to decimal string
|
51
|
-
/// Supports byte arrays from 1 to 32 bytes in length
|
52
|
-
fn bytes_to_decimal(bytes: &[u8], scale: i32) -> Result<String, ParquetGemError> {
|
53
|
-
match bytes.len() {
|
54
|
-
0 => Err(ParquetGemError::InvalidDecimal(
|
55
|
-
"Empty byte array for decimal".to_string(),
|
56
|
-
)),
|
57
|
-
1 => {
|
58
|
-
// For 1 byte, use i8
|
59
|
-
let value = bytes[0] as i8;
|
60
|
-
Ok(format_decimal_with_i32_scale(value, scale))
|
61
|
-
}
|
62
|
-
2 => {
|
63
|
-
// For 2 bytes, use i16
|
64
|
-
let mut value: i16 = 0;
|
65
|
-
let is_negative = bytes[0] & 0x80 != 0;
|
66
|
-
|
67
|
-
for &byte in bytes {
|
68
|
-
value = (value << 8) | (byte as i16);
|
69
|
-
}
|
70
|
-
|
71
|
-
// Sign extend if negative
|
72
|
-
if is_negative {
|
73
|
-
let shift = 16 - (bytes.len() * 8);
|
74
|
-
value = (value << shift) >> shift;
|
75
|
-
}
|
76
|
-
|
77
|
-
Ok(format_decimal_with_i32_scale(value, scale))
|
78
|
-
}
|
79
|
-
3..=4 => {
|
80
|
-
// For 3-4 bytes, use i32
|
81
|
-
let mut value: i32 = 0;
|
82
|
-
let is_negative = bytes[0] & 0x80 != 0;
|
83
|
-
|
84
|
-
for &byte in bytes {
|
85
|
-
value = (value << 8) | (byte as i32);
|
86
|
-
}
|
87
|
-
|
88
|
-
// Sign extend if negative
|
89
|
-
if is_negative {
|
90
|
-
let shift = 32 - (bytes.len() * 8);
|
91
|
-
value = (value << shift) >> shift;
|
92
|
-
}
|
93
|
-
|
94
|
-
Ok(format_decimal_with_i32_scale(value, scale))
|
95
|
-
}
|
96
|
-
5..=8 => {
|
97
|
-
// For 5-8 bytes, use i64
|
98
|
-
let mut value: i64 = 0;
|
99
|
-
let is_negative = bytes[0] & 0x80 != 0;
|
100
|
-
|
101
|
-
for &byte in bytes {
|
102
|
-
value = (value << 8) | (byte as i64);
|
103
|
-
}
|
104
|
-
|
105
|
-
// Sign extend if negative
|
106
|
-
if is_negative {
|
107
|
-
let shift = 64 - (bytes.len() * 8);
|
108
|
-
value = (value << shift) >> shift;
|
109
|
-
}
|
110
|
-
|
111
|
-
Ok(format_decimal_with_i32_scale(value, scale))
|
112
|
-
}
|
113
|
-
9..=16 => {
|
114
|
-
// For 9-16 bytes, use i128
|
115
|
-
let mut value: i128 = 0;
|
116
|
-
let is_negative = bytes[0] & 0x80 != 0;
|
117
|
-
|
118
|
-
for &byte in bytes {
|
119
|
-
value = (value << 8) | (byte as i128);
|
120
|
-
}
|
121
|
-
|
122
|
-
// Sign extend if negative
|
123
|
-
if is_negative {
|
124
|
-
let shift = 128 - (bytes.len() * 8);
|
125
|
-
value = (value << shift) >> shift;
|
126
|
-
}
|
127
|
-
|
128
|
-
Ok(format_decimal_with_i32_scale(value, scale))
|
129
|
-
}
|
130
|
-
17..=32 => {
|
131
|
-
// For 17-32 bytes, we need arbitrary precision handling
|
132
|
-
// Check if the number is negative (MSB of first byte)
|
133
|
-
let is_negative = bytes[0] & 0x80 != 0;
|
134
|
-
|
135
|
-
if is_negative {
|
136
|
-
// For negative numbers, we need to compute two's complement
|
137
|
-
// First, invert all bits
|
138
|
-
let mut inverted = Vec::with_capacity(bytes.len());
|
139
|
-
for &byte in bytes {
|
140
|
-
inverted.push(!byte);
|
141
|
-
}
|
142
|
-
|
143
|
-
// Then add 1
|
144
|
-
let mut carry = 1u8;
|
145
|
-
for i in (0..inverted.len()).rev() {
|
146
|
-
let (sum, new_carry) = inverted[i].overflowing_add(carry);
|
147
|
-
inverted[i] = sum;
|
148
|
-
carry = if new_carry { 1 } else { 0 };
|
149
|
-
}
|
150
|
-
|
151
|
-
// Convert to decimal string
|
152
|
-
let mut result = String::new();
|
153
|
-
let mut remainder = inverted;
|
154
|
-
|
155
|
-
// Repeatedly divide by 10 to get decimal digits
|
156
|
-
while !remainder.iter().all(|&b| b == 0) {
|
157
|
-
let mut carry = 0u16;
|
158
|
-
for i in 0..remainder.len() {
|
159
|
-
let temp = (carry << 8) | (remainder[i] as u16);
|
160
|
-
remainder[i] = (temp / 10) as u8;
|
161
|
-
carry = temp % 10;
|
162
|
-
}
|
163
|
-
result.push_str(&carry.to_string());
|
164
|
-
}
|
165
|
-
|
166
|
-
// The digits are in reverse order
|
167
|
-
if result.is_empty() {
|
168
|
-
result = "0".to_string();
|
169
|
-
} else {
|
170
|
-
result = result.chars().rev().collect();
|
171
|
-
}
|
172
|
-
|
173
|
-
// Add negative sign and format with scale
|
174
|
-
Ok(format_decimal_with_i32_scale(format!("-{}", result), scale))
|
175
|
-
} else {
|
176
|
-
// For positive numbers, direct conversion
|
177
|
-
let mut result = String::new();
|
178
|
-
let mut remainder = bytes.to_vec();
|
179
|
-
|
180
|
-
// Repeatedly divide by 10 to get decimal digits
|
181
|
-
while !remainder.iter().all(|&b| b == 0) {
|
182
|
-
let mut carry = 0u16;
|
183
|
-
for i in 0..remainder.len() {
|
184
|
-
let temp = (carry << 8) | (remainder[i] as u16);
|
185
|
-
remainder[i] = (temp / 10) as u8;
|
186
|
-
carry = temp % 10;
|
187
|
-
}
|
188
|
-
result.push_str(&carry.to_string());
|
189
|
-
}
|
190
|
-
|
191
|
-
// The digits are in reverse order
|
192
|
-
if result.is_empty() {
|
193
|
-
result = "0".to_string();
|
194
|
-
} else {
|
195
|
-
result = result.chars().rev().collect();
|
196
|
-
}
|
197
|
-
|
198
|
-
Ok(format_decimal_with_i32_scale(result, scale))
|
199
|
-
}
|
200
|
-
}
|
201
|
-
_ => Err(ParquetGemError::InvalidDecimal(format!(
|
202
|
-
"Unsupported decimal byte array size: {} (maximum 32 bytes)",
|
203
|
-
bytes.len()
|
204
|
-
))),
|
205
|
-
}
|
206
|
-
}
|
207
|
-
|
208
|
-
#[derive(Debug)]
|
209
|
-
pub enum RowRecord<S: BuildHasher + Default> {
|
210
|
-
Vec(Vec<ParquetField>),
|
211
|
-
Map(HashMap<StringCacheKey, ParquetField, S>),
|
212
|
-
}
|
213
|
-
|
214
|
-
#[derive(Debug)]
|
215
|
-
pub enum ColumnRecord<S: BuildHasher + Default> {
|
216
|
-
Vec(Vec<Vec<ParquetValue>>),
|
217
|
-
Map(HashMap<StringCacheKey, Vec<ParquetValue>, S>),
|
218
|
-
}
|
219
|
-
|
220
|
-
/// A single Parquet record field together with the type annotations and
/// conversion options needed to turn it into a Ruby value.
#[derive(Debug)]
pub struct ParquetField {
    /// The decoded Parquet value.
    pub field: Field,
    /// Legacy converted-type annotation; carried along but not read by the
    /// conversion code in this file (hence the `dead_code` allowance).
    #[allow(dead_code)]
    pub converted_type: ConvertedType,
    /// Logical-type annotation; the conversion checks it to decide whether a
    /// byte array should be rendered as a UUID string.
    pub logical_type: Option<LogicalType>,
    /// When `true`, string fields must be valid UTF-8 or conversion fails;
    /// when `false`, invalid sequences are replaced lossily.
    pub strict: bool,
}
|
228
|
-
|
229
|
-
impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
|
230
|
-
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
231
|
-
match self {
|
232
|
-
RowRecord::Vec(vec) => {
|
233
|
-
let ary = handle.ary_new_capa(vec.len());
|
234
|
-
vec.into_iter().try_for_each(|v| {
|
235
|
-
ary.push(v.try_into_value_with(handle)?)?;
|
236
|
-
Ok::<_, ParquetGemError>(())
|
237
|
-
})?;
|
238
|
-
Ok(handle.into_value(ary))
|
239
|
-
}
|
240
|
-
RowRecord::Map(map) => {
|
241
|
-
#[cfg(ruby_lt_3_2)]
|
242
|
-
let hash = handle.hash_new_capa(map.len());
|
243
|
-
|
244
|
-
#[cfg(not(ruby_lt_3_2))]
|
245
|
-
let hash = handle.hash_new();
|
246
|
-
|
247
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
248
|
-
let mut i = 0;
|
249
|
-
|
250
|
-
for chunk in &map.into_iter().chunks(64) {
|
251
|
-
// Reduced to 64 to ensure space for pairs
|
252
|
-
for (k, v) in chunk {
|
253
|
-
if i + 1 >= values.len() {
|
254
|
-
// Bulk insert current batch if array is full
|
255
|
-
hash.bulk_insert(&values[..i])?;
|
256
|
-
values[..i].fill(handle.qnil().as_value());
|
257
|
-
i = 0;
|
258
|
-
}
|
259
|
-
values[i] = handle.into_value(k);
|
260
|
-
values[i + 1] = v.try_into_value_with(handle)?;
|
261
|
-
i += 2;
|
262
|
-
}
|
263
|
-
// Insert any remaining pairs
|
264
|
-
if i > 0 {
|
265
|
-
hash.bulk_insert(&values[..i])?;
|
266
|
-
values[..i].fill(handle.qnil().as_value());
|
267
|
-
i = 0;
|
268
|
-
}
|
269
|
-
}
|
270
|
-
|
271
|
-
Ok(hash.into_value_with(handle))
|
272
|
-
}
|
273
|
-
}
|
274
|
-
}
|
275
|
-
}
|
276
|
-
|
277
|
-
impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
|
278
|
-
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
279
|
-
match self {
|
280
|
-
ColumnRecord::Vec(vec) => {
|
281
|
-
let ary = handle.ary_new_capa(vec.len());
|
282
|
-
vec.into_iter().try_for_each(|v| {
|
283
|
-
let nested_ary = handle.ary_new_capa(v.len());
|
284
|
-
v.into_iter().try_for_each(|v| {
|
285
|
-
nested_ary.push(v.try_into_value_with(handle)?)?;
|
286
|
-
Ok::<_, ParquetGemError>(())
|
287
|
-
})?;
|
288
|
-
ary.push(nested_ary.into_value_with(handle))?;
|
289
|
-
Ok::<_, ParquetGemError>(())
|
290
|
-
})?;
|
291
|
-
Ok(ary.into_value_with(handle))
|
292
|
-
}
|
293
|
-
ColumnRecord::Map(map) => {
|
294
|
-
#[cfg(ruby_lt_3_2)]
|
295
|
-
let hash = handle.hash_new_capa(map.len());
|
296
|
-
|
297
|
-
#[cfg(not(ruby_lt_3_2))]
|
298
|
-
let hash = handle.hash_new();
|
299
|
-
|
300
|
-
let mut values: [Value; 128] = [handle.qnil().as_value(); 128];
|
301
|
-
let mut i = 0;
|
302
|
-
|
303
|
-
for chunk in &map.into_iter().chunks(64) {
|
304
|
-
// Reduced to 64 to ensure space for pairs
|
305
|
-
for (k, v) in chunk {
|
306
|
-
if i + 1 >= values.len() {
|
307
|
-
// Bulk insert current batch if array is full
|
308
|
-
hash.bulk_insert(&values[..i])?;
|
309
|
-
values[..i].fill(handle.qnil().as_value());
|
310
|
-
i = 0;
|
311
|
-
}
|
312
|
-
values[i] = handle.into_value(k);
|
313
|
-
let ary = handle.ary_new_capa(v.len());
|
314
|
-
v.into_iter().try_for_each(|v| {
|
315
|
-
ary.push(v.try_into_value_with(handle)?)?;
|
316
|
-
Ok::<_, ParquetGemError>(())
|
317
|
-
})?;
|
318
|
-
values[i + 1] = handle.into_value(ary);
|
319
|
-
i += 2;
|
320
|
-
}
|
321
|
-
// Insert any remaining pairs
|
322
|
-
if i > 0 {
|
323
|
-
hash.bulk_insert(&values[..i])?;
|
324
|
-
values[..i].fill(handle.qnil().as_value());
|
325
|
-
i = 0;
|
326
|
-
}
|
327
|
-
}
|
328
|
-
|
329
|
-
Ok(hash.into_value_with(handle))
|
330
|
-
}
|
331
|
-
}
|
332
|
-
}
|
333
|
-
}
|
334
|
-
|
335
|
-
/// Fallible conversion of a Rust-side Parquet value into a Ruby `Value`.
pub trait TryIntoValue {
    /// Consumes `self` and builds the corresponding Ruby object using the
    /// given Ruby `handle`, or returns a `ParquetGemError` if the value cannot
    /// be converted.
    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError>;
}
|
338
|
-
|
339
|
-
impl TryIntoValue for ParquetField {
|
340
|
-
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
341
|
-
match self.field {
|
342
|
-
Field::Null => Ok(handle.qnil().as_value()),
|
343
|
-
Field::Bool(b) => Ok(b.into_value_with(handle)),
|
344
|
-
Field::Short(s) => Ok(s.into_value_with(handle)),
|
345
|
-
Field::Int(i) => Ok(i.into_value_with(handle)),
|
346
|
-
Field::Long(l) => Ok(l.into_value_with(handle)),
|
347
|
-
Field::UByte(ub) => Ok(ub.into_value_with(handle)),
|
348
|
-
Field::UShort(us) => Ok(us.into_value_with(handle)),
|
349
|
-
Field::UInt(ui) => Ok(ui.into_value_with(handle)),
|
350
|
-
Field::ULong(ul) => Ok(ul.into_value_with(handle)),
|
351
|
-
Field::Float16(f) => Ok(f32::from(f).into_value_with(handle)),
|
352
|
-
Field::Float(f) => Ok(f.into_value_with(handle)),
|
353
|
-
Field::Double(d) => Ok(d.into_value_with(handle)),
|
354
|
-
Field::Str(s) => {
|
355
|
-
if self.strict {
|
356
|
-
Ok(simdutf8::basic::from_utf8(s.as_bytes())
|
357
|
-
.map_err(ParquetGemError::Utf8Error)
|
358
|
-
.map(|s| s.into_value_with(handle))?)
|
359
|
-
} else {
|
360
|
-
let s = String::from_utf8_lossy(s.as_bytes());
|
361
|
-
Ok(s.into_value_with(handle))
|
362
|
-
}
|
363
|
-
}
|
364
|
-
Field::Byte(b) => Ok(b.into_value_with(handle)),
|
365
|
-
Field::Bytes(b) => {
|
366
|
-
if matches!(self.logical_type, Some(parquet::basic::LogicalType::Uuid)) {
|
367
|
-
let bytes = b.as_bytes();
|
368
|
-
let uuid = uuid::Uuid::from_slice(bytes)?;
|
369
|
-
Ok(uuid.to_string().into_value_with(handle))
|
370
|
-
} else {
|
371
|
-
Ok(handle.str_from_slice(b.data()).as_value())
|
372
|
-
}
|
373
|
-
}
|
374
|
-
Field::Date(d) => {
|
375
|
-
let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
|
376
|
-
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
377
|
-
Ok(formatted.into_value_with(handle))
|
378
|
-
}
|
379
|
-
Field::TimeMillis(ts) => {
|
380
|
-
let ts = jiff::Timestamp::from_millisecond(ts as i64)?;
|
381
|
-
let time_class = handle.class_time();
|
382
|
-
Ok(time_class
|
383
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))?
|
384
|
-
.into_value_with(handle))
|
385
|
-
}
|
386
|
-
Field::TimestampMillis(ts) => {
|
387
|
-
let ts = jiff::Timestamp::from_millisecond(ts)?;
|
388
|
-
let time_class = handle.class_time();
|
389
|
-
Ok(time_class
|
390
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))?
|
391
|
-
.into_value_with(handle))
|
392
|
-
}
|
393
|
-
Field::TimestampMicros(ts) | Field::TimeMicros(ts) => {
|
394
|
-
let ts = jiff::Timestamp::from_microsecond(ts)?;
|
395
|
-
let time_class = handle.class_time();
|
396
|
-
Ok(time_class
|
397
|
-
.funcall::<_, _, Value>("parse", (ts.to_string(),))?
|
398
|
-
.into_value_with(handle))
|
399
|
-
}
|
400
|
-
Field::ListInternal(list) => {
|
401
|
-
let elements = list.elements();
|
402
|
-
let ary = handle.ary_new_capa(elements.len());
|
403
|
-
elements.iter().try_for_each(|e| {
|
404
|
-
ary.push(
|
405
|
-
ParquetField {
|
406
|
-
field: e.clone(),
|
407
|
-
logical_type: e.to_logical_type(),
|
408
|
-
converted_type: e.to_converted_type(),
|
409
|
-
strict: self.strict,
|
410
|
-
}
|
411
|
-
.try_into_value_with(handle)?,
|
412
|
-
)?;
|
413
|
-
Ok::<_, ParquetGemError>(())
|
414
|
-
})?;
|
415
|
-
Ok(ary.into_value_with(handle))
|
416
|
-
}
|
417
|
-
Field::MapInternal(map) => {
|
418
|
-
#[cfg(ruby_lt_3_2)]
|
419
|
-
let hash = handle.hash_new_capa(map.len());
|
420
|
-
|
421
|
-
#[cfg(not(ruby_lt_3_2))]
|
422
|
-
let hash = handle.hash_new();
|
423
|
-
|
424
|
-
map.entries().iter().try_for_each(|(k, v)| {
|
425
|
-
hash.aset(
|
426
|
-
ParquetField {
|
427
|
-
field: k.clone(),
|
428
|
-
converted_type: k.to_converted_type(),
|
429
|
-
logical_type: k.to_logical_type(),
|
430
|
-
strict: self.strict,
|
431
|
-
}
|
432
|
-
.try_into_value_with(handle)?,
|
433
|
-
ParquetField {
|
434
|
-
field: v.clone(),
|
435
|
-
converted_type: v.to_converted_type(),
|
436
|
-
logical_type: v.to_logical_type(),
|
437
|
-
strict: self.strict,
|
438
|
-
}
|
439
|
-
.try_into_value_with(handle)?,
|
440
|
-
)?;
|
441
|
-
Ok::<_, ParquetGemError>(())
|
442
|
-
})?;
|
443
|
-
Ok(hash.into_value_with(handle))
|
444
|
-
}
|
445
|
-
Field::Decimal(d) => {
|
446
|
-
let value = match d {
|
447
|
-
Decimal::Int32 { value, scale, .. } => {
|
448
|
-
let unscaled = i32::from_be_bytes(value);
|
449
|
-
format_decimal_with_i32_scale(unscaled, scale)
|
450
|
-
}
|
451
|
-
Decimal::Int64 { value, scale, .. } => {
|
452
|
-
let unscaled = i64::from_be_bytes(value);
|
453
|
-
format_decimal_with_i32_scale(unscaled, scale)
|
454
|
-
}
|
455
|
-
Decimal::Bytes { value, scale, .. } => {
|
456
|
-
bytes_to_decimal(value.as_bytes(), scale)?
|
457
|
-
}
|
458
|
-
};
|
459
|
-
|
460
|
-
// Load the bigdecimal gem if it's not already loaded
|
461
|
-
LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
|
462
|
-
|
463
|
-
let kernel = handle.module_kernel();
|
464
|
-
Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
|
465
|
-
}
|
466
|
-
Field::Group(row) => {
|
467
|
-
let hash = handle.hash_new();
|
468
|
-
row.get_column_iter().try_for_each(|(k, v)| {
|
469
|
-
hash.aset(
|
470
|
-
k.clone().into_value_with(handle),
|
471
|
-
ParquetField {
|
472
|
-
field: v.clone(),
|
473
|
-
converted_type: v.to_converted_type(),
|
474
|
-
logical_type: v.to_logical_type(),
|
475
|
-
strict: self.strict,
|
476
|
-
}
|
477
|
-
.try_into_value_with(handle)?,
|
478
|
-
)?;
|
479
|
-
Ok::<_, ParquetGemError>(())
|
480
|
-
})?;
|
481
|
-
Ok(hash.into_value_with(handle))
|
482
|
-
}
|
483
|
-
}
|
484
|
-
}
|
485
|
-
}
|
486
|
-
|
487
|
-
/// Derives Parquet type annotations from a value when no schema information is
/// at hand (used for values nested inside lists, maps and groups).
trait ToTypeInfo {
    /// Returns the legacy converted-type annotation matching this value.
    fn to_converted_type(&self) -> ConvertedType;
    /// Returns the logical-type annotation matching this value, if any.
    fn to_logical_type(&self) -> Option<LogicalType>;
}
|
491
|
-
|
492
|
-
impl ToTypeInfo for &parquet::record::Field {
|
493
|
-
fn to_converted_type(&self) -> ConvertedType {
|
494
|
-
match self {
|
495
|
-
Field::Null => ConvertedType::NONE,
|
496
|
-
Field::Bool(_) => ConvertedType::INT_8,
|
497
|
-
Field::Byte(_) => ConvertedType::INT_8,
|
498
|
-
Field::Short(_) => ConvertedType::INT_16,
|
499
|
-
Field::Int(_) => ConvertedType::INT_32,
|
500
|
-
Field::Long(_) => ConvertedType::INT_64,
|
501
|
-
Field::UByte(_) => ConvertedType::UINT_8,
|
502
|
-
Field::UShort(_) => ConvertedType::UINT_16,
|
503
|
-
Field::UInt(_) => ConvertedType::UINT_32,
|
504
|
-
Field::ULong(_) => ConvertedType::UINT_64,
|
505
|
-
Field::Float16(_) => ConvertedType::NONE,
|
506
|
-
Field::Float(_) => ConvertedType::NONE,
|
507
|
-
Field::Double(_) => ConvertedType::NONE,
|
508
|
-
Field::Decimal(_) => ConvertedType::DECIMAL,
|
509
|
-
Field::Str(_) => ConvertedType::UTF8,
|
510
|
-
Field::Bytes(_) => ConvertedType::LIST,
|
511
|
-
Field::Date(_) => ConvertedType::DATE,
|
512
|
-
Field::TimeMillis(_) => ConvertedType::TIME_MILLIS,
|
513
|
-
Field::TimeMicros(_) => ConvertedType::TIMESTAMP_MICROS,
|
514
|
-
Field::TimestampMillis(_) => ConvertedType::TIMESTAMP_MILLIS,
|
515
|
-
Field::TimestampMicros(_) => ConvertedType::TIMESTAMP_MICROS,
|
516
|
-
Field::Group(_) => ConvertedType::NONE,
|
517
|
-
Field::ListInternal(_) => ConvertedType::LIST,
|
518
|
-
Field::MapInternal(_) => ConvertedType::MAP,
|
519
|
-
}
|
520
|
-
}
|
521
|
-
fn to_logical_type(&self) -> Option<LogicalType> {
|
522
|
-
Some(match self {
|
523
|
-
Field::Null => LogicalType::Unknown,
|
524
|
-
Field::Bool(_) => LogicalType::Integer {
|
525
|
-
bit_width: 1,
|
526
|
-
is_signed: false,
|
527
|
-
},
|
528
|
-
Field::Byte(_) => LogicalType::Integer {
|
529
|
-
bit_width: 8,
|
530
|
-
is_signed: false,
|
531
|
-
},
|
532
|
-
Field::Short(_) => LogicalType::Integer {
|
533
|
-
bit_width: 16,
|
534
|
-
is_signed: true,
|
535
|
-
},
|
536
|
-
Field::Int(_) => LogicalType::Integer {
|
537
|
-
bit_width: 32,
|
538
|
-
is_signed: true,
|
539
|
-
},
|
540
|
-
Field::Long(_) => LogicalType::Integer {
|
541
|
-
bit_width: 64,
|
542
|
-
is_signed: true,
|
543
|
-
},
|
544
|
-
Field::UByte(_) => LogicalType::Integer {
|
545
|
-
bit_width: 8,
|
546
|
-
is_signed: false,
|
547
|
-
},
|
548
|
-
Field::UShort(_) => LogicalType::Integer {
|
549
|
-
bit_width: 16,
|
550
|
-
is_signed: false,
|
551
|
-
},
|
552
|
-
Field::UInt(_) => LogicalType::Integer {
|
553
|
-
bit_width: 32,
|
554
|
-
is_signed: false,
|
555
|
-
},
|
556
|
-
Field::ULong(_) => LogicalType::Integer {
|
557
|
-
bit_width: 64,
|
558
|
-
is_signed: false,
|
559
|
-
},
|
560
|
-
Field::Float16(_) => LogicalType::Float16,
|
561
|
-
Field::Float(_) => LogicalType::Decimal {
|
562
|
-
scale: 7,
|
563
|
-
precision: 7,
|
564
|
-
},
|
565
|
-
Field::Double(_) => LogicalType::Decimal {
|
566
|
-
scale: 15,
|
567
|
-
precision: 15,
|
568
|
-
},
|
569
|
-
Field::Decimal(decimal) => LogicalType::Decimal {
|
570
|
-
scale: decimal.scale(),
|
571
|
-
precision: decimal.precision(),
|
572
|
-
},
|
573
|
-
Field::Str(_) => LogicalType::String,
|
574
|
-
Field::Bytes(b) => {
|
575
|
-
if b.data().len() == 16 && uuid::Uuid::from_slice(b.as_bytes()).is_ok() {
|
576
|
-
LogicalType::Uuid
|
577
|
-
} else {
|
578
|
-
LogicalType::Unknown
|
579
|
-
}
|
580
|
-
}
|
581
|
-
Field::Date(_) => LogicalType::Date,
|
582
|
-
Field::TimeMillis(_) => LogicalType::Time {
|
583
|
-
is_adjusted_to_u_t_c: true,
|
584
|
-
unit: parquet::basic::TimeUnit::MILLIS(parquet::format::MilliSeconds {}),
|
585
|
-
},
|
586
|
-
Field::TimeMicros(_) => LogicalType::Time {
|
587
|
-
is_adjusted_to_u_t_c: true,
|
588
|
-
unit: parquet::basic::TimeUnit::MICROS(parquet::format::MicroSeconds {}),
|
589
|
-
},
|
590
|
-
Field::TimestampMillis(_) => LogicalType::Timestamp {
|
591
|
-
is_adjusted_to_u_t_c: true,
|
592
|
-
unit: parquet::basic::TimeUnit::MILLIS(parquet::format::MilliSeconds {}),
|
593
|
-
},
|
594
|
-
Field::TimestampMicros(_) => LogicalType::Timestamp {
|
595
|
-
is_adjusted_to_u_t_c: true,
|
596
|
-
unit: parquet::basic::TimeUnit::MICROS(parquet::format::MicroSeconds {}),
|
597
|
-
},
|
598
|
-
Field::Group(_) => LogicalType::Unknown,
|
599
|
-
Field::ListInternal(_) => LogicalType::List,
|
600
|
-
Field::MapInternal(_) => LogicalType::Map,
|
601
|
-
})
|
602
|
-
}
|
603
|
-
}
|