parquet 0.5.9 → 0.5.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3 -0
- data/ext/parquet/Cargo.toml +2 -0
- data/ext/parquet/build.rs +1 -1
- data/ext/parquet/src/lib.rs +3 -0
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +7 -4
- data/ext/parquet/src/types/parquet_value.rs +290 -73
- data/ext/parquet/src/types/record_types.rs +92 -8
- data/ext/parquet/src/types/schema_node.rs +11 -5
- data/ext/parquet/src/types/type_conversion.rs +216 -0
- data/ext/parquet/src/types/writer_types.rs +50 -0
- data/ext/parquet/src/writer/mod.rs +3 -0
- data/ext/parquet/src/writer/write_columns.rs +3 -0
- data/ext/parquet/src/writer/write_rows.rs +1 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82528b663c4a577262db90b6d17ba473a81d0ea725ceba486b63a3619040fa73
|
4
|
+
data.tar.gz: 2e44daa9b4e36ef1503589daaa0815cbc3acee10c565d9942f6c0b6d35ced5f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 418951253384f5492385fcb30fa5b0113b85d9bc51346b6abad16105c124d8869266943c1a29bc0879cfee4270b94d32fb99004e233c6ebde4a70e1d329435af
|
7
|
+
data.tar.gz: bc0db4ebb36add314253b5b9b946cc2c84f315d51ba7fefbead6c7de3b65a3f7752fa4e4cf0be19704405b390ae0106d8383e30791e7fac4a86a75141c214de1
|
data/Cargo.lock
CHANGED
@@ -126,6 +126,7 @@ dependencies = [
|
|
126
126
|
"arrow-data",
|
127
127
|
"arrow-schema",
|
128
128
|
"flatbuffers",
|
129
|
+
"lz4_flex",
|
129
130
|
]
|
130
131
|
|
131
132
|
[[package]]
|
@@ -842,6 +843,8 @@ version = "0.1.0"
|
|
842
843
|
dependencies = [
|
843
844
|
"ahash",
|
844
845
|
"arrow-array",
|
846
|
+
"arrow-buffer",
|
847
|
+
"arrow-ipc",
|
845
848
|
"arrow-schema",
|
846
849
|
"bytes",
|
847
850
|
"either",
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -12,6 +12,8 @@ rb-sys-env = "^0.2"
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
14
|
arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
|
15
|
+
arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
|
16
|
+
arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["lz4"] }
|
15
17
|
arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
|
16
18
|
bytes = "^1.9"
|
17
19
|
either = "1.9"
|
data/ext/parquet/build.rs
CHANGED
data/ext/parquet/src/lib.rs
CHANGED
@@ -19,6 +19,9 @@ use writer::write_rows;
|
|
19
19
|
/// Initializes the Ruby extension and defines methods.
|
20
20
|
#[magnus::init]
|
21
21
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
22
|
+
// Require 'time' for Time.parse method
|
23
|
+
ruby.require("time")?;
|
24
|
+
|
22
25
|
let module = ruby.define_module("Parquet")?;
|
23
26
|
module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
|
24
27
|
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
@@ -23,10 +23,11 @@ pub use writer_types::*;
|
|
23
23
|
// Common imports used across the module
|
24
24
|
use arrow_array::cast::downcast_array;
|
25
25
|
use arrow_array::{
|
26
|
-
Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
|
27
|
-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
|
28
|
-
|
29
|
-
|
26
|
+
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
|
27
|
+
Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
|
28
|
+
ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
|
29
|
+
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
|
30
|
+
UInt32Array, UInt64Array, UInt8Array,
|
30
31
|
};
|
31
32
|
use arrow_schema::{DataType, TimeUnit};
|
32
33
|
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
|
@@ -62,6 +63,8 @@ pub enum ParquetGemError {
|
|
62
63
|
InvalidDecimal(String),
|
63
64
|
#[error("Failed to parse UUID: {0}")]
|
64
65
|
UuidError(#[from] uuid::Error),
|
66
|
+
#[error("Decimals larger than 128 bits are not supported")]
|
67
|
+
DecimalWouldBeTruncated,
|
65
68
|
}
|
66
69
|
|
67
70
|
#[derive(Debug)]
|
@@ -1,7 +1,7 @@
|
|
1
1
|
use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
|
2
2
|
|
3
|
+
use super::record_types::{format_decimal_with_i8_scale, format_i256_decimal_with_scale};
|
3
4
|
use super::*;
|
4
|
-
use super::record_types::format_decimal_with_i8_scale;
|
5
5
|
use arrow_array::MapArray;
|
6
6
|
use magnus::{RArray, RString};
|
7
7
|
|
@@ -24,6 +24,7 @@ pub enum ParquetValue {
|
|
24
24
|
Date32(i32),
|
25
25
|
Date64(i64),
|
26
26
|
Decimal128(i128, i8),
|
27
|
+
Decimal256(arrow_buffer::i256, i8),
|
27
28
|
TimestampSecond(i64, Option<Arc<str>>),
|
28
29
|
TimestampMillis(i64, Option<Arc<str>>),
|
29
30
|
TimestampMicros(i64, Option<Arc<str>>),
|
@@ -94,6 +95,15 @@ impl PartialEq for ParquetValue {
|
|
94
95
|
a_val == b_val
|
95
96
|
}
|
96
97
|
}
|
98
|
+
(ParquetValue::Decimal256(a, scale_a), ParquetValue::Decimal256(b, scale_b)) => {
|
99
|
+
if scale_a == scale_b {
|
100
|
+
// Same scale, compare directly
|
101
|
+
a == b
|
102
|
+
} else {
|
103
|
+
// TODO: Implement decimal256 comparison
|
104
|
+
todo!("decimal256 comparison");
|
105
|
+
}
|
106
|
+
}
|
97
107
|
(ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
|
98
108
|
(ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
|
99
109
|
(ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
|
@@ -130,6 +140,10 @@ impl std::hash::Hash for ParquetValue {
|
|
130
140
|
d.hash(state);
|
131
141
|
scale.hash(state);
|
132
142
|
}
|
143
|
+
ParquetValue::Decimal256(d, scale) => {
|
144
|
+
d.hash(state);
|
145
|
+
scale.hash(state);
|
146
|
+
}
|
133
147
|
ParquetValue::TimestampSecond(ts, tz) => {
|
134
148
|
ts.hash(state);
|
135
149
|
tz.hash(state);
|
@@ -185,6 +199,17 @@ impl TryIntoValue for ParquetValue {
|
|
185
199
|
let kernel = handle.module_kernel();
|
186
200
|
Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
|
187
201
|
}
|
202
|
+
ParquetValue::Decimal256(d, scale) => {
|
203
|
+
// Load the bigdecimal gem if it's not already loaded
|
204
|
+
LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
|
205
|
+
|
206
|
+
// Format with proper scaling based on the sign of scale
|
207
|
+
// Use specialized function to preserve full precision
|
208
|
+
let value = format_i256_decimal_with_scale(d, scale)?;
|
209
|
+
|
210
|
+
let kernel = handle.module_kernel();
|
211
|
+
Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
|
212
|
+
}
|
188
213
|
ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
|
189
214
|
ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
|
190
215
|
timestamp @ ParquetValue::TimestampSecond(_, _) => {
|
@@ -292,9 +317,21 @@ impl ParquetValue {
|
|
292
317
|
}
|
293
318
|
PrimitiveType::Decimal128(_precision, scale) => {
|
294
319
|
if value.is_kind_of(ruby.class_string()) {
|
295
|
-
|
320
|
+
convert_to_decimal(value, *scale)
|
296
321
|
} else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
|
297
|
-
|
322
|
+
convert_to_decimal(s.as_value(), *scale)
|
323
|
+
} else {
|
324
|
+
Err(MagnusError::new(
|
325
|
+
magnus::exception::type_error(),
|
326
|
+
"Expected a string for a decimal type",
|
327
|
+
))
|
328
|
+
}
|
329
|
+
}
|
330
|
+
PrimitiveType::Decimal256(_precision, scale) => {
|
331
|
+
if value.is_kind_of(ruby.class_string()) {
|
332
|
+
convert_to_decimal(value, *scale)
|
333
|
+
} else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
|
334
|
+
convert_to_decimal(s.as_value(), *scale)
|
298
335
|
} else {
|
299
336
|
Err(MagnusError::new(
|
300
337
|
magnus::exception::type_error(),
|
@@ -425,8 +462,14 @@ impl ParquetValue {
|
|
425
462
|
}
|
426
463
|
}
|
427
464
|
}
|
465
|
+
|
466
|
+
enum ParsedDecimal {
|
467
|
+
Int128(i128),
|
468
|
+
Int256(arrow_buffer::i256),
|
469
|
+
}
|
470
|
+
|
428
471
|
/// Unified helper to parse a decimal string and apply scaling
|
429
|
-
fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<
|
472
|
+
fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
|
430
473
|
let s = input_str.trim();
|
431
474
|
|
432
475
|
// 1. Handle scientific notation case (e.g., "0.12345e3")
|
@@ -445,12 +488,9 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
445
488
|
)
|
446
489
|
})?;
|
447
490
|
|
448
|
-
//
|
491
|
+
// For very large exponents, we'll need to use BigInt
|
449
492
|
if exp_val.abs() > 38 {
|
450
|
-
return
|
451
|
-
magnus::exception::range_error(),
|
452
|
-
format!("Exponent {} is out of range for decimal value '{}'. Must be between -38 and 38.", exp_val, s),
|
453
|
-
));
|
493
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
454
494
|
}
|
455
495
|
|
456
496
|
// Handle the base part which might contain a decimal point
|
@@ -460,30 +500,23 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
460
500
|
|
461
501
|
let base_scale = base.len() - decimal_pos - 1;
|
462
502
|
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
})?;
|
472
|
-
|
473
|
-
(base_val, base_scale as i32)
|
503
|
+
// Try to parse as i128 first
|
504
|
+
match base_without_point.parse::<i128>() {
|
505
|
+
Ok(v) => (v, base_scale as i32),
|
506
|
+
Err(_) => {
|
507
|
+
// Value too large for i128, use BigInt
|
508
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
509
|
+
}
|
510
|
+
}
|
474
511
|
} else {
|
475
512
|
// No decimal point in base
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
)
|
484
|
-
})?;
|
485
|
-
|
486
|
-
(base_val, 0)
|
513
|
+
match base.parse::<i128>() {
|
514
|
+
Ok(v) => (v, 0),
|
515
|
+
Err(_) => {
|
516
|
+
// Value too large for i128, use BigInt
|
517
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
518
|
+
}
|
519
|
+
}
|
487
520
|
};
|
488
521
|
|
489
522
|
// Calculate the effective scale: base_scale - exp_val
|
@@ -495,12 +528,14 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
495
528
|
// Need to multiply to increase scale
|
496
529
|
let scale_diff = (input_scale as i32 - effective_scale) as u32;
|
497
530
|
if scale_diff > 38 {
|
498
|
-
return
|
499
|
-
|
500
|
-
|
501
|
-
|
531
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
532
|
+
}
|
533
|
+
|
534
|
+
// Check for overflow
|
535
|
+
match base_val.checked_mul(10_i128.pow(scale_diff)) {
|
536
|
+
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
537
|
+
None => parse_large_decimal_with_bigint(s, input_scale),
|
502
538
|
}
|
503
|
-
Ok(base_val * 10_i128.pow(scale_diff))
|
504
539
|
}
|
505
540
|
std::cmp::Ordering::Greater => {
|
506
541
|
// Need to divide to decrease scale
|
@@ -511,9 +546,9 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
511
546
|
format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
|
512
547
|
));
|
513
548
|
}
|
514
|
-
Ok(base_val / 10_i128.pow(scale_diff))
|
549
|
+
Ok(ParsedDecimal::Int128(base_val / 10_i128.pow(scale_diff)))
|
515
550
|
}
|
516
|
-
std::cmp::Ordering::Equal => Ok(base_val),
|
551
|
+
std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(base_val)),
|
517
552
|
}
|
518
553
|
}
|
519
554
|
// 2. Handle decimal point in the string (e.g., "123.456")
|
@@ -524,16 +559,14 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
524
559
|
// Calculate the actual scale from the decimal position
|
525
560
|
let actual_scale = s.len() - decimal_pos - 1;
|
526
561
|
|
527
|
-
//
|
528
|
-
let v = s_without_point.parse::<i128>()
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
)
|
536
|
-
})?;
|
562
|
+
// Try to parse as i128 first
|
563
|
+
let v = match s_without_point.parse::<i128>() {
|
564
|
+
Ok(v) => v,
|
565
|
+
Err(_) => {
|
566
|
+
// Value too large for i128, use BigInt
|
567
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
568
|
+
}
|
569
|
+
};
|
537
570
|
|
538
571
|
// Scale the value if needed based on the difference between
|
539
572
|
// the actual scale and the requested scale
|
@@ -542,12 +575,14 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
542
575
|
// Need to multiply to increase scale
|
543
576
|
let scale_diff = (input_scale - actual_scale as i8) as u32;
|
544
577
|
if scale_diff > 38 {
|
545
|
-
return
|
546
|
-
|
547
|
-
|
548
|
-
|
578
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
579
|
+
}
|
580
|
+
|
581
|
+
// Check for overflow
|
582
|
+
match v.checked_mul(10_i128.pow(scale_diff)) {
|
583
|
+
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
584
|
+
None => parse_large_decimal_with_bigint(s, input_scale),
|
549
585
|
}
|
550
|
-
Ok(v * 10_i128.pow(scale_diff))
|
551
586
|
}
|
552
587
|
std::cmp::Ordering::Greater => {
|
553
588
|
// Need to divide to decrease scale
|
@@ -558,30 +593,25 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
558
593
|
format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
|
559
594
|
));
|
560
595
|
}
|
561
|
-
Ok(v / 10_i128.pow(scale_diff))
|
596
|
+
Ok(ParsedDecimal::Int128(v / 10_i128.pow(scale_diff)))
|
562
597
|
}
|
563
|
-
std::cmp::Ordering::Equal => Ok(v),
|
598
|
+
std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(v)),
|
564
599
|
}
|
565
600
|
}
|
566
601
|
// 3. Plain integer value (e.g., "12345")
|
567
602
|
else {
|
568
|
-
// No decimal point, parse as i128
|
569
|
-
let v = s.parse::<i128>()
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
603
|
+
// No decimal point, try to parse as i128 first
|
604
|
+
let v = match s.parse::<i128>() {
|
605
|
+
Ok(v) => v,
|
606
|
+
Err(_) => {
|
607
|
+
// Value too large for i128, use BigInt
|
608
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
609
|
+
}
|
610
|
+
};
|
575
611
|
|
576
612
|
// Apply scale - make sure it's reasonable
|
577
613
|
if input_scale > 38 {
|
578
|
-
return
|
579
|
-
magnus::exception::range_error(),
|
580
|
-
format!(
|
581
|
-
"Scale {} is too large for decimal value '{}'. Must be ≤ 38.",
|
582
|
-
input_scale, s
|
583
|
-
),
|
584
|
-
));
|
614
|
+
return parse_large_decimal_with_bigint(s, input_scale);
|
585
615
|
} else if input_scale < -38 {
|
586
616
|
return Err(MagnusError::new(
|
587
617
|
magnus::exception::range_error(),
|
@@ -594,15 +624,153 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
|
|
594
624
|
|
595
625
|
// Apply positive scale (multiply)
|
596
626
|
if input_scale >= 0 {
|
597
|
-
|
627
|
+
match v.checked_mul(10_i128.pow(input_scale as u32)) {
|
628
|
+
Some(v) => Ok(ParsedDecimal::Int128(v)),
|
629
|
+
None => parse_large_decimal_with_bigint(s, input_scale),
|
630
|
+
}
|
598
631
|
} else {
|
599
632
|
// Apply negative scale (divide)
|
600
|
-
Ok(
|
633
|
+
Ok(ParsedDecimal::Int128(
|
634
|
+
v / 10_i128.pow((-input_scale) as u32),
|
635
|
+
))
|
636
|
+
}
|
637
|
+
}
|
638
|
+
}
|
639
|
+
|
640
|
+
/// Parse large decimal values using BigInt when they would overflow i128
|
641
|
+
fn parse_large_decimal_with_bigint(s: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
|
642
|
+
use num::BigInt;
|
643
|
+
use std::str::FromStr;
|
644
|
+
|
645
|
+
// Parse the input string as a BigInt
|
646
|
+
let bigint = if let Some(e_pos) = s.to_lowercase().find('e') {
|
647
|
+
// Handle scientific notation
|
648
|
+
let base = &s[0..e_pos];
|
649
|
+
let exp = &s[e_pos + 1..];
|
650
|
+
|
651
|
+
let exp_val = exp.parse::<i32>().map_err(|e| {
|
652
|
+
MagnusError::new(
|
653
|
+
magnus::exception::type_error(),
|
654
|
+
format!("Failed to parse exponent '{}': {}", exp, e),
|
655
|
+
)
|
656
|
+
})?;
|
657
|
+
|
658
|
+
// Parse base as BigInt
|
659
|
+
let base_bigint = if let Some(decimal_pos) = base.find('.') {
|
660
|
+
let mut base_without_point = base.to_string();
|
661
|
+
base_without_point.remove(decimal_pos);
|
662
|
+
let base_scale = base.len() - decimal_pos - 1;
|
663
|
+
|
664
|
+
let bigint = BigInt::from_str(&base_without_point).map_err(|e| {
|
665
|
+
MagnusError::new(
|
666
|
+
magnus::exception::type_error(),
|
667
|
+
format!("Failed to parse decimal base '{}': {}", base, e),
|
668
|
+
)
|
669
|
+
})?;
|
670
|
+
|
671
|
+
// Adjust for the decimal point
|
672
|
+
let effective_exp = exp_val - base_scale as i32;
|
673
|
+
|
674
|
+
if effective_exp > 0 {
|
675
|
+
bigint * BigInt::from(10).pow(effective_exp as u32)
|
676
|
+
} else if effective_exp < 0 {
|
677
|
+
bigint / BigInt::from(10).pow((-effective_exp) as u32)
|
678
|
+
} else {
|
679
|
+
bigint
|
680
|
+
}
|
681
|
+
} else {
|
682
|
+
let bigint = BigInt::from_str(base).map_err(|e| {
|
683
|
+
MagnusError::new(
|
684
|
+
magnus::exception::type_error(),
|
685
|
+
format!("Failed to parse decimal base '{}': {}", base, e),
|
686
|
+
)
|
687
|
+
})?;
|
688
|
+
|
689
|
+
if exp_val > 0 {
|
690
|
+
bigint * BigInt::from(10).pow(exp_val as u32)
|
691
|
+
} else if exp_val < 0 {
|
692
|
+
bigint / BigInt::from(10).pow((-exp_val) as u32)
|
693
|
+
} else {
|
694
|
+
bigint
|
695
|
+
}
|
696
|
+
};
|
697
|
+
|
698
|
+
base_bigint
|
699
|
+
} else if let Some(decimal_pos) = s.find('.') {
|
700
|
+
// Handle decimal point
|
701
|
+
let mut s_without_point = s.to_string();
|
702
|
+
s_without_point.remove(decimal_pos);
|
703
|
+
|
704
|
+
let actual_scale = s.len() - decimal_pos - 1;
|
705
|
+
let bigint = BigInt::from_str(&s_without_point).map_err(|e| {
|
706
|
+
MagnusError::new(
|
707
|
+
magnus::exception::type_error(),
|
708
|
+
format!("Failed to parse decimal string '{}': {}", s, e),
|
709
|
+
)
|
710
|
+
})?;
|
711
|
+
|
712
|
+
// Adjust for scale difference
|
713
|
+
let scale_diff = actual_scale as i8 - input_scale;
|
714
|
+
|
715
|
+
if scale_diff > 0 {
|
716
|
+
bigint / BigInt::from(10).pow(scale_diff as u32)
|
717
|
+
} else if scale_diff < 0 {
|
718
|
+
bigint * BigInt::from(10).pow((-scale_diff) as u32)
|
719
|
+
} else {
|
720
|
+
bigint
|
721
|
+
}
|
722
|
+
} else {
|
723
|
+
// Plain integer
|
724
|
+
let bigint = BigInt::from_str(s).map_err(|e| {
|
725
|
+
MagnusError::new(
|
726
|
+
magnus::exception::type_error(),
|
727
|
+
format!("Failed to parse integer string '{}': {}", s, e),
|
728
|
+
)
|
729
|
+
})?;
|
730
|
+
|
731
|
+
if input_scale > 0 {
|
732
|
+
bigint * BigInt::from(10).pow(input_scale as u32)
|
733
|
+
} else if input_scale < 0 {
|
734
|
+
bigint / BigInt::from(10).pow((-input_scale) as u32)
|
735
|
+
} else {
|
736
|
+
bigint
|
601
737
|
}
|
738
|
+
};
|
739
|
+
|
740
|
+
// Convert BigInt to bytes and then to i256
|
741
|
+
let bytes = bigint.to_signed_bytes_le();
|
742
|
+
|
743
|
+
if bytes.len() <= 16 {
|
744
|
+
// Fits in i128
|
745
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
746
|
+
[0xff; 16]
|
747
|
+
} else {
|
748
|
+
[0; 16]
|
749
|
+
};
|
750
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
751
|
+
|
752
|
+
Ok(ParsedDecimal::Int128(i128::from_le_bytes(buf)))
|
753
|
+
} else if bytes.len() <= 32 {
|
754
|
+
// Fits in i256
|
755
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
756
|
+
[0xff; 32]
|
757
|
+
} else {
|
758
|
+
[0; 32]
|
759
|
+
};
|
760
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
761
|
+
|
762
|
+
Ok(ParsedDecimal::Int256(arrow_buffer::i256::from_le_bytes(
|
763
|
+
buf,
|
764
|
+
)))
|
765
|
+
} else {
|
766
|
+
Err(MagnusError::new(
|
767
|
+
magnus::exception::range_error(),
|
768
|
+
format!("Decimal value '{}' is too large to fit in 256 bits", s),
|
769
|
+
))
|
602
770
|
}
|
603
771
|
}
|
604
772
|
|
605
|
-
fn
|
773
|
+
fn convert_to_decimal(value: Value, scale: i8) -> Result<ParquetValue, MagnusError> {
|
606
774
|
// Get the decimal string based on the type of value
|
607
775
|
let s = if unsafe { value.classname() } == "BigDecimal" {
|
608
776
|
value
|
@@ -614,7 +782,10 @@ fn convert_to_decimal128(value: Value, scale: i8) -> Result<ParquetValue, Magnus
|
|
614
782
|
|
615
783
|
// Use our unified parser to convert the string to a decimal value with scaling
|
616
784
|
match parse_decimal_string(&s, scale) {
|
617
|
-
Ok(decimal_value) =>
|
785
|
+
Ok(decimal_value) => match decimal_value {
|
786
|
+
ParsedDecimal::Int128(v) => Ok(ParquetValue::Decimal128(v, scale)),
|
787
|
+
ParsedDecimal::Int256(v) => Ok(ParquetValue::Decimal256(v, scale)),
|
788
|
+
},
|
618
789
|
Err(e) => Err(MagnusError::new(
|
619
790
|
magnus::exception::type_error(),
|
620
791
|
format!(
|
@@ -731,6 +902,52 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
|
|
731
902
|
}
|
732
903
|
DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
|
733
904
|
DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
|
905
|
+
DataType::Decimal128(_precision, scale) => {
|
906
|
+
let array = downcast_array::<Decimal128Array>(column.array);
|
907
|
+
Ok(ParquetValueVec(if array.is_nullable() {
|
908
|
+
array
|
909
|
+
.values()
|
910
|
+
.iter()
|
911
|
+
.enumerate()
|
912
|
+
.map(|(i, x)| {
|
913
|
+
if array.is_null(i) {
|
914
|
+
ParquetValue::Null
|
915
|
+
} else {
|
916
|
+
ParquetValue::Decimal128(*x, *scale)
|
917
|
+
}
|
918
|
+
})
|
919
|
+
.collect()
|
920
|
+
} else {
|
921
|
+
array
|
922
|
+
.values()
|
923
|
+
.iter()
|
924
|
+
.map(|x| ParquetValue::Decimal128(*x, *scale))
|
925
|
+
.collect()
|
926
|
+
}))
|
927
|
+
}
|
928
|
+
DataType::Decimal256(_precision, scale) => {
|
929
|
+
let array = downcast_array::<Decimal256Array>(column.array);
|
930
|
+
Ok(ParquetValueVec(if array.is_nullable() {
|
931
|
+
array
|
932
|
+
.values()
|
933
|
+
.iter()
|
934
|
+
.enumerate()
|
935
|
+
.map(|(i, x)| {
|
936
|
+
if array.is_null(i) {
|
937
|
+
ParquetValue::Null
|
938
|
+
} else {
|
939
|
+
ParquetValue::Decimal256(*x, *scale)
|
940
|
+
}
|
941
|
+
})
|
942
|
+
.collect()
|
943
|
+
} else {
|
944
|
+
array
|
945
|
+
.values()
|
946
|
+
.iter()
|
947
|
+
.map(|x| ParquetValue::Decimal256(*x, *scale))
|
948
|
+
.collect()
|
949
|
+
}))
|
950
|
+
}
|
734
951
|
DataType::Timestamp(TimeUnit::Second, tz) => {
|
735
952
|
impl_timestamp_array_conversion!(
|
736
953
|
column.array,
|
@@ -22,6 +22,19 @@ pub fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -
|
|
22
22
|
}
|
23
23
|
}
|
24
24
|
|
25
|
+
/// Format i256 decimal value with appropriate scale for BigDecimal conversion
|
26
|
+
/// Uses bytes conversion to preserve full precision
|
27
|
+
pub fn format_i256_decimal_with_scale(
|
28
|
+
value: arrow_buffer::i256,
|
29
|
+
scale: i8,
|
30
|
+
) -> Result<String, ParquetGemError> {
|
31
|
+
// Convert i256 to big-endian bytes
|
32
|
+
let bytes = value.to_be_bytes();
|
33
|
+
|
34
|
+
// Use the existing bytes_to_decimal function which handles full precision
|
35
|
+
bytes_to_decimal(&bytes, scale as i32)
|
36
|
+
}
|
37
|
+
|
25
38
|
/// Format decimal value with appropriate scale for BigDecimal conversion
|
26
39
|
/// Handles positive and negative scales correctly for i32 scale
|
27
40
|
pub fn format_decimal_with_i32_scale<T: std::fmt::Display>(value: T, scale: i32) -> String {
|
@@ -35,7 +48,7 @@ pub fn format_decimal_with_i32_scale<T: std::fmt::Display>(value: T, scale: i32)
|
|
35
48
|
}
|
36
49
|
|
37
50
|
/// Convert arbitrary-length big-endian byte array to decimal string
|
38
|
-
/// Supports byte arrays from 1 to
|
51
|
+
/// Supports byte arrays from 1 to 32 bytes in length
|
39
52
|
fn bytes_to_decimal(bytes: &[u8], scale: i32) -> Result<String, ParquetGemError> {
|
40
53
|
match bytes.len() {
|
41
54
|
0 => Err(ParquetGemError::InvalidDecimal(
|
@@ -50,34 +63,34 @@ fn bytes_to_decimal(bytes: &[u8], scale: i32) -> Result<String, ParquetGemError>
|
|
50
63
|
// For 2 bytes, use i16
|
51
64
|
let mut value: i16 = 0;
|
52
65
|
let is_negative = bytes[0] & 0x80 != 0;
|
53
|
-
|
66
|
+
|
54
67
|
for &byte in bytes {
|
55
68
|
value = (value << 8) | (byte as i16);
|
56
69
|
}
|
57
|
-
|
70
|
+
|
58
71
|
// Sign extend if negative
|
59
72
|
if is_negative {
|
60
73
|
let shift = 16 - (bytes.len() * 8);
|
61
74
|
value = (value << shift) >> shift;
|
62
75
|
}
|
63
|
-
|
76
|
+
|
64
77
|
Ok(format_decimal_with_i32_scale(value, scale))
|
65
78
|
}
|
66
79
|
3..=4 => {
|
67
80
|
// For 3-4 bytes, use i32
|
68
81
|
let mut value: i32 = 0;
|
69
82
|
let is_negative = bytes[0] & 0x80 != 0;
|
70
|
-
|
83
|
+
|
71
84
|
for &byte in bytes {
|
72
85
|
value = (value << 8) | (byte as i32);
|
73
86
|
}
|
74
|
-
|
87
|
+
|
75
88
|
// Sign extend if negative
|
76
89
|
if is_negative {
|
77
90
|
let shift = 32 - (bytes.len() * 8);
|
78
91
|
value = (value << shift) >> shift;
|
79
92
|
}
|
80
|
-
|
93
|
+
|
81
94
|
Ok(format_decimal_with_i32_scale(value, scale))
|
82
95
|
}
|
83
96
|
5..=8 => {
|
@@ -114,8 +127,79 @@ fn bytes_to_decimal(bytes: &[u8], scale: i32) -> Result<String, ParquetGemError>
|
|
114
127
|
|
115
128
|
Ok(format_decimal_with_i32_scale(value, scale))
|
116
129
|
}
|
130
|
+
17..=32 => {
|
131
|
+
// For 17-32 bytes, we need arbitrary precision handling
|
132
|
+
// Check if the number is negative (MSB of first byte)
|
133
|
+
let is_negative = bytes[0] & 0x80 != 0;
|
134
|
+
|
135
|
+
if is_negative {
|
136
|
+
// For negative numbers, we need to compute two's complement
|
137
|
+
// First, invert all bits
|
138
|
+
let mut inverted = Vec::with_capacity(bytes.len());
|
139
|
+
for &byte in bytes {
|
140
|
+
inverted.push(!byte);
|
141
|
+
}
|
142
|
+
|
143
|
+
// Then add 1
|
144
|
+
let mut carry = 1u8;
|
145
|
+
for i in (0..inverted.len()).rev() {
|
146
|
+
let (sum, new_carry) = inverted[i].overflowing_add(carry);
|
147
|
+
inverted[i] = sum;
|
148
|
+
carry = if new_carry { 1 } else { 0 };
|
149
|
+
}
|
150
|
+
|
151
|
+
// Convert to decimal string
|
152
|
+
let mut result = String::new();
|
153
|
+
let mut remainder = inverted;
|
154
|
+
|
155
|
+
// Repeatedly divide by 10 to get decimal digits
|
156
|
+
while !remainder.iter().all(|&b| b == 0) {
|
157
|
+
let mut carry = 0u16;
|
158
|
+
for i in 0..remainder.len() {
|
159
|
+
let temp = (carry << 8) | (remainder[i] as u16);
|
160
|
+
remainder[i] = (temp / 10) as u8;
|
161
|
+
carry = temp % 10;
|
162
|
+
}
|
163
|
+
result.push_str(&carry.to_string());
|
164
|
+
}
|
165
|
+
|
166
|
+
// The digits are in reverse order
|
167
|
+
if result.is_empty() {
|
168
|
+
result = "0".to_string();
|
169
|
+
} else {
|
170
|
+
result = result.chars().rev().collect();
|
171
|
+
}
|
172
|
+
|
173
|
+
// Add negative sign and format with scale
|
174
|
+
Ok(format_decimal_with_i32_scale(format!("-{}", result), scale))
|
175
|
+
} else {
|
176
|
+
// For positive numbers, direct conversion
|
177
|
+
let mut result = String::new();
|
178
|
+
let mut remainder = bytes.to_vec();
|
179
|
+
|
180
|
+
// Repeatedly divide by 10 to get decimal digits
|
181
|
+
while !remainder.iter().all(|&b| b == 0) {
|
182
|
+
let mut carry = 0u16;
|
183
|
+
for i in 0..remainder.len() {
|
184
|
+
let temp = (carry << 8) | (remainder[i] as u16);
|
185
|
+
remainder[i] = (temp / 10) as u8;
|
186
|
+
carry = temp % 10;
|
187
|
+
}
|
188
|
+
result.push_str(&carry.to_string());
|
189
|
+
}
|
190
|
+
|
191
|
+
// The digits are in reverse order
|
192
|
+
if result.is_empty() {
|
193
|
+
result = "0".to_string();
|
194
|
+
} else {
|
195
|
+
result = result.chars().rev().collect();
|
196
|
+
}
|
197
|
+
|
198
|
+
Ok(format_decimal_with_i32_scale(result, scale))
|
199
|
+
}
|
200
|
+
}
|
117
201
|
_ => Err(ParquetGemError::InvalidDecimal(format!(
|
118
|
-
"Unsupported decimal byte array size: {}",
|
202
|
+
"Unsupported decimal byte array size: {} (maximum 32 bytes)",
|
119
203
|
bytes.len()
|
120
204
|
))),
|
121
205
|
}
|
@@ -185,17 +185,18 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
185
185
|
// 2. When precision only - use scale 0
|
186
186
|
// 3. When scale only - use max precision (38)
|
187
187
|
let (precision, scale) = match (precision_val, scale_val) {
|
188
|
-
(None, None) => (38, 0),
|
188
|
+
(None, None) => (38, 0), // Maximum accuracy, scale 0
|
189
189
|
(Some(p), None) => {
|
190
190
|
// Precision provided, scale defaults to 0
|
191
191
|
let prec = u8::try_convert(p).map_err(|_| {
|
192
192
|
MagnusError::new(
|
193
193
|
ruby.exception_type_error(),
|
194
|
-
"Invalid precision value for decimal type, expected a positive integer"
|
194
|
+
"Invalid precision value for decimal type, expected a positive integer"
|
195
|
+
.to_string(),
|
195
196
|
)
|
196
197
|
})?;
|
197
198
|
(prec, 0)
|
198
|
-
}
|
199
|
+
}
|
199
200
|
(None, Some(s)) => {
|
200
201
|
// Scale provided, precision set to maximum (38)
|
201
202
|
let scl = i8::try_convert(s).map_err(|_| {
|
@@ -205,13 +206,14 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
205
206
|
)
|
206
207
|
})?;
|
207
208
|
(38, scl)
|
208
|
-
}
|
209
|
+
}
|
209
210
|
(Some(p), Some(s)) => {
|
210
211
|
// Both provided
|
211
212
|
let prec = u8::try_convert(p).map_err(|_| {
|
212
213
|
MagnusError::new(
|
213
214
|
ruby.exception_type_error(),
|
214
|
-
"Invalid precision value for decimal type, expected a positive integer"
|
215
|
+
"Invalid precision value for decimal type, expected a positive integer"
|
216
|
+
.to_string(),
|
215
217
|
)
|
216
218
|
})?;
|
217
219
|
let scl = i8::try_convert(s).map_err(|_| {
|
@@ -294,6 +296,7 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
|
|
294
296
|
"timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
|
295
297
|
"timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
|
296
298
|
"decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
|
299
|
+
"decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
|
297
300
|
_ => None,
|
298
301
|
}
|
299
302
|
}
|
@@ -321,6 +324,9 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
|
|
321
324
|
PrimitiveType::Decimal128(precision, scale) => {
|
322
325
|
ArrowDataType::Decimal128(*precision, *scale)
|
323
326
|
}
|
327
|
+
PrimitiveType::Decimal256(precision, scale) => {
|
328
|
+
ArrowDataType::Decimal256(*precision, *scale)
|
329
|
+
}
|
324
330
|
PrimitiveType::Boolean => ArrowDataType::Boolean,
|
325
331
|
PrimitiveType::String => ArrowDataType::Utf8,
|
326
332
|
PrimitiveType::Binary => ArrowDataType::Binary,
|
@@ -243,6 +243,7 @@ pub fn parquet_schema_type_to_arrow_data_type(
|
|
243
243
|
PrimitiveType::Float32 => DataType::Float32,
|
244
244
|
PrimitiveType::Float64 => DataType::Float64,
|
245
245
|
PrimitiveType::Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
|
246
|
+
PrimitiveType::Decimal256(precision, scale) => DataType::Decimal256(*precision, *scale),
|
246
247
|
PrimitiveType::String => DataType::Utf8,
|
247
248
|
PrimitiveType::Binary => DataType::Binary,
|
248
249
|
PrimitiveType::Boolean => DataType::Boolean,
|
@@ -381,6 +382,22 @@ fn create_arrow_builder_for_type(
|
|
381
382
|
|
382
383
|
Ok(Box::new(builder_with_precision))
|
383
384
|
}
|
385
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal256(precision, scale)) => {
|
386
|
+
// Create a Decimal256Builder for the Decimal256 column (no truncation to Decimal128)
|
387
|
+
let builder = Decimal256Builder::with_capacity(cap);
|
388
|
+
|
389
|
+
// Set precision and scale for the decimal and return the new builder
|
390
|
+
let builder_with_precision = builder
|
391
|
+
.with_precision_and_scale(*precision, *scale)
|
392
|
+
.map_err(|e| {
|
393
|
+
MagnusError::new(
|
394
|
+
magnus::exception::runtime_error(),
|
395
|
+
format!("Failed to set precision and scale: {}", e),
|
396
|
+
)
|
397
|
+
})?;
|
398
|
+
|
399
|
+
Ok(Box::new(builder_with_precision))
|
400
|
+
}
|
384
401
|
ParquetSchemaType::Primitive(PrimitiveType::String) => {
|
385
402
|
Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
|
386
403
|
}
|
@@ -891,6 +908,187 @@ fn fill_builder(
|
|
891
908
|
}
|
892
909
|
Ok(())
|
893
910
|
}
|
911
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal256(_precision, scale)) => {
|
912
|
+
let typed_builder = builder
|
913
|
+
.as_any_mut()
|
914
|
+
.downcast_mut::<Decimal256Builder>()
|
915
|
+
.expect("Builder mismatch: expected Decimal256Builder for Decimal256");
|
916
|
+
|
917
|
+
for val in values {
|
918
|
+
match val {
|
919
|
+
ParquetValue::Decimal256(d, _scale) => typed_builder.append_value(*d),
|
920
|
+
ParquetValue::Decimal128(d, _scale) => {
|
921
|
+
// Convert i128 to i256
|
922
|
+
typed_builder.append_value(arrow_buffer::i256::from_i128(*d))
|
923
|
+
}
|
924
|
+
ParquetValue::Float64(f) => {
|
925
|
+
// Scale the float to the desired precision and scale
|
926
|
+
// For large values, use BigInt to avoid overflow
|
927
|
+
let scaled = *f * 10_f64.powi(*scale as i32);
|
928
|
+
if scaled >= i128::MIN as f64 && scaled <= i128::MAX as f64 {
|
929
|
+
let scaled_value = scaled as i128;
|
930
|
+
typed_builder.append_value(arrow_buffer::i256::from_i128(scaled_value))
|
931
|
+
} else {
|
932
|
+
// Use BigInt for values that don't fit in i128
|
933
|
+
use num::{BigInt, FromPrimitive};
|
934
|
+
let bigint = BigInt::from_f64(scaled).ok_or_else(|| {
|
935
|
+
MagnusError::new(
|
936
|
+
magnus::exception::type_error(),
|
937
|
+
format!("Failed to convert float {} to BigInt", f),
|
938
|
+
)
|
939
|
+
})?;
|
940
|
+
let bytes = bigint.to_signed_bytes_le();
|
941
|
+
if bytes.len() <= 32 {
|
942
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
943
|
+
[0xff; 32]
|
944
|
+
} else {
|
945
|
+
[0; 32]
|
946
|
+
};
|
947
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
948
|
+
typed_builder.append_value(arrow_buffer::i256::from_le_bytes(buf))
|
949
|
+
} else {
|
950
|
+
return Err(MagnusError::new(
|
951
|
+
magnus::exception::type_error(),
|
952
|
+
format!(
|
953
|
+
"Float value {} scaled to {} is too large for Decimal256",
|
954
|
+
f, scaled
|
955
|
+
),
|
956
|
+
));
|
957
|
+
}
|
958
|
+
}
|
959
|
+
}
|
960
|
+
ParquetValue::Float32(flo) => {
|
961
|
+
// Scale the float to the desired precision and scale
|
962
|
+
let scaled = (*flo as f64) * 10_f64.powi(*scale as i32);
|
963
|
+
if scaled >= i128::MIN as f64 && scaled <= i128::MAX as f64 {
|
964
|
+
let scaled_value = scaled as i128;
|
965
|
+
typed_builder.append_value(arrow_buffer::i256::from_i128(scaled_value))
|
966
|
+
} else {
|
967
|
+
// Use BigInt for values that don't fit in i128
|
968
|
+
use num::{BigInt, FromPrimitive};
|
969
|
+
let bigint = BigInt::from_f64(scaled).ok_or_else(|| {
|
970
|
+
MagnusError::new(
|
971
|
+
magnus::exception::type_error(),
|
972
|
+
format!("Failed to convert float {} to BigInt", flo),
|
973
|
+
)
|
974
|
+
})?;
|
975
|
+
let bytes = bigint.to_signed_bytes_le();
|
976
|
+
if bytes.len() <= 32 {
|
977
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
978
|
+
[0xff; 32]
|
979
|
+
} else {
|
980
|
+
[0; 32]
|
981
|
+
};
|
982
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
983
|
+
typed_builder.append_value(arrow_buffer::i256::from_le_bytes(buf))
|
984
|
+
} else {
|
985
|
+
return Err(MagnusError::new(
|
986
|
+
magnus::exception::type_error(),
|
987
|
+
format!(
|
988
|
+
"Float value {} scaled is too large for Decimal256",
|
989
|
+
flo
|
990
|
+
),
|
991
|
+
));
|
992
|
+
}
|
993
|
+
}
|
994
|
+
}
|
995
|
+
ParquetValue::Int64(i) => {
|
996
|
+
// Scale the integer to the desired scale
|
997
|
+
let base = arrow_buffer::i256::from_i128(*i as i128);
|
998
|
+
if *scale <= 38 {
|
999
|
+
// Can use i128 multiplication for scale <= 38
|
1000
|
+
let scale_factor =
|
1001
|
+
arrow_buffer::i256::from_i128(10_i128.pow(*scale as u32));
|
1002
|
+
match base.checked_mul(scale_factor) {
|
1003
|
+
Some(scaled) => typed_builder.append_value(scaled),
|
1004
|
+
None => {
|
1005
|
+
return Err(MagnusError::new(
|
1006
|
+
magnus::exception::type_error(),
|
1007
|
+
format!(
|
1008
|
+
"Integer {} scaled by {} overflows Decimal256",
|
1009
|
+
i, scale
|
1010
|
+
),
|
1011
|
+
));
|
1012
|
+
}
|
1013
|
+
}
|
1014
|
+
} else {
|
1015
|
+
// For very large scales, use BigInt
|
1016
|
+
use num::BigInt;
|
1017
|
+
let bigint = BigInt::from(*i) * BigInt::from(10).pow(*scale as u32);
|
1018
|
+
let bytes = bigint.to_signed_bytes_le();
|
1019
|
+
if bytes.len() <= 32 {
|
1020
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
1021
|
+
[0xff; 32]
|
1022
|
+
} else {
|
1023
|
+
[0; 32]
|
1024
|
+
};
|
1025
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
1026
|
+
typed_builder.append_value(arrow_buffer::i256::from_le_bytes(buf))
|
1027
|
+
} else {
|
1028
|
+
return Err(MagnusError::new(
|
1029
|
+
magnus::exception::type_error(),
|
1030
|
+
format!(
|
1031
|
+
"Integer {} scaled by {} is too large for Decimal256",
|
1032
|
+
i, scale
|
1033
|
+
),
|
1034
|
+
));
|
1035
|
+
}
|
1036
|
+
}
|
1037
|
+
}
|
1038
|
+
ParquetValue::Int32(i) => {
|
1039
|
+
// Scale the integer to the desired scale
|
1040
|
+
let base = arrow_buffer::i256::from_i128(*i as i128);
|
1041
|
+
if *scale <= 38 {
|
1042
|
+
// Can use i128 multiplication for scale <= 38
|
1043
|
+
let scale_factor =
|
1044
|
+
arrow_buffer::i256::from_i128(10_i128.pow(*scale as u32));
|
1045
|
+
match base.checked_mul(scale_factor) {
|
1046
|
+
Some(scaled) => typed_builder.append_value(scaled),
|
1047
|
+
None => {
|
1048
|
+
return Err(MagnusError::new(
|
1049
|
+
magnus::exception::type_error(),
|
1050
|
+
format!(
|
1051
|
+
"Integer {} scaled by {} overflows Decimal256",
|
1052
|
+
i, scale
|
1053
|
+
),
|
1054
|
+
));
|
1055
|
+
}
|
1056
|
+
}
|
1057
|
+
} else {
|
1058
|
+
// For very large scales, use BigInt
|
1059
|
+
use num::BigInt;
|
1060
|
+
let bigint = BigInt::from(*i) * BigInt::from(10).pow(*scale as u32);
|
1061
|
+
let bytes = bigint.to_signed_bytes_le();
|
1062
|
+
if bytes.len() <= 32 {
|
1063
|
+
let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
|
1064
|
+
[0xff; 32]
|
1065
|
+
} else {
|
1066
|
+
[0; 32]
|
1067
|
+
};
|
1068
|
+
buf[..bytes.len()].copy_from_slice(&bytes);
|
1069
|
+
typed_builder.append_value(arrow_buffer::i256::from_le_bytes(buf))
|
1070
|
+
} else {
|
1071
|
+
return Err(MagnusError::new(
|
1072
|
+
magnus::exception::type_error(),
|
1073
|
+
format!(
|
1074
|
+
"Integer {} scaled by {} is too large for Decimal256",
|
1075
|
+
i, scale
|
1076
|
+
),
|
1077
|
+
));
|
1078
|
+
}
|
1079
|
+
}
|
1080
|
+
}
|
1081
|
+
ParquetValue::Null => typed_builder.append_null(),
|
1082
|
+
other => {
|
1083
|
+
return Err(MagnusError::new(
|
1084
|
+
magnus::exception::type_error(),
|
1085
|
+
format!("Expected numeric value for Decimal256, got {:?}", other),
|
1086
|
+
))
|
1087
|
+
}
|
1088
|
+
}
|
1089
|
+
}
|
1090
|
+
Ok(())
|
1091
|
+
}
|
894
1092
|
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
|
895
1093
|
let typed_builder = builder
|
896
1094
|
.as_any_mut()
|
@@ -1172,6 +1370,15 @@ fn fill_builder(
|
|
1172
1370
|
)
|
1173
1371
|
})?
|
1174
1372
|
.append_value(*x),
|
1373
|
+
ParquetValue::Decimal256(x, _scale) => typed_builder
|
1374
|
+
.field_builder::<Decimal256Builder>(i)
|
1375
|
+
.ok_or_else(|| {
|
1376
|
+
MagnusError::new(
|
1377
|
+
magnus::exception::type_error(),
|
1378
|
+
"Failed to coerce into Decimal256Builder",
|
1379
|
+
)
|
1380
|
+
})?
|
1381
|
+
.append_value(*x),
|
1175
1382
|
ParquetValue::Date32(x) => typed_builder
|
1176
1383
|
.field_builder::<Date32Builder>(i)
|
1177
1384
|
.ok_or_else(|| {
|
@@ -1377,6 +1584,15 @@ fn fill_builder(
|
|
1377
1584
|
)
|
1378
1585
|
})?
|
1379
1586
|
.append_null(),
|
1587
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal256(_, _)) => typed_builder
|
1588
|
+
.field_builder::<Decimal256Builder>(i)
|
1589
|
+
.ok_or_else(|| {
|
1590
|
+
MagnusError::new(
|
1591
|
+
magnus::exception::type_error(),
|
1592
|
+
"Failed to coerce into Decimal256Builder for Decimal256",
|
1593
|
+
)
|
1594
|
+
})?
|
1595
|
+
.append_null(),
|
1380
1596
|
ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
|
1381
1597
|
.field_builder::<StringBuilder>(i)
|
1382
1598
|
.ok_or_else(|| {
|
@@ -145,6 +145,53 @@ impl FromStr for ParquetSchemaType<'_> {
|
|
145
145
|
}
|
146
146
|
}
|
147
147
|
|
148
|
+
// Check if it's a decimal256 type with precision and scale
|
149
|
+
if let Some(decimal_params) = s.strip_prefix("decimal256(").and_then(|s| s.strip_suffix(")")) {
|
150
|
+
let parts: Vec<&str> = decimal_params.split(',').collect();
|
151
|
+
|
152
|
+
// Handle both single parameter (precision only) and two parameters (precision and scale)
|
153
|
+
if parts.len() == 1 {
|
154
|
+
// Only precision provided, scale defaults to 0
|
155
|
+
let precision = parts[0].trim().parse::<u8>().map_err(|_| {
|
156
|
+
MagnusError::new(
|
157
|
+
magnus::exception::runtime_error(),
|
158
|
+
format!("Invalid precision value in decimal256 type: {}", parts[0]),
|
159
|
+
)
|
160
|
+
})?;
|
161
|
+
|
162
|
+
return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal256(
|
163
|
+
precision, 0,
|
164
|
+
)));
|
165
|
+
} else if parts.len() == 2 {
|
166
|
+
// Both precision and scale provided
|
167
|
+
let precision = parts[0].trim().parse::<u8>().map_err(|_| {
|
168
|
+
MagnusError::new(
|
169
|
+
magnus::exception::runtime_error(),
|
170
|
+
format!("Invalid precision value in decimal256 type: {}", parts[0]),
|
171
|
+
)
|
172
|
+
})?;
|
173
|
+
|
174
|
+
let scale = parts[1].trim().parse::<i8>().map_err(|_| {
|
175
|
+
MagnusError::new(
|
176
|
+
magnus::exception::runtime_error(),
|
177
|
+
format!("Invalid scale value in decimal256 type: {}", parts[1]),
|
178
|
+
)
|
179
|
+
})?;
|
180
|
+
|
181
|
+
return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal256(
|
182
|
+
precision, scale,
|
183
|
+
)));
|
184
|
+
} else {
|
185
|
+
return Err(MagnusError::new(
|
186
|
+
magnus::exception::runtime_error(),
|
187
|
+
format!(
|
188
|
+
"Invalid decimal256 format. Expected 'decimal256(precision)' or 'decimal256(precision,scale)', got '{}'",
|
189
|
+
s
|
190
|
+
),
|
191
|
+
));
|
192
|
+
}
|
193
|
+
}
|
194
|
+
|
148
195
|
// Handle primitive types
|
149
196
|
match s {
|
150
197
|
"int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
|
@@ -166,6 +213,9 @@ impl FromStr for ParquetSchemaType<'_> {
|
|
166
213
|
"decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
|
167
214
|
38, 0,
|
168
215
|
))),
|
216
|
+
"decimal256" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal256(
|
217
|
+
38, 0,
|
218
|
+
))),
|
169
219
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
170
220
|
item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
|
171
221
|
format: None,
|
@@ -197,6 +197,9 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
|
|
197
197
|
DataType::Decimal128(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal128(
|
198
198
|
*precision, *scale,
|
199
199
|
))),
|
200
|
+
DataType::Decimal256(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal256(
|
201
|
+
*precision, *scale,
|
202
|
+
))),
|
200
203
|
DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
|
201
204
|
DataType::Date64 => {
|
202
205
|
// Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
|
@@ -170,6 +170,9 @@ fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemEr
|
|
170
170
|
PrimitiveType::TimestampMicros => {
|
171
171
|
PST::Primitive(PrimitiveType::TimestampMicros)
|
172
172
|
}
|
173
|
+
PrimitiveType::Decimal256(precision, scale) => {
|
174
|
+
PST::Primitive(PrimitiveType::Decimal256(precision, scale))
|
175
|
+
}
|
173
176
|
},
|
174
177
|
SchemaNode::List { .. }
|
175
178
|
| SchemaNode::Map { .. }
|
@@ -258,6 +258,7 @@ pub fn estimate_value_size(
|
|
258
258
|
| PST::Primitive(PrimitiveType::Float64) => Ok(8),
|
259
259
|
PST::Primitive(PrimitiveType::Boolean) => Ok(1),
|
260
260
|
PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
|
261
|
+
PST::Primitive(PrimitiveType::Decimal256(_, _)) => Ok(32),
|
261
262
|
PST::Primitive(PrimitiveType::Date32)
|
262
263
|
| PST::Primitive(PrimitiveType::TimestampMillis)
|
263
264
|
| PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.9
|
4
|
+
version: 0.5.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-06-
|
11
|
+
date: 2025-06-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|