parquet 0.5.2 → 0.5.4
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +9 -1
- data/README.md +228 -4
- data/ext/parquet/Cargo.toml +4 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/reader/common.rs +7 -6
- data/ext/parquet/src/reader/mod.rs +204 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
- data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
- data/ext/parquet/src/ruby_reader.rs +3 -5
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +8 -5
- data/ext/parquet/src/types/parquet_value.rs +199 -7
- data/ext/parquet/src/types/record_types.rs +16 -5
- data/ext/parquet/src/types/schema_converter.rs +118 -11
- data/ext/parquet/src/types/schema_node.rs +83 -2
- data/ext/parquet/src/types/timestamp.rs +6 -10
- data/ext/parquet/src/types/type_conversion.rs +84 -11
- data/ext/parquet/src/types/writer_types.rs +59 -11
- data/ext/parquet/src/utils.rs +6 -6
- data/ext/parquet/src/writer/mod.rs +25 -17
- data/ext/parquet/src/writer/write_columns.rs +27 -24
- data/ext/parquet/src/writer/write_rows.rs +14 -15
- data/lib/parquet/schema.rb +89 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +11 -0
- metadata +3 -2
@@ -68,7 +68,7 @@ fn parse_struct_node(
|
|
68
68
|
})?;
|
69
69
|
|
70
70
|
// Check for empty struct immediately
|
71
|
-
if fields_arr.
|
71
|
+
if fields_arr.is_empty() {
|
72
72
|
return Err(MagnusError::new(
|
73
73
|
ruby.exception_arg_error(),
|
74
74
|
format!("Cannot create a struct with zero fields. Struct name: '{}'. Parquet doesn't support empty structs", name)
|
@@ -175,6 +175,83 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
175
175
|
"struct" => parse_struct_node(ruby, &node_hash, name, nullable),
|
176
176
|
"list" => parse_list_node(ruby, &node_hash, name, nullable),
|
177
177
|
"map" => parse_map_node(ruby, &node_hash, name, nullable),
|
178
|
+
"decimal" => {
|
179
|
+
// Check for precision and scale
|
180
|
+
let precision_val = node_hash.get(Symbol::new("precision"));
|
181
|
+
let scale_val = node_hash.get(Symbol::new("scale"));
|
182
|
+
|
183
|
+
// Handle different precision/scale combinations:
|
184
|
+
// 1. When no precision or scale - use max precision (38)
|
185
|
+
// 2. When precision only - use scale 0
|
186
|
+
// 3. When scale only - use max precision (38)
|
187
|
+
let (precision, scale) = match (precision_val, scale_val) {
|
188
|
+
(None, None) => (38, 0), // Maximum accuracy, scale 0
|
189
|
+
(Some(p), None) => {
|
190
|
+
// Precision provided, scale defaults to 0
|
191
|
+
let prec = u8::try_convert(p).map_err(|_| {
|
192
|
+
MagnusError::new(
|
193
|
+
ruby.exception_type_error(),
|
194
|
+
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
195
|
+
)
|
196
|
+
})?;
|
197
|
+
(prec, 0)
|
198
|
+
},
|
199
|
+
(None, Some(s)) => {
|
200
|
+
// Scale provided, precision set to maximum (38)
|
201
|
+
let scl = i8::try_convert(s).map_err(|_| {
|
202
|
+
MagnusError::new(
|
203
|
+
ruby.exception_type_error(),
|
204
|
+
"Invalid scale value for decimal type, expected an integer".to_string(),
|
205
|
+
)
|
206
|
+
})?;
|
207
|
+
(38, scl)
|
208
|
+
},
|
209
|
+
(Some(p), Some(s)) => {
|
210
|
+
// Both provided
|
211
|
+
let prec = u8::try_convert(p).map_err(|_| {
|
212
|
+
MagnusError::new(
|
213
|
+
ruby.exception_type_error(),
|
214
|
+
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
215
|
+
)
|
216
|
+
})?;
|
217
|
+
let scl = i8::try_convert(s).map_err(|_| {
|
218
|
+
MagnusError::new(
|
219
|
+
ruby.exception_type_error(),
|
220
|
+
"Invalid scale value for decimal type, expected an integer".to_string(),
|
221
|
+
)
|
222
|
+
})?;
|
223
|
+
(prec, scl)
|
224
|
+
}
|
225
|
+
};
|
226
|
+
|
227
|
+
// Validate precision is in a valid range
|
228
|
+
if precision < 1 {
|
229
|
+
return Err(MagnusError::new(
|
230
|
+
ruby.exception_arg_error(),
|
231
|
+
format!(
|
232
|
+
"Precision for decimal type must be at least 1, got {}",
|
233
|
+
precision
|
234
|
+
),
|
235
|
+
));
|
236
|
+
}
|
237
|
+
|
238
|
+
if precision > 38 {
|
239
|
+
return Err(MagnusError::new(
|
240
|
+
ruby.exception_arg_error(),
|
241
|
+
format!(
|
242
|
+
"Precision for decimal type cannot exceed 38, got {}",
|
243
|
+
precision
|
244
|
+
),
|
245
|
+
));
|
246
|
+
}
|
247
|
+
|
248
|
+
Ok(SchemaNode::Primitive {
|
249
|
+
name,
|
250
|
+
parquet_type: PrimitiveType::Decimal128(precision, scale),
|
251
|
+
nullable,
|
252
|
+
format,
|
253
|
+
})
|
254
|
+
}
|
178
255
|
// For primitives, provide better error messages when type isn't recognized
|
179
256
|
other => {
|
180
257
|
if let Some(parquet_type) = parse_primitive_type(other) {
|
@@ -188,7 +265,7 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
188
265
|
Err(MagnusError::new(
|
189
266
|
magnus::exception::arg_error(),
|
190
267
|
format!(
|
191
|
-
"Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros",
|
268
|
+
"Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros, decimal",
|
192
269
|
other
|
193
270
|
)
|
194
271
|
))
|
@@ -216,6 +293,7 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
|
|
216
293
|
"date" | "date32" => Some(PrimitiveType::Date32),
|
217
294
|
"timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
|
218
295
|
"timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
|
296
|
+
"decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
|
219
297
|
_ => None,
|
220
298
|
}
|
221
299
|
}
|
@@ -240,6 +318,9 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
|
|
240
318
|
PrimitiveType::UInt64 => ArrowDataType::UInt64,
|
241
319
|
PrimitiveType::Float32 => ArrowDataType::Float32,
|
242
320
|
PrimitiveType::Float64 => ArrowDataType::Float64,
|
321
|
+
PrimitiveType::Decimal128(precision, scale) => {
|
322
|
+
ArrowDataType::Decimal128(*precision, *scale)
|
323
|
+
}
|
243
324
|
PrimitiveType::Boolean => ArrowDataType::Boolean,
|
244
325
|
PrimitiveType::String => ArrowDataType::Utf8,
|
245
326
|
PrimitiveType::Binary => ArrowDataType::Binary,
|
@@ -2,15 +2,11 @@ use super::*;
|
|
2
2
|
|
3
3
|
pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
|
4
4
|
let (ts, tz) = match value {
|
5
|
-
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)
|
6
|
-
ParquetValue::TimestampMillis(ts, tz) =>
|
7
|
-
|
8
|
-
}
|
9
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
10
|
-
(jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
|
11
|
-
}
|
5
|
+
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)?, tz),
|
6
|
+
ParquetValue::TimestampMillis(ts, tz) => (jiff::Timestamp::from_millisecond(*ts)?, tz),
|
7
|
+
ParquetValue::TimestampMicros(ts, tz) => (jiff::Timestamp::from_microsecond(*ts)?, tz),
|
12
8
|
ParquetValue::TimestampNanos(ts, tz) => {
|
13
|
-
(jiff::Timestamp::from_nanosecond(*ts as i128)
|
9
|
+
(jiff::Timestamp::from_nanosecond(*ts as i128)?, tz)
|
14
10
|
}
|
15
11
|
_ => {
|
16
12
|
return Err(MagnusError::new(
|
@@ -50,7 +46,7 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
|
|
50
46
|
Ok(ts.to_zoned(tz).timestamp())
|
51
47
|
} else {
|
52
48
|
// Try IANA timezone
|
53
|
-
match ts.in_tz(
|
49
|
+
match ts.in_tz(tz) {
|
54
50
|
Ok(zoned) => Ok(zoned.timestamp()),
|
55
51
|
Err(_) => Ok(ts), // Fall back to UTC if timezone is invalid
|
56
52
|
}
|
@@ -85,7 +81,7 @@ macro_rules! impl_timestamp_conversion {
|
|
85
81
|
#[macro_export]
|
86
82
|
macro_rules! impl_date_conversion {
|
87
83
|
($value:expr, $handle:expr) => {{
|
88
|
-
let ts = jiff::Timestamp::from_second(($value as i64) * 86400)
|
84
|
+
let ts = jiff::Timestamp::from_second(($value as i64) * 86400)?;
|
89
85
|
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
90
86
|
Ok(formatted.into_value_with($handle))
|
91
87
|
}};
|
@@ -2,8 +2,8 @@ use std::str::FromStr;
|
|
2
2
|
use std::sync::Arc;
|
3
3
|
|
4
4
|
use super::*;
|
5
|
-
use arrow_array::builder::MapFieldNames;
|
6
5
|
use arrow_array::builder::*;
|
6
|
+
use arrow_array::builder::MapFieldNames;
|
7
7
|
use arrow_schema::{DataType, Field, Fields, TimeUnit};
|
8
8
|
use jiff::tz::{Offset, TimeZone};
|
9
9
|
use magnus::{RArray, RString, TryConvert};
|
@@ -41,9 +41,9 @@ pub fn convert_to_date32(
|
|
41
41
|
let s = String::try_convert(value)?;
|
42
42
|
// Parse string into Date using jiff
|
43
43
|
let date = if let Some(fmt) = format {
|
44
|
-
jiff::civil::Date::strptime(
|
44
|
+
jiff::civil::Date::strptime(fmt, &s).or_else(|e1| {
|
45
45
|
// Try parsing as DateTime and convert to Date with zero offset
|
46
|
-
jiff::civil::DateTime::strptime(
|
46
|
+
jiff::civil::DateTime::strptime(fmt, &s)
|
47
47
|
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
48
48
|
.map(|dt| dt.date())
|
49
49
|
.map_err(|e2| {
|
@@ -78,7 +78,7 @@ pub fn convert_to_date32(
|
|
78
78
|
.timestamp();
|
79
79
|
|
80
80
|
// Convert to epoch days
|
81
|
-
Ok((x.as_second()
|
81
|
+
Ok((x.as_second() / 86400) as i32)
|
82
82
|
} else if value.is_kind_of(ruby.class_time()) {
|
83
83
|
// Convert Time object to epoch days
|
84
84
|
let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
|
@@ -100,10 +100,10 @@ pub fn convert_to_timestamp_millis(
|
|
100
100
|
let s = String::try_convert(value)?;
|
101
101
|
// Parse string into Timestamp using jiff
|
102
102
|
let timestamp = if let Some(fmt) = format {
|
103
|
-
jiff::Timestamp::strptime(
|
103
|
+
jiff::Timestamp::strptime(fmt, &s)
|
104
104
|
.or_else(|e1| {
|
105
105
|
// Try parsing as DateTime and convert to Timestamp with zero offset
|
106
|
-
jiff::civil::DateTime::strptime(
|
106
|
+
jiff::civil::DateTime::strptime(fmt, &s)
|
107
107
|
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
108
108
|
.map(|dt| dt.timestamp())
|
109
109
|
.map_err(|e2| {
|
@@ -150,9 +150,9 @@ pub fn convert_to_timestamp_micros(
|
|
150
150
|
let s = String::try_convert(value)?;
|
151
151
|
// Parse string into Timestamp using jiff
|
152
152
|
let timestamp = if let Some(fmt) = format {
|
153
|
-
jiff::Timestamp::strptime(
|
153
|
+
jiff::Timestamp::strptime(fmt, &s).or_else(|e1| {
|
154
154
|
// Try parsing as DateTime and convert to Timestamp with zero offset
|
155
|
-
jiff::civil::DateTime::strptime(
|
155
|
+
jiff::civil::DateTime::strptime(fmt, &s).and_then(|dt| {
|
156
156
|
dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
|
157
157
|
})
|
158
158
|
.map(|dt| dt.timestamp())
|
@@ -242,6 +242,7 @@ pub fn parquet_schema_type_to_arrow_data_type(
|
|
242
242
|
PrimitiveType::UInt64 => DataType::UInt64,
|
243
243
|
PrimitiveType::Float32 => DataType::Float32,
|
244
244
|
PrimitiveType::Float64 => DataType::Float64,
|
245
|
+
PrimitiveType::Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
|
245
246
|
PrimitiveType::String => DataType::Utf8,
|
246
247
|
PrimitiveType::Binary => DataType::Binary,
|
247
248
|
PrimitiveType::Boolean => DataType::Boolean,
|
@@ -364,6 +365,20 @@ fn create_arrow_builder_for_type(
|
|
364
365
|
ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
|
365
366
|
Ok(Box::new(Float64Builder::with_capacity(cap)))
|
366
367
|
}
|
368
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)) => {
|
369
|
+
// Create a Decimal128Builder with specific precision and scale
|
370
|
+
let builder = Decimal128Builder::with_capacity(cap);
|
371
|
+
|
372
|
+
// Set precision and scale for the decimal and return the new builder
|
373
|
+
let builder_with_precision = builder.with_precision_and_scale(*precision, *scale).map_err(|e| {
|
374
|
+
MagnusError::new(
|
375
|
+
magnus::exception::runtime_error(),
|
376
|
+
format!("Failed to set precision and scale: {}", e),
|
377
|
+
)
|
378
|
+
})?;
|
379
|
+
|
380
|
+
Ok(Box::new(builder_with_precision))
|
381
|
+
}
|
367
382
|
ParquetSchemaType::Primitive(PrimitiveType::String) => {
|
368
383
|
Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
|
369
384
|
}
|
@@ -415,7 +430,7 @@ fn create_arrow_builder_for_type(
|
|
415
430
|
ParquetSchemaType::Struct(struct_field) => {
|
416
431
|
// Check for empty struct immediately
|
417
432
|
if struct_field.fields.is_empty() {
|
418
|
-
|
433
|
+
Err(MagnusError::new(
|
419
434
|
magnus::exception::runtime_error(),
|
420
435
|
"Cannot build a struct with zero fields - Parquet doesn't support empty structs".to_string(),
|
421
436
|
))?;
|
@@ -445,7 +460,7 @@ fn create_arrow_builder_for_type(
|
|
445
460
|
|
446
461
|
// Make sure we have the right number of builders
|
447
462
|
if child_field_builders.len() != arrow_fields.len() {
|
448
|
-
|
463
|
+
Err(MagnusError::new(
|
449
464
|
magnus::exception::runtime_error(),
|
450
465
|
format!(
|
451
466
|
"Number of field builders ({}) doesn't match number of arrow fields ({})",
|
@@ -834,6 +849,46 @@ fn fill_builder(
|
|
834
849
|
}
|
835
850
|
Ok(())
|
836
851
|
}
|
852
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_precision, scale)) => {
|
853
|
+
let typed_builder = builder
|
854
|
+
.as_any_mut()
|
855
|
+
.downcast_mut::<Decimal128Builder>()
|
856
|
+
.expect("Builder mismatch: expected Float64Builder");
|
857
|
+
|
858
|
+
for val in values {
|
859
|
+
match val {
|
860
|
+
ParquetValue::Decimal128(d) => typed_builder.append_value(*d),
|
861
|
+
ParquetValue::Float64(f) => {
|
862
|
+
// Scale the float to the desired precision and scale
|
863
|
+
let scaled_value = (*f * 10_f64.powi(*scale as i32)) as i128;
|
864
|
+
typed_builder.append_value(scaled_value)
|
865
|
+
}
|
866
|
+
ParquetValue::Float32(flo) => {
|
867
|
+
// Scale the float to the desired precision and scale
|
868
|
+
let scaled_value = (*flo as f64 * 10_f64.powi(*scale as i32)) as i128;
|
869
|
+
typed_builder.append_value(scaled_value)
|
870
|
+
}
|
871
|
+
ParquetValue::Int64(i) => {
|
872
|
+
// Scale the integer to the desired scale
|
873
|
+
let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
|
874
|
+
typed_builder.append_value(scaled_value)
|
875
|
+
}
|
876
|
+
ParquetValue::Int32(i) => {
|
877
|
+
// Scale the integer to the desired scale
|
878
|
+
let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
|
879
|
+
typed_builder.append_value(scaled_value)
|
880
|
+
}
|
881
|
+
ParquetValue::Null => typed_builder.append_null(),
|
882
|
+
other => {
|
883
|
+
return Err(MagnusError::new(
|
884
|
+
magnus::exception::type_error(),
|
885
|
+
format!("Expected Float64, got {:?}", other),
|
886
|
+
))
|
887
|
+
}
|
888
|
+
}
|
889
|
+
}
|
890
|
+
Ok(())
|
891
|
+
}
|
837
892
|
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
|
838
893
|
let typed_builder = builder
|
839
894
|
.as_any_mut()
|
@@ -954,7 +1009,7 @@ fn fill_builder(
|
|
954
1009
|
.expect("Builder mismatch: expected BinaryBuilder");
|
955
1010
|
for val in values {
|
956
1011
|
match val {
|
957
|
-
ParquetValue::Bytes(b) => typed_builder.append_value(
|
1012
|
+
ParquetValue::Bytes(b) => typed_builder.append_value(b),
|
958
1013
|
ParquetValue::Null => typed_builder.append_null(),
|
959
1014
|
other => {
|
960
1015
|
return Err(MagnusError::new(
|
@@ -1106,6 +1161,15 @@ fn fill_builder(
|
|
1106
1161
|
)
|
1107
1162
|
})?
|
1108
1163
|
.append_value(bytes),
|
1164
|
+
ParquetValue::Decimal128(x) => typed_builder
|
1165
|
+
.field_builder::<Decimal128Builder>(i)
|
1166
|
+
.ok_or_else(|| {
|
1167
|
+
MagnusError::new(
|
1168
|
+
magnus::exception::type_error(),
|
1169
|
+
"Failed to coerce into Decimal128Builder",
|
1170
|
+
)
|
1171
|
+
})?
|
1172
|
+
.append_value(*x),
|
1109
1173
|
ParquetValue::Date32(x) => typed_builder
|
1110
1174
|
.field_builder::<Date32Builder>(i)
|
1111
1175
|
.ok_or_else(|| {
|
@@ -1302,6 +1366,15 @@ fn fill_builder(
|
|
1302
1366
|
)
|
1303
1367
|
})?
|
1304
1368
|
.append_null(),
|
1369
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_, _)) => typed_builder
|
1370
|
+
.field_builder::<Decimal128Builder>(i)
|
1371
|
+
.ok_or_else(|| {
|
1372
|
+
MagnusError::new(
|
1373
|
+
magnus::exception::type_error(),
|
1374
|
+
"Failed to coerce into Decimal128Builder",
|
1375
|
+
)
|
1376
|
+
})?
|
1377
|
+
.append_null(),
|
1305
1378
|
ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
|
1306
1379
|
.field_builder::<StringBuilder>(i)
|
1307
1380
|
.ok_or_else(|| {
|
@@ -59,7 +59,7 @@ impl Write for IoLikeValue {
|
|
59
59
|
}
|
60
60
|
}
|
61
61
|
|
62
|
-
impl
|
62
|
+
impl FromStr for ParquetSchemaType<'_> {
|
63
63
|
type Err = MagnusError;
|
64
64
|
|
65
65
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
@@ -98,6 +98,53 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
98
98
|
})));
|
99
99
|
}
|
100
100
|
|
101
|
+
// Check if it's a decimal type with precision and scale
|
102
|
+
if let Some(decimal_params) = s.strip_prefix("decimal(").and_then(|s| s.strip_suffix(")")) {
|
103
|
+
let parts: Vec<&str> = decimal_params.split(',').collect();
|
104
|
+
|
105
|
+
// Handle both single parameter (precision only) and two parameters (precision and scale)
|
106
|
+
if parts.len() == 1 {
|
107
|
+
// Only precision provided, scale defaults to 0
|
108
|
+
let precision = parts[0].trim().parse::<u8>().map_err(|_| {
|
109
|
+
MagnusError::new(
|
110
|
+
magnus::exception::runtime_error(),
|
111
|
+
format!("Invalid precision value in decimal type: {}", parts[0]),
|
112
|
+
)
|
113
|
+
})?;
|
114
|
+
|
115
|
+
return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
|
116
|
+
precision, 0,
|
117
|
+
)));
|
118
|
+
} else if parts.len() == 2 {
|
119
|
+
// Both precision and scale provided
|
120
|
+
let precision = parts[0].trim().parse::<u8>().map_err(|_| {
|
121
|
+
MagnusError::new(
|
122
|
+
magnus::exception::runtime_error(),
|
123
|
+
format!("Invalid precision value in decimal type: {}", parts[0]),
|
124
|
+
)
|
125
|
+
})?;
|
126
|
+
|
127
|
+
let scale = parts[1].trim().parse::<i8>().map_err(|_| {
|
128
|
+
MagnusError::new(
|
129
|
+
magnus::exception::runtime_error(),
|
130
|
+
format!("Invalid scale value in decimal type: {}", parts[1]),
|
131
|
+
)
|
132
|
+
})?;
|
133
|
+
|
134
|
+
return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
|
135
|
+
precision, scale,
|
136
|
+
)));
|
137
|
+
} else {
|
138
|
+
return Err(MagnusError::new(
|
139
|
+
magnus::exception::runtime_error(),
|
140
|
+
format!(
|
141
|
+
"Invalid decimal format. Expected 'decimal(precision)' or 'decimal(precision,scale)', got '{}'",
|
142
|
+
s
|
143
|
+
),
|
144
|
+
));
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
101
148
|
// Handle primitive types
|
102
149
|
match s {
|
103
150
|
"int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
|
@@ -116,6 +163,9 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
116
163
|
"date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
|
117
164
|
"timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
|
118
165
|
"timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
|
166
|
+
"decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
|
167
|
+
38, 0,
|
168
|
+
))),
|
119
169
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
120
170
|
item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
|
121
171
|
format: None,
|
@@ -129,7 +179,7 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
129
179
|
}
|
130
180
|
}
|
131
181
|
|
132
|
-
impl
|
182
|
+
impl TryConvert for ParquetSchemaType<'_> {
|
133
183
|
fn try_convert(value: Value) -> Result<Self, MagnusError> {
|
134
184
|
let ruby = unsafe { Ruby::get_unchecked() };
|
135
185
|
let schema_type = parse_string_or_symbol(&ruby, value)?;
|
@@ -144,7 +194,7 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
|
|
144
194
|
|
145
195
|
// We know this type is safe to move between threads because it's just an enum
|
146
196
|
// with simple primitive types and strings
|
147
|
-
unsafe impl
|
197
|
+
unsafe impl Send for ParquetSchemaType<'_> {}
|
148
198
|
|
149
199
|
pub enum WriterOutput {
|
150
200
|
File(ArrowWriter<Box<dyn SendableWrite>>),
|
@@ -202,14 +252,12 @@ impl<'a> ColumnCollector<'a> {
|
|
202
252
|
pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
|
203
253
|
use crate::types::ParquetValue;
|
204
254
|
|
205
|
-
if value.is_nil() {
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
));
|
212
|
-
}
|
255
|
+
if value.is_nil() && !self.nullable {
|
256
|
+
// For non-nullable fields, raise an error
|
257
|
+
return Err(MagnusError::new(
|
258
|
+
magnus::exception::runtime_error(),
|
259
|
+
"Cannot write nil value for non-nullable field",
|
260
|
+
));
|
213
261
|
}
|
214
262
|
|
215
263
|
// For all other types, proceed as normal
|
data/ext/parquet/src/utils.rs
CHANGED
@@ -13,12 +13,12 @@ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String
|
|
13
13
|
RString::from_value(value)
|
14
14
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
15
15
|
.to_string()
|
16
|
-
.map(
|
16
|
+
.map(Some)
|
17
17
|
} else if value.is_kind_of(ruby.class_symbol()) {
|
18
18
|
Symbol::from_value(value)
|
19
19
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
20
20
|
.funcall("to_s", ())
|
21
|
-
.map(
|
21
|
+
.map(Some)
|
22
22
|
} else {
|
23
23
|
Err(Error::new(
|
24
24
|
magnus::exception::type_error(),
|
@@ -161,11 +161,11 @@ pub fn parse_parquet_columns_args(
|
|
161
161
|
};
|
162
162
|
|
163
163
|
let batch_size = kwargs.optional.2.flatten();
|
164
|
-
if let Some(
|
165
|
-
if
|
164
|
+
if let Some(batch_size) = batch_size {
|
165
|
+
if batch_size == 0 {
|
166
166
|
return Err(Error::new(
|
167
|
-
|
168
|
-
|
167
|
+
magnus::exception::arg_error(),
|
168
|
+
"Batch size must be greater than 0",
|
169
169
|
));
|
170
170
|
}
|
171
171
|
}
|
@@ -111,13 +111,13 @@ pub fn parse_parquet_write_args(
|
|
111
111
|
if let Some(type_val) = type_val {
|
112
112
|
// If it has a type: :struct, it's the new DSL format
|
113
113
|
// Use parse_string_or_symbol to handle both String and Symbol values
|
114
|
-
let ttype = parse_string_or_symbol(
|
114
|
+
let ttype = parse_string_or_symbol(ruby, type_val)?;
|
115
115
|
if let Some(ref type_str) = ttype {
|
116
116
|
if type_str == "struct" {
|
117
117
|
// Parse using the new schema approach
|
118
|
-
let schema_node = crate::parse_schema_node(
|
118
|
+
let schema_node = crate::parse_schema_node(ruby, schema_value)?;
|
119
119
|
|
120
|
-
validate_schema_node(
|
120
|
+
validate_schema_node(ruby, &schema_node)?;
|
121
121
|
|
122
122
|
return Ok(ParquetWriteArgs {
|
123
123
|
read_from,
|
@@ -143,22 +143,21 @@ pub fn parse_parquet_write_args(
|
|
143
143
|
"Schema fields must be an array",
|
144
144
|
)
|
145
145
|
})?
|
146
|
-
.
|
147
|
-
== 0)
|
146
|
+
.is_empty())
|
148
147
|
{
|
149
148
|
// If schema is nil or an empty array, we need to peek at the first value to determine column count
|
150
149
|
let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
|
151
150
|
// Default to nullable:true for auto-inferred fields
|
152
|
-
crate::infer_schema_from_first_row(
|
151
|
+
crate::infer_schema_from_first_row(ruby, first_value, true)?
|
153
152
|
} else {
|
154
153
|
// Legacy array format - use our centralized parser
|
155
|
-
crate::parse_legacy_schema(
|
154
|
+
crate::parse_legacy_schema(ruby, schema_value)?
|
156
155
|
};
|
157
156
|
|
158
157
|
// Convert the legacy schema fields to SchemaNode (DSL format)
|
159
|
-
let schema_node = crate::legacy_schema_to_dsl(
|
158
|
+
let schema_node = crate::legacy_schema_to_dsl(ruby, schema_fields)?;
|
160
159
|
|
161
|
-
validate_schema_node(
|
160
|
+
validate_schema_node(ruby, &schema_node)?;
|
162
161
|
|
163
162
|
Ok(ParquetWriteArgs {
|
164
163
|
read_from,
|
@@ -195,6 +194,9 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
|
|
195
194
|
}
|
196
195
|
DataType::Float32 => Ok(PST::Primitive(PrimitiveType::Float32)),
|
197
196
|
DataType::Float64 => Ok(PST::Primitive(PrimitiveType::Float64)),
|
197
|
+
DataType::Decimal128(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal128(
|
198
|
+
*precision, *scale,
|
199
|
+
))),
|
198
200
|
DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
|
199
201
|
DataType::Date64 => {
|
200
202
|
// Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
|
@@ -414,15 +416,21 @@ fn create_writer(
|
|
414
416
|
compression: Option<String>,
|
415
417
|
) -> Result<WriterOutput, ParquetGemError> {
|
416
418
|
// Create writer properties with compression based on the option
|
419
|
+
let compression_setting = match compression.map(|s| s.to_lowercase()).as_deref() {
|
420
|
+
Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
|
421
|
+
Some("snappy") => Ok(Compression::SNAPPY),
|
422
|
+
Some("gzip") => Ok(Compression::GZIP(GzipLevel::default())),
|
423
|
+
Some("lz4") => Ok(Compression::LZ4),
|
424
|
+
Some("zstd") => Ok(Compression::ZSTD(ZstdLevel::default())),
|
425
|
+
None => Ok(Compression::UNCOMPRESSED),
|
426
|
+
other => Err(MagnusError::new(
|
427
|
+
magnus::exception::arg_error(),
|
428
|
+
format!("Invalid compression option: {:?}", other),
|
429
|
+
)),
|
430
|
+
}?;
|
431
|
+
|
417
432
|
let props = WriterProperties::builder()
|
418
|
-
.set_compression(
|
419
|
-
Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
|
420
|
-
Some("snappy") => Compression::SNAPPY,
|
421
|
-
Some("gzip") => Compression::GZIP(GzipLevel::default()),
|
422
|
-
Some("lz4") => Compression::LZ4,
|
423
|
-
Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
|
424
|
-
_ => Compression::UNCOMPRESSED,
|
425
|
-
})
|
433
|
+
.set_compression(compression_setting)
|
426
434
|
.build();
|
427
435
|
|
428
436
|
if write_to.is_kind_of(ruby.class_string()) {
|