parquet 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +9 -1
- data/ext/parquet/Cargo.toml +4 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/reader/common.rs +7 -6
- data/ext/parquet/src/reader/mod.rs +204 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
- data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
- data/ext/parquet/src/ruby_reader.rs +11 -24
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +8 -5
- data/ext/parquet/src/types/parquet_value.rs +204 -7
- data/ext/parquet/src/types/record_types.rs +31 -8
- data/ext/parquet/src/types/schema_converter.rs +118 -11
- data/ext/parquet/src/types/schema_node.rs +83 -2
- data/ext/parquet/src/types/timestamp.rs +6 -10
- data/ext/parquet/src/types/type_conversion.rs +84 -11
- data/ext/parquet/src/types/writer_types.rs +40 -11
- data/ext/parquet/src/utils.rs +6 -6
- data/ext/parquet/src/writer/mod.rs +25 -18
- data/ext/parquet/src/writer/write_columns.rs +27 -24
- data/ext/parquet/src/writer/write_rows.rs +17 -16
- data/lib/parquet/schema.rb +77 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +11 -0
- metadata +3 -2
@@ -68,7 +68,7 @@ fn parse_struct_node(
|
|
68
68
|
})?;
|
69
69
|
|
70
70
|
// Check for empty struct immediately
|
71
|
-
if fields_arr.
|
71
|
+
if fields_arr.is_empty() {
|
72
72
|
return Err(MagnusError::new(
|
73
73
|
ruby.exception_arg_error(),
|
74
74
|
format!("Cannot create a struct with zero fields. Struct name: '{}'. Parquet doesn't support empty structs", name)
|
@@ -175,6 +175,83 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
175
175
|
"struct" => parse_struct_node(ruby, &node_hash, name, nullable),
|
176
176
|
"list" => parse_list_node(ruby, &node_hash, name, nullable),
|
177
177
|
"map" => parse_map_node(ruby, &node_hash, name, nullable),
|
178
|
+
"decimal" => {
|
179
|
+
// Check for precision and scale
|
180
|
+
let precision_val = node_hash.get(Symbol::new("precision"));
|
181
|
+
let scale_val = node_hash.get(Symbol::new("scale"));
|
182
|
+
|
183
|
+
// Handle different precision/scale combinations:
|
184
|
+
// 1. When no precision or scale - use max precision (38)
|
185
|
+
// 2. When precision only - use scale 0
|
186
|
+
// 3. When scale only - use max precision (38)
|
187
|
+
let (precision, scale) = match (precision_val, scale_val) {
|
188
|
+
(None, None) => (38, 0), // Maximum accuracy, scale 0
|
189
|
+
(Some(p), None) => {
|
190
|
+
// Precision provided, scale defaults to 0
|
191
|
+
let prec = u8::try_convert(p).map_err(|_| {
|
192
|
+
MagnusError::new(
|
193
|
+
ruby.exception_type_error(),
|
194
|
+
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
195
|
+
)
|
196
|
+
})?;
|
197
|
+
(prec, 0)
|
198
|
+
},
|
199
|
+
(None, Some(s)) => {
|
200
|
+
// Scale provided, precision set to maximum (38)
|
201
|
+
let scl = i8::try_convert(s).map_err(|_| {
|
202
|
+
MagnusError::new(
|
203
|
+
ruby.exception_type_error(),
|
204
|
+
"Invalid scale value for decimal type, expected an integer".to_string(),
|
205
|
+
)
|
206
|
+
})?;
|
207
|
+
(38, scl)
|
208
|
+
},
|
209
|
+
(Some(p), Some(s)) => {
|
210
|
+
// Both provided
|
211
|
+
let prec = u8::try_convert(p).map_err(|_| {
|
212
|
+
MagnusError::new(
|
213
|
+
ruby.exception_type_error(),
|
214
|
+
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
215
|
+
)
|
216
|
+
})?;
|
217
|
+
let scl = i8::try_convert(s).map_err(|_| {
|
218
|
+
MagnusError::new(
|
219
|
+
ruby.exception_type_error(),
|
220
|
+
"Invalid scale value for decimal type, expected an integer".to_string(),
|
221
|
+
)
|
222
|
+
})?;
|
223
|
+
(prec, scl)
|
224
|
+
}
|
225
|
+
};
|
226
|
+
|
227
|
+
// Validate precision is in a valid range
|
228
|
+
if precision < 1 {
|
229
|
+
return Err(MagnusError::new(
|
230
|
+
ruby.exception_arg_error(),
|
231
|
+
format!(
|
232
|
+
"Precision for decimal type must be at least 1, got {}",
|
233
|
+
precision
|
234
|
+
),
|
235
|
+
));
|
236
|
+
}
|
237
|
+
|
238
|
+
if precision > 38 {
|
239
|
+
return Err(MagnusError::new(
|
240
|
+
ruby.exception_arg_error(),
|
241
|
+
format!(
|
242
|
+
"Precision for decimal type cannot exceed 38, got {}",
|
243
|
+
precision
|
244
|
+
),
|
245
|
+
));
|
246
|
+
}
|
247
|
+
|
248
|
+
Ok(SchemaNode::Primitive {
|
249
|
+
name,
|
250
|
+
parquet_type: PrimitiveType::Decimal128(precision, scale),
|
251
|
+
nullable,
|
252
|
+
format,
|
253
|
+
})
|
254
|
+
}
|
178
255
|
// For primitives, provide better error messages when type isn't recognized
|
179
256
|
other => {
|
180
257
|
if let Some(parquet_type) = parse_primitive_type(other) {
|
@@ -188,7 +265,7 @@ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, M
|
|
188
265
|
Err(MagnusError::new(
|
189
266
|
magnus::exception::arg_error(),
|
190
267
|
format!(
|
191
|
-
"Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros",
|
268
|
+
"Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros, decimal",
|
192
269
|
other
|
193
270
|
)
|
194
271
|
))
|
@@ -216,6 +293,7 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
|
|
216
293
|
"date" | "date32" => Some(PrimitiveType::Date32),
|
217
294
|
"timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
|
218
295
|
"timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
|
296
|
+
"decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
|
219
297
|
_ => None,
|
220
298
|
}
|
221
299
|
}
|
@@ -240,6 +318,9 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
|
|
240
318
|
PrimitiveType::UInt64 => ArrowDataType::UInt64,
|
241
319
|
PrimitiveType::Float32 => ArrowDataType::Float32,
|
242
320
|
PrimitiveType::Float64 => ArrowDataType::Float64,
|
321
|
+
PrimitiveType::Decimal128(precision, scale) => {
|
322
|
+
ArrowDataType::Decimal128(*precision, *scale)
|
323
|
+
}
|
243
324
|
PrimitiveType::Boolean => ArrowDataType::Boolean,
|
244
325
|
PrimitiveType::String => ArrowDataType::Utf8,
|
245
326
|
PrimitiveType::Binary => ArrowDataType::Binary,
|
@@ -2,15 +2,11 @@ use super::*;
|
|
2
2
|
|
3
3
|
pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
|
4
4
|
let (ts, tz) = match value {
|
5
|
-
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)
|
6
|
-
ParquetValue::TimestampMillis(ts, tz) =>
|
7
|
-
|
8
|
-
}
|
9
|
-
ParquetValue::TimestampMicros(ts, tz) => {
|
10
|
-
(jiff::Timestamp::from_microsecond(*ts).unwrap(), tz)
|
11
|
-
}
|
5
|
+
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)?, tz),
|
6
|
+
ParquetValue::TimestampMillis(ts, tz) => (jiff::Timestamp::from_millisecond(*ts)?, tz),
|
7
|
+
ParquetValue::TimestampMicros(ts, tz) => (jiff::Timestamp::from_microsecond(*ts)?, tz),
|
12
8
|
ParquetValue::TimestampNanos(ts, tz) => {
|
13
|
-
(jiff::Timestamp::from_nanosecond(*ts as i128)
|
9
|
+
(jiff::Timestamp::from_nanosecond(*ts as i128)?, tz)
|
14
10
|
}
|
15
11
|
_ => {
|
16
12
|
return Err(MagnusError::new(
|
@@ -50,7 +46,7 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
|
|
50
46
|
Ok(ts.to_zoned(tz).timestamp())
|
51
47
|
} else {
|
52
48
|
// Try IANA timezone
|
53
|
-
match ts.in_tz(
|
49
|
+
match ts.in_tz(tz) {
|
54
50
|
Ok(zoned) => Ok(zoned.timestamp()),
|
55
51
|
Err(_) => Ok(ts), // Fall back to UTC if timezone is invalid
|
56
52
|
}
|
@@ -85,7 +81,7 @@ macro_rules! impl_timestamp_conversion {
|
|
85
81
|
#[macro_export]
|
86
82
|
macro_rules! impl_date_conversion {
|
87
83
|
($value:expr, $handle:expr) => {{
|
88
|
-
let ts = jiff::Timestamp::from_second(($value as i64) * 86400)
|
84
|
+
let ts = jiff::Timestamp::from_second(($value as i64) * 86400)?;
|
89
85
|
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
90
86
|
Ok(formatted.into_value_with($handle))
|
91
87
|
}};
|
@@ -2,8 +2,8 @@ use std::str::FromStr;
|
|
2
2
|
use std::sync::Arc;
|
3
3
|
|
4
4
|
use super::*;
|
5
|
-
use arrow_array::builder::MapFieldNames;
|
6
5
|
use arrow_array::builder::*;
|
6
|
+
use arrow_array::builder::MapFieldNames;
|
7
7
|
use arrow_schema::{DataType, Field, Fields, TimeUnit};
|
8
8
|
use jiff::tz::{Offset, TimeZone};
|
9
9
|
use magnus::{RArray, RString, TryConvert};
|
@@ -41,9 +41,9 @@ pub fn convert_to_date32(
|
|
41
41
|
let s = String::try_convert(value)?;
|
42
42
|
// Parse string into Date using jiff
|
43
43
|
let date = if let Some(fmt) = format {
|
44
|
-
jiff::civil::Date::strptime(
|
44
|
+
jiff::civil::Date::strptime(fmt, &s).or_else(|e1| {
|
45
45
|
// Try parsing as DateTime and convert to Date with zero offset
|
46
|
-
jiff::civil::DateTime::strptime(
|
46
|
+
jiff::civil::DateTime::strptime(fmt, &s)
|
47
47
|
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
48
48
|
.map(|dt| dt.date())
|
49
49
|
.map_err(|e2| {
|
@@ -78,7 +78,7 @@ pub fn convert_to_date32(
|
|
78
78
|
.timestamp();
|
79
79
|
|
80
80
|
// Convert to epoch days
|
81
|
-
Ok((x.as_second()
|
81
|
+
Ok((x.as_second() / 86400) as i32)
|
82
82
|
} else if value.is_kind_of(ruby.class_time()) {
|
83
83
|
// Convert Time object to epoch days
|
84
84
|
let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
|
@@ -100,10 +100,10 @@ pub fn convert_to_timestamp_millis(
|
|
100
100
|
let s = String::try_convert(value)?;
|
101
101
|
// Parse string into Timestamp using jiff
|
102
102
|
let timestamp = if let Some(fmt) = format {
|
103
|
-
jiff::Timestamp::strptime(
|
103
|
+
jiff::Timestamp::strptime(fmt, &s)
|
104
104
|
.or_else(|e1| {
|
105
105
|
// Try parsing as DateTime and convert to Timestamp with zero offset
|
106
|
-
jiff::civil::DateTime::strptime(
|
106
|
+
jiff::civil::DateTime::strptime(fmt, &s)
|
107
107
|
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
108
108
|
.map(|dt| dt.timestamp())
|
109
109
|
.map_err(|e2| {
|
@@ -150,9 +150,9 @@ pub fn convert_to_timestamp_micros(
|
|
150
150
|
let s = String::try_convert(value)?;
|
151
151
|
// Parse string into Timestamp using jiff
|
152
152
|
let timestamp = if let Some(fmt) = format {
|
153
|
-
jiff::Timestamp::strptime(
|
153
|
+
jiff::Timestamp::strptime(fmt, &s).or_else(|e1| {
|
154
154
|
// Try parsing as DateTime and convert to Timestamp with zero offset
|
155
|
-
jiff::civil::DateTime::strptime(
|
155
|
+
jiff::civil::DateTime::strptime(fmt, &s).and_then(|dt| {
|
156
156
|
dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
|
157
157
|
})
|
158
158
|
.map(|dt| dt.timestamp())
|
@@ -242,6 +242,7 @@ pub fn parquet_schema_type_to_arrow_data_type(
|
|
242
242
|
PrimitiveType::UInt64 => DataType::UInt64,
|
243
243
|
PrimitiveType::Float32 => DataType::Float32,
|
244
244
|
PrimitiveType::Float64 => DataType::Float64,
|
245
|
+
PrimitiveType::Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
|
245
246
|
PrimitiveType::String => DataType::Utf8,
|
246
247
|
PrimitiveType::Binary => DataType::Binary,
|
247
248
|
PrimitiveType::Boolean => DataType::Boolean,
|
@@ -364,6 +365,20 @@ fn create_arrow_builder_for_type(
|
|
364
365
|
ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
|
365
366
|
Ok(Box::new(Float64Builder::with_capacity(cap)))
|
366
367
|
}
|
368
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)) => {
|
369
|
+
// Create a Decimal128Builder with specific precision and scale
|
370
|
+
let builder = Decimal128Builder::with_capacity(cap);
|
371
|
+
|
372
|
+
// Set precision and scale for the decimal and return the new builder
|
373
|
+
let builder_with_precision = builder.with_precision_and_scale(*precision, *scale).map_err(|e| {
|
374
|
+
MagnusError::new(
|
375
|
+
magnus::exception::runtime_error(),
|
376
|
+
format!("Failed to set precision and scale: {}", e),
|
377
|
+
)
|
378
|
+
})?;
|
379
|
+
|
380
|
+
Ok(Box::new(builder_with_precision))
|
381
|
+
}
|
367
382
|
ParquetSchemaType::Primitive(PrimitiveType::String) => {
|
368
383
|
Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
|
369
384
|
}
|
@@ -415,7 +430,7 @@ fn create_arrow_builder_for_type(
|
|
415
430
|
ParquetSchemaType::Struct(struct_field) => {
|
416
431
|
// Check for empty struct immediately
|
417
432
|
if struct_field.fields.is_empty() {
|
418
|
-
|
433
|
+
Err(MagnusError::new(
|
419
434
|
magnus::exception::runtime_error(),
|
420
435
|
"Cannot build a struct with zero fields - Parquet doesn't support empty structs".to_string(),
|
421
436
|
))?;
|
@@ -445,7 +460,7 @@ fn create_arrow_builder_for_type(
|
|
445
460
|
|
446
461
|
// Make sure we have the right number of builders
|
447
462
|
if child_field_builders.len() != arrow_fields.len() {
|
448
|
-
|
463
|
+
Err(MagnusError::new(
|
449
464
|
magnus::exception::runtime_error(),
|
450
465
|
format!(
|
451
466
|
"Number of field builders ({}) doesn't match number of arrow fields ({})",
|
@@ -834,6 +849,46 @@ fn fill_builder(
|
|
834
849
|
}
|
835
850
|
Ok(())
|
836
851
|
}
|
852
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_precision, scale)) => {
|
853
|
+
let typed_builder = builder
|
854
|
+
.as_any_mut()
|
855
|
+
.downcast_mut::<Decimal128Builder>()
|
856
|
+
.expect("Builder mismatch: expected Float64Builder");
|
857
|
+
|
858
|
+
for val in values {
|
859
|
+
match val {
|
860
|
+
ParquetValue::Decimal128(d) => typed_builder.append_value(*d),
|
861
|
+
ParquetValue::Float64(f) => {
|
862
|
+
// Scale the float to the desired precision and scale
|
863
|
+
let scaled_value = (*f * 10_f64.powi(*scale as i32)) as i128;
|
864
|
+
typed_builder.append_value(scaled_value)
|
865
|
+
}
|
866
|
+
ParquetValue::Float32(flo) => {
|
867
|
+
// Scale the float to the desired precision and scale
|
868
|
+
let scaled_value = (*flo as f64 * 10_f64.powi(*scale as i32)) as i128;
|
869
|
+
typed_builder.append_value(scaled_value)
|
870
|
+
}
|
871
|
+
ParquetValue::Int64(i) => {
|
872
|
+
// Scale the integer to the desired scale
|
873
|
+
let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
|
874
|
+
typed_builder.append_value(scaled_value)
|
875
|
+
}
|
876
|
+
ParquetValue::Int32(i) => {
|
877
|
+
// Scale the integer to the desired scale
|
878
|
+
let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
|
879
|
+
typed_builder.append_value(scaled_value)
|
880
|
+
}
|
881
|
+
ParquetValue::Null => typed_builder.append_null(),
|
882
|
+
other => {
|
883
|
+
return Err(MagnusError::new(
|
884
|
+
magnus::exception::type_error(),
|
885
|
+
format!("Expected Float64, got {:?}", other),
|
886
|
+
))
|
887
|
+
}
|
888
|
+
}
|
889
|
+
}
|
890
|
+
Ok(())
|
891
|
+
}
|
837
892
|
ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
|
838
893
|
let typed_builder = builder
|
839
894
|
.as_any_mut()
|
@@ -954,7 +1009,7 @@ fn fill_builder(
|
|
954
1009
|
.expect("Builder mismatch: expected BinaryBuilder");
|
955
1010
|
for val in values {
|
956
1011
|
match val {
|
957
|
-
ParquetValue::Bytes(b) => typed_builder.append_value(
|
1012
|
+
ParquetValue::Bytes(b) => typed_builder.append_value(b),
|
958
1013
|
ParquetValue::Null => typed_builder.append_null(),
|
959
1014
|
other => {
|
960
1015
|
return Err(MagnusError::new(
|
@@ -1106,6 +1161,15 @@ fn fill_builder(
|
|
1106
1161
|
)
|
1107
1162
|
})?
|
1108
1163
|
.append_value(bytes),
|
1164
|
+
ParquetValue::Decimal128(x) => typed_builder
|
1165
|
+
.field_builder::<Decimal128Builder>(i)
|
1166
|
+
.ok_or_else(|| {
|
1167
|
+
MagnusError::new(
|
1168
|
+
magnus::exception::type_error(),
|
1169
|
+
"Failed to coerce into Decimal128Builder",
|
1170
|
+
)
|
1171
|
+
})?
|
1172
|
+
.append_value(*x),
|
1109
1173
|
ParquetValue::Date32(x) => typed_builder
|
1110
1174
|
.field_builder::<Date32Builder>(i)
|
1111
1175
|
.ok_or_else(|| {
|
@@ -1302,6 +1366,15 @@ fn fill_builder(
|
|
1302
1366
|
)
|
1303
1367
|
})?
|
1304
1368
|
.append_null(),
|
1369
|
+
ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_, _)) => typed_builder
|
1370
|
+
.field_builder::<Decimal128Builder>(i)
|
1371
|
+
.ok_or_else(|| {
|
1372
|
+
MagnusError::new(
|
1373
|
+
magnus::exception::type_error(),
|
1374
|
+
"Failed to coerce into Decimal128Builder",
|
1375
|
+
)
|
1376
|
+
})?
|
1377
|
+
.append_null(),
|
1305
1378
|
ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
|
1306
1379
|
.field_builder::<StringBuilder>(i)
|
1307
1380
|
.ok_or_else(|| {
|
@@ -59,7 +59,7 @@ impl Write for IoLikeValue {
|
|
59
59
|
}
|
60
60
|
}
|
61
61
|
|
62
|
-
impl
|
62
|
+
impl FromStr for ParquetSchemaType<'_> {
|
63
63
|
type Err = MagnusError;
|
64
64
|
|
65
65
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
@@ -98,6 +98,36 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
98
98
|
})));
|
99
99
|
}
|
100
100
|
|
101
|
+
// Check if it's a decimal type with precision and scale
|
102
|
+
if let Some(decimal_params) = s.strip_prefix("decimal(").and_then(|s| s.strip_suffix(")")) {
|
103
|
+
let parts: Vec<&str> = decimal_params.split(',').collect();
|
104
|
+
if parts.len() != 2 {
|
105
|
+
return Err(MagnusError::new(
|
106
|
+
magnus::exception::runtime_error(),
|
107
|
+
format!(
|
108
|
+
"Invalid decimal format. Expected 'decimal(precision,scale)', got '{}'",
|
109
|
+
s
|
110
|
+
),
|
111
|
+
));
|
112
|
+
}
|
113
|
+
|
114
|
+
let precision = parts[0].trim().parse::<u8>().map_err(|_| {
|
115
|
+
MagnusError::new(
|
116
|
+
magnus::exception::runtime_error(),
|
117
|
+
format!("Invalid precision value in decimal type: {}", parts[0]),
|
118
|
+
)
|
119
|
+
})?;
|
120
|
+
|
121
|
+
let scale = parts[1].trim().parse::<i8>().map_err(|_| {
|
122
|
+
MagnusError::new(
|
123
|
+
magnus::exception::runtime_error(),
|
124
|
+
format!("Invalid scale value in decimal type: {}", parts[1]),
|
125
|
+
)
|
126
|
+
})?;
|
127
|
+
|
128
|
+
return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)));
|
129
|
+
}
|
130
|
+
|
101
131
|
// Handle primitive types
|
102
132
|
match s {
|
103
133
|
"int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
|
@@ -116,6 +146,7 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
116
146
|
"date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
|
117
147
|
"timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
|
118
148
|
"timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
|
149
|
+
"decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(18, 2))), // Default precision 18, scale 2
|
119
150
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
120
151
|
item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
|
121
152
|
format: None,
|
@@ -129,7 +160,7 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
129
160
|
}
|
130
161
|
}
|
131
162
|
|
132
|
-
impl
|
163
|
+
impl TryConvert for ParquetSchemaType<'_> {
|
133
164
|
fn try_convert(value: Value) -> Result<Self, MagnusError> {
|
134
165
|
let ruby = unsafe { Ruby::get_unchecked() };
|
135
166
|
let schema_type = parse_string_or_symbol(&ruby, value)?;
|
@@ -144,7 +175,7 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
|
|
144
175
|
|
145
176
|
// We know this type is safe to move between threads because it's just an enum
|
146
177
|
// with simple primitive types and strings
|
147
|
-
unsafe impl
|
178
|
+
unsafe impl Send for ParquetSchemaType<'_> {}
|
148
179
|
|
149
180
|
pub enum WriterOutput {
|
150
181
|
File(ArrowWriter<Box<dyn SendableWrite>>),
|
@@ -202,14 +233,12 @@ impl<'a> ColumnCollector<'a> {
|
|
202
233
|
pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
|
203
234
|
use crate::types::ParquetValue;
|
204
235
|
|
205
|
-
if value.is_nil() {
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
));
|
212
|
-
}
|
236
|
+
if value.is_nil() && !self.nullable {
|
237
|
+
// For non-nullable fields, raise an error
|
238
|
+
return Err(MagnusError::new(
|
239
|
+
magnus::exception::runtime_error(),
|
240
|
+
"Cannot write nil value for non-nullable field",
|
241
|
+
));
|
213
242
|
}
|
214
243
|
|
215
244
|
// For all other types, proceed as normal
|
data/ext/parquet/src/utils.rs
CHANGED
@@ -13,12 +13,12 @@ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String
|
|
13
13
|
RString::from_value(value)
|
14
14
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
15
15
|
.to_string()
|
16
|
-
.map(
|
16
|
+
.map(Some)
|
17
17
|
} else if value.is_kind_of(ruby.class_symbol()) {
|
18
18
|
Symbol::from_value(value)
|
19
19
|
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
20
20
|
.funcall("to_s", ())
|
21
|
-
.map(
|
21
|
+
.map(Some)
|
22
22
|
} else {
|
23
23
|
Err(Error::new(
|
24
24
|
magnus::exception::type_error(),
|
@@ -161,11 +161,11 @@ pub fn parse_parquet_columns_args(
|
|
161
161
|
};
|
162
162
|
|
163
163
|
let batch_size = kwargs.optional.2.flatten();
|
164
|
-
if let Some(
|
165
|
-
if
|
164
|
+
if let Some(batch_size) = batch_size {
|
165
|
+
if batch_size == 0 {
|
166
166
|
return Err(Error::new(
|
167
|
-
|
168
|
-
|
167
|
+
magnus::exception::arg_error(),
|
168
|
+
"Batch size must be greater than 0",
|
169
169
|
));
|
170
170
|
}
|
171
171
|
}
|
@@ -29,7 +29,6 @@ use crate::{
|
|
29
29
|
IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
|
30
30
|
};
|
31
31
|
|
32
|
-
const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
|
33
32
|
const SAMPLE_SIZE: usize = 100;
|
34
33
|
const MIN_BATCH_SIZE: usize = 10;
|
35
34
|
const INITIAL_BATCH_SIZE: usize = 100;
|
@@ -112,13 +111,13 @@ pub fn parse_parquet_write_args(
|
|
112
111
|
if let Some(type_val) = type_val {
|
113
112
|
// If it has a type: :struct, it's the new DSL format
|
114
113
|
// Use parse_string_or_symbol to handle both String and Symbol values
|
115
|
-
let ttype = parse_string_or_symbol(
|
114
|
+
let ttype = parse_string_or_symbol(ruby, type_val)?;
|
116
115
|
if let Some(ref type_str) = ttype {
|
117
116
|
if type_str == "struct" {
|
118
117
|
// Parse using the new schema approach
|
119
|
-
let schema_node = crate::parse_schema_node(
|
118
|
+
let schema_node = crate::parse_schema_node(ruby, schema_value)?;
|
120
119
|
|
121
|
-
validate_schema_node(
|
120
|
+
validate_schema_node(ruby, &schema_node)?;
|
122
121
|
|
123
122
|
return Ok(ParquetWriteArgs {
|
124
123
|
read_from,
|
@@ -144,22 +143,21 @@ pub fn parse_parquet_write_args(
|
|
144
143
|
"Schema fields must be an array",
|
145
144
|
)
|
146
145
|
})?
|
147
|
-
.
|
148
|
-
== 0)
|
146
|
+
.is_empty())
|
149
147
|
{
|
150
148
|
// If schema is nil or an empty array, we need to peek at the first value to determine column count
|
151
149
|
let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
|
152
150
|
// Default to nullable:true for auto-inferred fields
|
153
|
-
crate::infer_schema_from_first_row(
|
151
|
+
crate::infer_schema_from_first_row(ruby, first_value, true)?
|
154
152
|
} else {
|
155
153
|
// Legacy array format - use our centralized parser
|
156
|
-
crate::parse_legacy_schema(
|
154
|
+
crate::parse_legacy_schema(ruby, schema_value)?
|
157
155
|
};
|
158
156
|
|
159
157
|
// Convert the legacy schema fields to SchemaNode (DSL format)
|
160
|
-
let schema_node = crate::legacy_schema_to_dsl(
|
158
|
+
let schema_node = crate::legacy_schema_to_dsl(ruby, schema_fields)?;
|
161
159
|
|
162
|
-
validate_schema_node(
|
160
|
+
validate_schema_node(ruby, &schema_node)?;
|
163
161
|
|
164
162
|
Ok(ParquetWriteArgs {
|
165
163
|
read_from,
|
@@ -196,6 +194,9 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
|
|
196
194
|
}
|
197
195
|
DataType::Float32 => Ok(PST::Primitive(PrimitiveType::Float32)),
|
198
196
|
DataType::Float64 => Ok(PST::Primitive(PrimitiveType::Float64)),
|
197
|
+
DataType::Decimal128(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal128(
|
198
|
+
*precision, *scale,
|
199
|
+
))),
|
199
200
|
DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
|
200
201
|
DataType::Date64 => {
|
201
202
|
// Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
|
@@ -415,15 +416,21 @@ fn create_writer(
|
|
415
416
|
compression: Option<String>,
|
416
417
|
) -> Result<WriterOutput, ParquetGemError> {
|
417
418
|
// Create writer properties with compression based on the option
|
419
|
+
let compression_setting = match compression.map(|s| s.to_lowercase()).as_deref() {
|
420
|
+
Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
|
421
|
+
Some("snappy") => Ok(Compression::SNAPPY),
|
422
|
+
Some("gzip") => Ok(Compression::GZIP(GzipLevel::default())),
|
423
|
+
Some("lz4") => Ok(Compression::LZ4),
|
424
|
+
Some("zstd") => Ok(Compression::ZSTD(ZstdLevel::default())),
|
425
|
+
None => Ok(Compression::UNCOMPRESSED),
|
426
|
+
other => Err(MagnusError::new(
|
427
|
+
magnus::exception::arg_error(),
|
428
|
+
format!("Invalid compression option: {:?}", other),
|
429
|
+
)),
|
430
|
+
}?;
|
431
|
+
|
418
432
|
let props = WriterProperties::builder()
|
419
|
-
.set_compression(
|
420
|
-
Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
|
421
|
-
Some("snappy") => Compression::SNAPPY,
|
422
|
-
Some("gzip") => Compression::GZIP(GzipLevel::default()),
|
423
|
-
Some("lz4") => Compression::LZ4,
|
424
|
-
Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
|
425
|
-
_ => Compression::UNCOMPRESSED,
|
426
|
-
})
|
433
|
+
.set_compression(compression_setting)
|
427
434
|
.build();
|
428
435
|
|
429
436
|
if write_to.is_kind_of(ruby.class_string()) {
|