parquet 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +9 -1
- data/ext/parquet/Cargo.toml +4 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/reader/common.rs +7 -6
- data/ext/parquet/src/reader/mod.rs +204 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
- data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
- data/ext/parquet/src/ruby_reader.rs +3 -5
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +8 -5
- data/ext/parquet/src/types/parquet_value.rs +199 -7
- data/ext/parquet/src/types/record_types.rs +16 -5
- data/ext/parquet/src/types/schema_converter.rs +118 -11
- data/ext/parquet/src/types/schema_node.rs +83 -2
- data/ext/parquet/src/types/timestamp.rs +6 -10
- data/ext/parquet/src/types/type_conversion.rs +84 -11
- data/ext/parquet/src/types/writer_types.rs +40 -11
- data/ext/parquet/src/utils.rs +6 -6
- data/ext/parquet/src/writer/mod.rs +25 -17
- data/ext/parquet/src/writer/write_columns.rs +27 -24
- data/ext/parquet/src/writer/write_rows.rs +14 -15
- data/lib/parquet/schema.rb +77 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +11 -0
- metadata +3 -2

data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -2,8 +2,8 @@ use std::str::FromStr;
 use std::sync::Arc;
 
 use super::*;
-use arrow_array::builder::MapFieldNames;
 use arrow_array::builder::*;
+use arrow_array::builder::MapFieldNames;
 use arrow_schema::{DataType, Field, Fields, TimeUnit};
 use jiff::tz::{Offset, TimeZone};
 use magnus::{RArray, RString, TryConvert};
@@ -41,9 +41,9 @@ pub fn convert_to_date32(
     let s = String::try_convert(value)?;
     // Parse string into Date using jiff
     let date = if let Some(fmt) = format {
-        jiff::civil::Date::strptime(
+        jiff::civil::Date::strptime(fmt, &s).or_else(|e1| {
             // Try parsing as DateTime and convert to Date with zero offset
-            jiff::civil::DateTime::strptime(
+            jiff::civil::DateTime::strptime(fmt, &s)
                 .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
                 .map(|dt| dt.date())
                 .map_err(|e2| {
@@ -78,7 +78,7 @@ pub fn convert_to_date32(
             .timestamp();
 
         // Convert to epoch days
-        Ok((x.as_second()
+        Ok((x.as_second() / 86400) as i32)
     } else if value.is_kind_of(ruby.class_time()) {
        // Convert Time object to epoch days
        let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
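
The repaired line converts epoch seconds to epoch days with integer division. A minimal std-only sketch of that arithmetic (`epoch_days` is a hypothetical helper, not the gem's API); note that Rust's `/` truncates toward zero, so pre-1970 instants land one day later than a flooring division would give:

```rust
// Standalone sketch of the seconds-to-days conversion used above.
const SECONDS_PER_DAY: i64 = 86_400;

fn epoch_days(epoch_seconds: i64) -> i32 {
    // `/` truncates toward zero, matching `(x.as_second() / 86400) as i32`.
    (epoch_seconds / SECONDS_PER_DAY) as i32
}

fn main() {
    assert_eq!(epoch_days(0), 0);       // 1970-01-01
    assert_eq!(epoch_days(86_400), 1);  // 1970-01-02
    // Pre-epoch instants truncate toward zero rather than flooring:
    assert_eq!(epoch_days(-1), 0);
    assert_eq!((-1_i64).div_euclid(SECONDS_PER_DAY), -1); // flooring alternative
}
```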
@@ -100,10 +100,10 @@ pub fn convert_to_timestamp_millis(
     let s = String::try_convert(value)?;
     // Parse string into Timestamp using jiff
     let timestamp = if let Some(fmt) = format {
-        jiff::Timestamp::strptime(
+        jiff::Timestamp::strptime(fmt, &s)
             .or_else(|e1| {
                 // Try parsing as DateTime and convert to Timestamp with zero offset
-                jiff::civil::DateTime::strptime(
+                jiff::civil::DateTime::strptime(fmt, &s)
                     .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
                     .map(|dt| dt.timestamp())
                     .map_err(|e2| {
@@ -150,9 +150,9 @@ pub fn convert_to_timestamp_micros(
     let s = String::try_convert(value)?;
     // Parse string into Timestamp using jiff
     let timestamp = if let Some(fmt) = format {
-        jiff::Timestamp::strptime(
+        jiff::Timestamp::strptime(fmt, &s).or_else(|e1| {
             // Try parsing as DateTime and convert to Timestamp with zero offset
-            jiff::civil::DateTime::strptime(
+            jiff::civil::DateTime::strptime(fmt, &s).and_then(|dt| {
                 dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
             })
             .map(|dt| dt.timestamp())
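
All three conversion functions now share the same fallback shape: try `strptime` against the target type, and on failure re-parse as a civil (zone-less) `DateTime` pinned to a zero offset. A minimal sketch of the pattern, assuming only the jiff calls the diff itself makes (error handling simplified):

```rust
use jiff::civil::DateTime;
use jiff::tz::{Offset, TimeZone};
use jiff::Timestamp;

// Try to parse a full instant directly; fall back to a zone-less date-time
// interpreted as UTC. Mirrors the or_else chains above with simplified errors.
fn parse_with_fallback(fmt: &str, s: &str) -> Result<Timestamp, jiff::Error> {
    Timestamp::strptime(fmt, s).or_else(|_e1| {
        DateTime::strptime(fmt, s)
            .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
            .map(|zdt| zdt.timestamp())
    })
}

fn main() -> Result<(), jiff::Error> {
    // The format carries no UTC offset, so Timestamp::strptime fails and the
    // civil-DateTime fallback succeeds.
    let ts = parse_with_fallback("%Y-%m-%d %H:%M:%S", "2024-01-02 03:04:05")?;
    println!("{ts}");
    Ok(())
}
```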
@@ -242,6 +242,7 @@ pub fn parquet_schema_type_to_arrow_data_type(
         PrimitiveType::UInt64 => DataType::UInt64,
         PrimitiveType::Float32 => DataType::Float32,
         PrimitiveType::Float64 => DataType::Float64,
+        PrimitiveType::Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
         PrimitiveType::String => DataType::Utf8,
         PrimitiveType::Binary => DataType::Binary,
         PrimitiveType::Boolean => DataType::Boolean,
@@ -364,6 +365,20 @@ fn create_arrow_builder_for_type(
         ParquetSchemaType::Primitive(PrimitiveType::Float64) => {
             Ok(Box::new(Float64Builder::with_capacity(cap)))
         }
+        ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)) => {
+            // Create a Decimal128Builder with specific precision and scale
+            let builder = Decimal128Builder::with_capacity(cap);
+
+            // Set precision and scale for the decimal and return the new builder
+            let builder_with_precision = builder.with_precision_and_scale(*precision, *scale).map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Failed to set precision and scale: {}", e),
+                )
+            })?;
+
+            Ok(Box::new(builder_with_precision))
+        }
         ParquetSchemaType::Primitive(PrimitiveType::String) => {
             Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32)))
         }
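
For context, `with_precision_and_scale` is fallible in arrow-rs (it rejects out-of-range precision/scale combinations such as a precision above Decimal128's limit of 38), which is why the new arm maps the error into a `MagnusError`. A minimal standalone sketch of the same builder API (crates `arrow_array`/`arrow_schema` assumed):

```rust
use arrow_array::builder::Decimal128Builder;
use arrow_array::Array;

fn main() -> Result<(), arrow_schema::ArrowError> {
    // Values are appended as unscaled i128 integers; precision and scale are
    // array metadata declared up front on the builder.
    let mut builder =
        Decimal128Builder::with_capacity(4).with_precision_and_scale(18, 2)?;

    builder.append_value(12_345); // represents 123.45 at scale 2
    builder.append_null();

    let array = builder.finish();
    assert_eq!(array.len(), 2);
    assert_eq!(array.value(0), 12_345);
    Ok(())
}
```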
@@ -415,7 +430,7 @@ fn create_arrow_builder_for_type(
         ParquetSchemaType::Struct(struct_field) => {
             // Check for empty struct immediately
             if struct_field.fields.is_empty() {
-
+                Err(MagnusError::new(
                     magnus::exception::runtime_error(),
                     "Cannot build a struct with zero fields - Parquet doesn't support empty structs".to_string(),
                 ))?;
@@ -445,7 +460,7 @@ fn create_arrow_builder_for_type(
 
             // Make sure we have the right number of builders
             if child_field_builders.len() != arrow_fields.len() {
-
+                Err(MagnusError::new(
                     magnus::exception::runtime_error(),
                     format!(
                         "Number of field builders ({}) doesn't match number of arrow fields ({})",
@@ -834,6 +849,46 @@ fn fill_builder(
         }
         Ok(())
     }
+    ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_precision, scale)) => {
+        let typed_builder = builder
+            .as_any_mut()
+            .downcast_mut::<Decimal128Builder>()
+            .expect("Builder mismatch: expected Float64Builder");
+
+        for val in values {
+            match val {
+                ParquetValue::Decimal128(d) => typed_builder.append_value(*d),
+                ParquetValue::Float64(f) => {
+                    // Scale the float to the desired precision and scale
+                    let scaled_value = (*f * 10_f64.powi(*scale as i32)) as i128;
+                    typed_builder.append_value(scaled_value)
+                }
+                ParquetValue::Float32(flo) => {
+                    // Scale the float to the desired precision and scale
+                    let scaled_value = (*flo as f64 * 10_f64.powi(*scale as i32)) as i128;
+                    typed_builder.append_value(scaled_value)
+                }
+                ParquetValue::Int64(i) => {
+                    // Scale the integer to the desired scale
+                    let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
+                    typed_builder.append_value(scaled_value)
+                }
+                ParquetValue::Int32(i) => {
+                    // Scale the integer to the desired scale
+                    let scaled_value = (*i as i128) * 10_i128.pow(*scale as u32);
+                    typed_builder.append_value(scaled_value)
+                }
+                ParquetValue::Null => typed_builder.append_null(),
+                other => {
+                    return Err(MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!("Expected Float64, got {:?}", other),
+                    ))
+                }
+            }
+        }
+        Ok(())
+    }
     ParquetSchemaType::Primitive(PrimitiveType::Boolean) => {
         let typed_builder = builder
             .as_any_mut()
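
The new arm normalizes every accepted input to Arrow's unscaled `i128` representation. Two behaviors worth noting, shown in the std-only sketch below (helper names are hypothetical): the float path truncates toward zero rather than rounding, and `10_i128.pow(*scale as u32)` would misbehave for negative scales, which this arm does not attempt to handle.

```rust
fn float_to_unscaled(f: f64, scale: i8) -> i128 {
    // Same cast as the diff: truncates toward zero, no rounding.
    (f * 10_f64.powi(scale as i32)) as i128
}

fn int_to_unscaled(i: i64, scale: i8) -> i128 {
    // `pow` panics on overflow in debug builds; a negative scale would need a
    // division path instead.
    (i as i128) * 10_i128.pow(scale as u32)
}

fn main() {
    assert_eq!(int_to_unscaled(7, 2), 700); // 7 -> "7.00"
    assert_eq!(float_to_unscaled(1.239, 2), 123); // truncated, not rounded to 124
}
```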
@@ -954,7 +1009,7 @@ fn fill_builder(
             .expect("Builder mismatch: expected BinaryBuilder");
         for val in values {
             match val {
-                ParquetValue::Bytes(b) => typed_builder.append_value(
+                ParquetValue::Bytes(b) => typed_builder.append_value(b),
                 ParquetValue::Null => typed_builder.append_null(),
                 other => {
                     return Err(MagnusError::new(
@@ -1106,6 +1161,15 @@ fn fill_builder(
                     )
                 })?
                 .append_value(bytes),
+            ParquetValue::Decimal128(x) => typed_builder
+                .field_builder::<Decimal128Builder>(i)
+                .ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Failed to coerce into Decimal128Builder",
+                    )
+                })?
+                .append_value(*x),
             ParquetValue::Date32(x) => typed_builder
                 .field_builder::<Date32Builder>(i)
                 .ok_or_else(|| {
@@ -1302,6 +1366,15 @@ fn fill_builder(
                     )
                 })?
                 .append_null(),
+            ParquetSchemaType::Primitive(PrimitiveType::Decimal128(_, _)) => typed_builder
+                .field_builder::<Decimal128Builder>(i)
+                .ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Failed to coerce into Decimal128Builder",
+                    )
+                })?
+                .append_null(),
             ParquetSchemaType::Primitive(PrimitiveType::String) => typed_builder
                 .field_builder::<StringBuilder>(i)
                 .ok_or_else(|| {

data/ext/parquet/src/types/writer_types.rs
CHANGED
@@ -59,7 +59,7 @@ impl Write for IoLikeValue {
     }
 }
 
-impl
+impl FromStr for ParquetSchemaType<'_> {
     type Err = MagnusError;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -98,6 +98,36 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
             })));
         }
 
+        // Check if it's a decimal type with precision and scale
+        if let Some(decimal_params) = s.strip_prefix("decimal(").and_then(|s| s.strip_suffix(")")) {
+            let parts: Vec<&str> = decimal_params.split(',').collect();
+            if parts.len() != 2 {
+                return Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!(
+                        "Invalid decimal format. Expected 'decimal(precision,scale)', got '{}'",
+                        s
+                    ),
+                ));
+            }
+
+            let precision = parts[0].trim().parse::<u8>().map_err(|_| {
+                MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Invalid precision value in decimal type: {}", parts[0]),
+                )
+            })?;
+
+            let scale = parts[1].trim().parse::<i8>().map_err(|_| {
+                MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Invalid scale value in decimal type: {}", parts[1]),
+                )
+            })?;
+
+            return Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)));
+        }
+
         // Handle primitive types
         match s {
             "int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
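
A std-only sketch of the `decimal(precision,scale)` grammar this adds to `FromStr` (error type simplified to `String`; the gem raises `MagnusError`):

```rust
fn parse_decimal(s: &str) -> Result<(u8, i8), String> {
    // Accept only the exact shell "decimal(...)".
    let params = s
        .strip_prefix("decimal(")
        .and_then(|rest| rest.strip_suffix(')'))
        .ok_or_else(|| format!("not a decimal type: '{s}'"))?;

    let parts: Vec<&str> = params.split(',').collect();
    if parts.len() != 2 {
        return Err(format!(
            "Invalid decimal format. Expected 'decimal(precision,scale)', got '{s}'"
        ));
    }

    let precision: u8 = parts[0].trim().parse()
        .map_err(|_| format!("Invalid precision value: {}", parts[0]))?;
    let scale: i8 = parts[1].trim().parse()
        .map_err(|_| format!("Invalid scale value: {}", parts[1]))?;
    Ok((precision, scale))
}

fn main() {
    assert_eq!(parse_decimal("decimal(18,2)"), Ok((18, 2)));
    assert_eq!(parse_decimal("decimal( 9 , 0 )"), Ok((9, 0))); // whitespace trimmed
    assert!(parse_decimal("decimal(38)").is_err());            // missing scale
}
```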
@@ -116,6 +146,7 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
             "date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
             "timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
             "timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
+            "decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(18, 2))), // Default precision 18, scale 2
             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                 item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
                 format: None,
@@ -129,7 +160,7 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
     }
 }
 
-impl
+impl TryConvert for ParquetSchemaType<'_> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -144,7 +175,7 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
 
 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl
+unsafe impl Send for ParquetSchemaType<'_> {}
 
 pub enum WriterOutput {
     File(ArrowWriter<Box<dyn SendableWrite>>),
@@ -202,14 +233,12 @@ impl<'a> ColumnCollector<'a> {
     pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
         use crate::types::ParquetValue;
 
-        if value.is_nil() {
-
-
-
-
-
-            ));
-        }
+        if value.is_nil() && !self.nullable {
+            // For non-nullable fields, raise an error
+            return Err(MagnusError::new(
+                magnus::exception::runtime_error(),
+                "Cannot write nil value for non-nullable field",
+            ));
         }
 
         // For all other types, proceed as normal
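
The rewritten guard rejects nil only when the field is declared non-nullable, where the surviving fragments suggest the old code rejected nil unconditionally. A std-only sketch of the gate (names hypothetical):

```rust
// Nil is only an error when the column is non-nullable.
fn check_nil(is_nil: bool, nullable: bool) -> Result<(), String> {
    if is_nil && !nullable {
        return Err("Cannot write nil value for non-nullable field".to_string());
    }
    Ok(())
}

fn main() {
    assert!(check_nil(true, true).is_ok());   // nullable column accepts nil
    assert!(check_nil(true, false).is_err()); // non-nullable rejects it
    assert!(check_nil(false, false).is_ok());
}
```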

data/ext/parquet/src/utils.rs
CHANGED
@@ -13,12 +13,12 @@ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String
         RString::from_value(value)
             .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
             .to_string()
-            .map(
+            .map(Some)
     } else if value.is_kind_of(ruby.class_symbol()) {
         Symbol::from_value(value)
             .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
             .funcall("to_s", ())
-            .map(
+            .map(Some)
     } else {
         Err(Error::new(
             magnus::exception::type_error(),
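
Both truncated `.map(` calls resolve to `.map(Some)`, lifting each branch's `Result<String, Error>` into the function's `Result<Option<String>, Error>` return type (the `None` case presumably covers nil input elsewhere in the function). The idiom in isolation:

```rust
// `.map(Some)` lifts Result<String, E> into Result<Option<String>, E>.
fn lift(input: Result<String, ()>) -> Result<Option<String>, ()> {
    input.map(Some)
}

fn main() {
    assert_eq!(lift(Ok("id".into())), Ok(Some("id".into())));
    assert_eq!(lift(Err(())), Err(()));
}
```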
@@ -161,11 +161,11 @@ pub fn parse_parquet_columns_args(
     };
 
     let batch_size = kwargs.optional.2.flatten();
-    if let Some(
-    if
+    if let Some(batch_size) = batch_size {
+        if batch_size == 0 {
             return Err(Error::new(
-
-
+                magnus::exception::arg_error(),
+                "Batch size must be greater than 0",
             ));
         }
     }

data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -111,13 +111,13 @@ pub fn parse_parquet_write_args(
     if let Some(type_val) = type_val {
         // If it has a type: :struct, it's the new DSL format
         // Use parse_string_or_symbol to handle both String and Symbol values
-        let ttype = parse_string_or_symbol(
+        let ttype = parse_string_or_symbol(ruby, type_val)?;
         if let Some(ref type_str) = ttype {
             if type_str == "struct" {
                 // Parse using the new schema approach
-                let schema_node = crate::parse_schema_node(
+                let schema_node = crate::parse_schema_node(ruby, schema_value)?;
 
-                validate_schema_node(
+                validate_schema_node(ruby, &schema_node)?;
 
                 return Ok(ParquetWriteArgs {
                     read_from,
@@ -143,22 +143,21 @@ pub fn parse_parquet_write_args(
                 "Schema fields must be an array",
             )
         })?
-        .
-        == 0)
+        .is_empty())
     {
         // If schema is nil or an empty array, we need to peek at the first value to determine column count
         let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
         // Default to nullable:true for auto-inferred fields
-        crate::infer_schema_from_first_row(
+        crate::infer_schema_from_first_row(ruby, first_value, true)?
     } else {
         // Legacy array format - use our centralized parser
-        crate::parse_legacy_schema(
+        crate::parse_legacy_schema(ruby, schema_value)?
     };
 
     // Convert the legacy schema fields to SchemaNode (DSL format)
-    let schema_node = crate::legacy_schema_to_dsl(
+    let schema_node = crate::legacy_schema_to_dsl(ruby, schema_fields)?;
 
-    validate_schema_node(
+    validate_schema_node(ruby, &schema_node)?;
 
     Ok(ParquetWriteArgs {
         read_from,
@@ -195,6 +194,9 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
     }
     DataType::Float32 => Ok(PST::Primitive(PrimitiveType::Float32)),
     DataType::Float64 => Ok(PST::Primitive(PrimitiveType::Float64)),
+    DataType::Decimal128(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal128(
+        *precision, *scale,
+    ))),
     DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
     DataType::Date64 => {
         // Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
@@ -414,15 +416,21 @@ fn create_writer(
     compression: Option<String>,
 ) -> Result<WriterOutput, ParquetGemError> {
     // Create writer properties with compression based on the option
+    let compression_setting = match compression.map(|s| s.to_lowercase()).as_deref() {
+        Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
+        Some("snappy") => Ok(Compression::SNAPPY),
+        Some("gzip") => Ok(Compression::GZIP(GzipLevel::default())),
+        Some("lz4") => Ok(Compression::LZ4),
+        Some("zstd") => Ok(Compression::ZSTD(ZstdLevel::default())),
+        None => Ok(Compression::UNCOMPRESSED),
+        other => Err(MagnusError::new(
+            magnus::exception::arg_error(),
+            format!("Invalid compression option: {:?}", other),
+        )),
+    }?;
+
     let props = WriterProperties::builder()
-        .set_compression(
-        Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
-        Some("snappy") => Compression::SNAPPY,
-        Some("gzip") => Compression::GZIP(GzipLevel::default()),
-        Some("lz4") => Compression::LZ4,
-        Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
-        _ => Compression::UNCOMPRESSED,
-        })
+        .set_compression(compression_setting)
         .build();
 
     if write_to.is_kind_of(ruby.class_string()) {
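
The rewrite hoists the codec choice into a fallible match ahead of the builder chain, so an unrecognized name now raises instead of silently falling back to uncompressed. A std-only sketch of that behavior change (`Codec` is a local stand-in for `parquet::basic::Compression` so the example runs without dependencies):

```rust
#[derive(Debug, PartialEq)]
enum Codec { Uncompressed, Snappy, Gzip, Lz4, Zstd }

fn pick_codec(compression: Option<String>) -> Result<Codec, String> {
    // Normalizing with to_lowercase() makes "SNAPPY" and "snappy" equivalent.
    match compression.map(|s| s.to_lowercase()).as_deref() {
        Some("none") | Some("uncompressed") => Ok(Codec::Uncompressed),
        Some("snappy") => Ok(Codec::Snappy),
        Some("gzip") => Ok(Codec::Gzip),
        Some("lz4") => Ok(Codec::Lz4),
        Some("zstd") => Ok(Codec::Zstd),
        None => Ok(Codec::Uncompressed),
        other => Err(format!("Invalid compression option: {other:?}")),
    }
}

fn main() {
    assert_eq!(pick_codec(Some("ZSTD".into())), Ok(Codec::Zstd));
    assert_eq!(pick_codec(None), Ok(Codec::Uncompressed));
    assert!(pick_codec(Some("brotli".into())).is_err()); // was a silent default before
}
```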

data/ext/parquet/src/writer/write_columns.rs
CHANGED
@@ -11,12 +11,12 @@ use crate::{
 use crate::{types::PrimitiveType, SchemaNode};
 use arrow_array::{Array, RecordBatch};
 use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
-use std::sync::Arc;
+use std::{rc::Rc, sync::Arc};
 
 #[inline]
 pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    write_columns_impl(
+    write_columns_impl(Rc::new(ruby), args).map_err(|e| {
         let z: MagnusError = e.into();
         z
     })?;
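
Swapping the handle from `Arc<Ruby>` to `Rc<Ruby>` fits magnus's threading model: the `Ruby` handle is only valid on the Ruby thread and is not `Send`, so a non-atomic reference count is both sufficient and self-documenting. A std-only sketch with a stand-in type:

```rust
use std::rc::Rc;

struct RubyHandle; // stand-in for magnus::Ruby, which must stay on one thread

fn helper(ruby: Rc<RubyHandle>) -> usize {
    // Cheap, non-atomic refcount; Rc<T> is !Send, matching the handle's rules.
    Rc::strong_count(&ruby)
}

fn main() {
    let ruby = Rc::new(RubyHandle);
    assert_eq!(helper(Rc::clone(&ruby)), 2);
}
```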
@@ -24,7 +24,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
 }
 
 #[inline]
-fn write_columns_impl(ruby:
+fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
     let ParquetWriteArgs {
         read_from,
         write_to,
@@ -94,7 +94,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
     };
 
     if batch_array.len() != schema_len {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             format!(
                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
@@ -118,7 +118,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
         ))?,
     };
     if top_fields.len() != fields.len() {
-
+        Err(MagnusError::new(
             magnus::exception::runtime_error(),
             "Mismatch top-level DSL fields vs Arrow fields",
         ))?;
@@ -140,31 +140,34 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
             parquet_type,
             // Format is handled internally now
             ..
-        } => match parquet_type {
-
-
-
-
-
-
-
-
-
+        } => match *parquet_type {
+            PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
+            PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
+            PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
+            PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
+            PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
+            PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
+            PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
+            PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
+            PrimitiveType::Float32 => {
                 PST::Primitive(PrimitiveType::Float32)
             }
-
+            PrimitiveType::Float64 => {
                 PST::Primitive(PrimitiveType::Float64)
             }
-
-
-
+            PrimitiveType::Decimal128(precision, scale) => {
+                PST::Primitive(PrimitiveType::Decimal128(precision, scale))
+            }
+            PrimitiveType::String => PST::Primitive(PrimitiveType::String),
+            PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
+            PrimitiveType::Boolean => {
                 PST::Primitive(PrimitiveType::Boolean)
             }
-
-
+            PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
+            PrimitiveType::TimestampMillis => {
                 PST::Primitive(PrimitiveType::TimestampMillis)
             }
-
+            PrimitiveType::TimestampMicros => {
                 PST::Primitive(PrimitiveType::TimestampMicros)
             }
         },
@@ -205,12 +208,12 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
                 if e.is_kind_of(ruby.exception_stop_iteration()) {
                     break;
                 }
-
+                Err(e)?;
             }
         }
     } else {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             "read_from must be an Enumerator".to_string(),
         ))?;

data/ext/parquet/src/writer/write_rows.rs
CHANGED
@@ -16,14 +16,14 @@ use magnus::{
     value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
 };
 use rand::Rng;
-use std::sync::Arc;
+use std::{rc::Rc, sync::Arc};
 
 const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
 
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    write_rows_impl(
+    write_rows_impl(Rc::new(ruby), args).map_err(|e| {
         let z: MagnusError = e.into();
         z
     })?;
@@ -31,7 +31,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 }
 
 #[inline]
-fn write_rows_impl(ruby:
+fn write_rows_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
     let ParquetWriteArgs {
         read_from,
         write_to,
@@ -83,8 +83,8 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
         })?;
         let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
         size_samples.push(row_size);
-    } else if rng.random_range(0..=total_rows) < sample_size
-        let idx = rng.random_range(0..sample_size
+    } else if rng.random_range(0..=total_rows) < sample_size {
+        let idx = rng.random_range(0..sample_size);
         let row_array = RArray::from_value(row).ok_or_else(|| {
             MagnusError::new(ruby.exception_type_error(), "Row must be an array")
         })?;
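
The repaired condition completes a reservoir-sampling scheme for row-size estimates: the first `sample_size` rows fill the buffer, and each later row replaces a random slot with probability `sample_size / (total_rows + 1)`. A sketch under the assumption of the rand 0.9 API the diff itself uses (`rand::rng()` / `random_range`):

```rust
use rand::Rng;

fn sample_sizes(row_sizes: impl Iterator<Item = usize>, sample_size: usize) -> Vec<usize> {
    let mut rng = rand::rng();
    let mut reservoir: Vec<usize> = Vec::with_capacity(sample_size);
    for (total_rows, size) in row_sizes.enumerate() {
        if reservoir.len() < sample_size {
            reservoir.push(size);
        } else if rng.random_range(0..=total_rows) < sample_size {
            // Replace a random slot so every row seen so far is equally
            // likely to be represented in the sample.
            let idx = rng.random_range(0..sample_size);
            reservoir[idx] = size;
        }
    }
    reservoir
}

fn main() {
    let sample = sample_sizes((0..1000).map(|i| 64 + i % 7), 10);
    assert_eq!(sample.len(), 10);
}
```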
@@ -115,12 +115,12 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
                     }
                     break;
                 }
-
+                Err(e)?;
             }
         }
     }
     } else {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             "read_from must be an Enumerator".to_string(),
         ))?;
@@ -257,6 +257,7 @@ pub fn estimate_value_size(
         | PST::Primitive(PrimitiveType::UInt64)
         | PST::Primitive(PrimitiveType::Float64) => Ok(8),
         PST::Primitive(PrimitiveType::Boolean) => Ok(1),
+        PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
         PST::Primitive(PrimitiveType::Date32)
         | PST::Primitive(PrimitiveType::TimestampMillis)
        | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
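
The new 16-byte estimate is exact for the value payload, since Arrow stores each Decimal128 as an unscaled `i128`:

```rust
fn main() {
    // Arrow's Decimal128 payload is an unscaled i128: 16 bytes per value.
    assert_eq!(std::mem::size_of::<i128>(), 16);
}
```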
@@ -429,15 +430,13 @@
                 if let Some(field_value) = hash.get(&*field.name) {
                     total_fields_size +=
                         estimate_value_size(field_value, &field.type_)?;
+                } else if field.nullable {
+                    total_fields_size += 0;
                 } else {
-
-
-
-
-                    magnus::exception::runtime_error(),
-                    format!("Missing field: {} in hash {:?}", field.name, hash),
-                    ));
-                }
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        format!("Missing field: {} in hash {:?}", field.name, hash),
+                    ));
                 }
             }
         }