parquet 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +7 -3
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +113 -0
- data/ext/parquet/src/reader/mod.rs +27 -13
- data/ext/parquet/src/reader/parquet_column_reader.rs +38 -78
- data/ext/parquet/src/reader/parquet_row_reader.rs +42 -19
- data/ext/parquet/src/types/core_types.rs +57 -1
- data/ext/parquet/src/types/mod.rs +8 -1
- data/ext/parquet/src/types/parquet_value.rs +211 -35
- data/ext/parquet/src/types/record_types.rs +18 -15
- data/ext/parquet/src/types/schema_converter.rs +349 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +18 -8
- data/ext/parquet/src/types/type_conversion.rs +1106 -511
- data/ext/parquet/src/types/writer_types.rs +78 -107
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +828 -280
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +7 -2
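The hunks reproduced below are from data/ext/parquet/src/writer/mod.rs. They center on a new schema DSL (Parquet::Schema.define, added in data/lib/parquet/schema.rb) that supersedes the flat array-of-hashes schema, plus a new optional logger argument threaded through both write paths. As a rough sketch of what the two schema styles might look like from Ruby — the method names and keyword arguments (write_rows, schema:, write_to:, logger:) and the DSL field syntax are inferred from the kwargs and comments in this diff, not taken from the gem's documentation:

    require "parquet"
    require "logger"

    # Legacy schema: an array of single-key hashes. The writer now converts this
    # form to the DSL representation internally (parse_legacy_schema /
    # legacy_schema_to_dsl in the Rust extension).
    legacy_schema = [
      { "id" => "int64" },
      { "name" => "string" }
    ]

    # New DSL schema: a struct node built with Parquet::Schema.define
    # (data/lib/parquet/schema.rb is new in 0.5.0). Field syntax here is illustrative.
    dsl_schema = Parquet::Schema.define do
      field :id, :int64
      field :name, :string, nullable: true
    end

    rows = [[1, "alice"], [2, "bob"]].each

    # Hypothetical call shape; the keyword names mirror those parsed in writer/mod.rs
    # ("batch_size", "flush_threshold", "compression", "sample_size", "logger").
    Parquet.write_rows(rows, schema: dsl_schema, write_to: "out.parquet",
                       compression: "zstd", logger: Logger.new($stdout))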
@@ -1,16 +1,16 @@
 use std::{
     fs::File,
     io::{self, BufReader, BufWriter},
-    mem,
     sync::Arc,
 };
 
 use arrow_array::{Array, RecordBatch};
-use arrow_schema::{DataType,
+use arrow_schema::{DataType, Schema, TimeUnit};
+use itertools::Itertools;
 use magnus::{
     scan_args::{get_kwargs, scan_args},
     value::ReprValue,
-    Error as MagnusError, RArray, Ruby, TryConvert, Value,
+    Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
 };
 use parquet::{
     arrow::ArrowWriter,
@@ -22,18 +22,210 @@ use tempfile::NamedTempFile;
 
 use crate::{
     convert_ruby_array_to_arrow,
-
-
+    logger::RubyLogger,
+    reader::ReaderError,
+    types::{
+        schema_node::build_arrow_schema, // ADDED - we need to reference the DSL's build_arrow_schema
+        ColumnCollector,
+        ParquetSchemaType,
+        WriterOutput,
+    },
+    utils::parse_string_or_symbol,
+    IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
 };
+use crate::{types::PrimitiveType, SchemaNode}; // ADDED - ensure we import SchemaNode
 
-const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
-const SAMPLE_SIZE: usize = 100;
-const MIN_BATCH_SIZE: usize = 10;
-const INITIAL_BATCH_SIZE: usize = 100;
-
-// Maximum memory usage per batch (64MB by default)
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
+const SAMPLE_SIZE: usize = 100;
+const MIN_BATCH_SIZE: usize = 10;
+const INITIAL_BATCH_SIZE: usize = 100;
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
 
+// -----------------------------------------------------------------------------
+// HELPER to invert arrow DataType back to our ParquetSchemaType
+// Converts Arrow DataType to our internal ParquetSchemaType representation.
+// This is essential for mapping Arrow types back to our schema representation
+// when working with column collections and schema validation.
+// -----------------------------------------------------------------------------
+fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchemaType, MagnusError> {
+    match dt {
+        DataType::Boolean => Ok(PST::Boolean),
+        DataType::Int8 => Ok(PST::Int8),
+        DataType::Int16 => Ok(PST::Int16),
+        DataType::Int32 => Ok(PST::Int32),
+        DataType::Int64 => Ok(PST::Int64),
+        DataType::UInt8 => Ok(PST::UInt8),
+        DataType::UInt16 => Ok(PST::UInt16),
+        DataType::UInt32 => Ok(PST::UInt32),
+        DataType::UInt64 => Ok(PST::UInt64),
+        DataType::Float16 => {
+            // We do not have a direct ParquetSchemaType::Float16, we treat it as Float
+            Ok(PST::Float)
+        }
+        DataType::Float32 => Ok(PST::Float),
+        DataType::Float64 => Ok(PST::Double),
+        DataType::Date32 => Ok(PST::Date32),
+        DataType::Date64 => {
+            // Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
+            // We can store it as PST::Date64 if we want. If we don't have that, consider PST::Date32 or an error.
+            // If your existing code only handles Date32, you can error. But let's do PST::Date32 as fallback:
+            // Or define a new variant if you have one in your code. We'll show a fallback approach:
+            Err(MagnusError::new(
+                magnus::exception::runtime_error(),
+                "Arrow Date64 not directly supported in current ParquetSchemaType (use date32?).",
+            ))
+        }
+        DataType::Timestamp(TimeUnit::Second, _tz) => {
+            // We'll treat this as PST::TimestampMillis, or define PST::TimestampSecond
+            // For simplicity, let's map "second" to PST::TimestampMillis with a note:
+            Ok(PST::TimestampMillis)
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, _tz) => Ok(PST::TimestampMillis),
+        DataType::Timestamp(TimeUnit::Microsecond, _tz) => Ok(PST::TimestampMicros),
+        DataType::Timestamp(TimeUnit::Nanosecond, _tz) => {
+            // If you have a PST::TimestampNanos variant, use it. Otherwise, degrade to micros
+            // for demonstration:
+            Err(MagnusError::new(
+                magnus::exception::runtime_error(),
+                "TimestampNanos not supported, please adjust your schema or code.",
+            ))
+        }
+        DataType::Utf8 => Ok(PST::String),
+        DataType::Binary => Ok(PST::Binary),
+        DataType::LargeUtf8 => {
+            // If not supported, degrade or error. We'll degrade to PST::String
+            Ok(PST::String)
+        }
+        DataType::LargeBinary => Ok(PST::Binary),
+        DataType::List(child_field) => {
+            // Recursively handle the item type
+            let child_type = arrow_data_type_to_parquet_schema_type(child_field.data_type())?;
+            Ok(PST::List(Box::new(crate::types::ListField {
+                item_type: child_type,
+                format: None,
+                nullable: true,
+            })))
+        }
+        DataType::Map(entry_field, _keys_sorted) => {
+            // Arrow's Map -> a struct<key, value> inside
+            let entry_type = entry_field.data_type();
+            if let DataType::Struct(fields) = entry_type {
+                if fields.len() == 2 {
+                    let key_type = arrow_data_type_to_parquet_schema_type(fields[0].data_type())?;
+                    let value_type = arrow_data_type_to_parquet_schema_type(fields[1].data_type())?;
+                    Ok(PST::Map(Box::new(crate::types::MapField {
+                        key_type,
+                        value_type,
+                        key_format: None,
+                        value_format: None,
+                        value_nullable: true,
+                    })))
+                } else {
+                    Err(MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Map field must have exactly 2 child fields (key, value)",
+                    ))
+                }
+            } else {
+                Err(MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Map field is not a struct? Unexpected Arrow schema layout",
+                ))
+            }
+        }
+        DataType::Struct(arrow_fields) => {
+            // We treat this as PST::Struct. We'll recursively handle subfields
+            // but for top-level collecting we only store them as one column
+            // so the user data must pass a Ruby Hash or something for that field.
+            let mut schema_fields = vec![];
+            for f in arrow_fields {
+                let sub_type = arrow_data_type_to_parquet_schema_type(f.data_type())?;
+                schema_fields.push(SchemaField {
+                    name: f.name().clone(),
+                    type_: sub_type,
+                    format: None, // We can't see the 'format' from Arrow
+                    nullable: f.is_nullable(),
+                });
+            }
+            Ok(PST::Struct(Box::new(crate::types::StructField {
+                fields: schema_fields,
+            })))
+        }
+        _ => Err(MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Unsupported or unhandled Arrow DataType: {:?}", dt),
+        )),
+    }
+}
+
+// -----------------------------------------------------------------------------
+// HELPER to build ColumnCollectors for the DSL variant
+// This function converts a SchemaNode (from our DSL) into a collection of ColumnCollectors
+// that can accumulate values for each column in the schema.
+// - arrow_schema: The Arrow schema corresponding to our DSL schema
+// - root_node: The root SchemaNode (expected to be a Struct node) from which to build collectors
+// -----------------------------------------------------------------------------
+fn build_column_collectors_from_dsl<'a>(
+    ruby: &'a Ruby,
+    arrow_schema: &'a Arc<Schema>,
+    root_node: &'a SchemaNode,
+) -> Result<Vec<ColumnCollector<'a>>, MagnusError> {
+    // We expect the top-level schema node to be a Struct so that arrow_schema
+    // lines up with root_node.fields. If the user gave a top-level primitive, it would be 1 field, but
+    // our code calls build_arrow_schema under the assumption "top-level must be Struct."
+    let fields = match root_node {
+        SchemaNode::Struct { fields, .. } => fields,
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Top-level schema for DSL must be a struct",
+            ))
+        }
+    };
+
+    if fields.len() != arrow_schema.fields().len() {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Mismatch between DSL field count ({}) and Arrow fields ({})",
+                fields.len(),
+                arrow_schema.fields().len()
+            ),
+        ));
+    }
+
+    let mut collectors = Vec::with_capacity(fields.len());
+    for (arrow_field, schema_field_node) in arrow_schema.fields().iter().zip(fields) {
+        let name = arrow_field.name().clone();
+        let parquet_type = arrow_data_type_to_parquet_schema_type(arrow_field.data_type())?;
+
+        // Extract the optional format from the schema node
+        let format = extract_format_from_schema_node(schema_field_node);
+
+        // Build the ColumnCollector
+        collectors.push(ColumnCollector::new(
+            name,
+            parquet_type,
+            format,
+            arrow_field.is_nullable(),
+        ));
+    }
+    Ok(collectors)
+}
+
+// Helper to extract the format from a SchemaNode if available
+fn extract_format_from_schema_node(node: &SchemaNode) -> Option<String> {
+    match node {
+        SchemaNode::Primitive {
+            format: f,
+            parquet_type: _,
+            ..
+        } => f.clone(),
+        // For struct, list, map, etc. there's no single "format." We ignore it.
+        _ => None,
+    }
+}
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
@@ -42,12 +234,13 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
 
     let kwargs = get_kwargs::<
         _,
-        (
+        (Value, Value),
         (
             Option<Option<usize>>,
             Option<Option<usize>>,
             Option<Option<String>>,
             Option<Option<usize>>,
+            Option<Option<Value>>,
         ),
         (),
     >(
@@ -58,146 +251,231 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             "flush_threshold",
             "compression",
             "sample_size",
+            "logger",
         ],
     )?;
 
-
-
-
-
-
-                magnus::exception::type_error(),
-                "First value must be an array when schema is not provided",
-            )
-        })?;
-
-        // Generate field names f0, f1, f2, etc.
-        (0..array.len())
-            .map(|i| SchemaField {
-                name: format!("f{}", i),
-                type_: ParquetSchemaType::String,
-                format: None,
-            })
-            .collect()
-    } else {
-        let schema_array = kwargs.required.0.unwrap();
+    // The schema value could be one of:
+    // 1. An array of hashes (legacy format)
+    // 2. A hash with type: :struct (new DSL format)
+    // 3. nil (infer from data)
+    let schema_value = kwargs.required.0;
 
-
+    // Check if it's the new DSL format (a hash with type: :struct)
+    // We need to handle both direct hash objects and objects created via Parquet::Schema.define
 
-
-
-
-
-
-
+    // First, try to convert it to a Hash if it's not already a Hash
+    // This handles the case where schema_value is a Schema object from Parquet::Schema.define
+    let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
+        RHash::from_value(schema_value).ok_or_else(|| {
+            MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
+        })?
+    } else {
+        // Try to convert the object to a hash with to_h
+        match schema_value.respond_to("to_h", false) {
+            Ok(true) => {
+                match schema_value.funcall::<_, _, Value>("to_h", ()) {
+                    Ok(hash_val) => match RHash::from_value(hash_val) {
+                        Some(hash) => hash,
+                        None => {
+                            // Not a hash, continue to normal handling
+                            RHash::new()
+                        }
+                    },
+                    Err(_) => {
+                        // couldn't call to_h, continue to normal handling
+                        RHash::new()
+                    }
+                }
             }
-
-
-
-                return Err(MagnusError::new(
-                    magnus::exception::type_error(),
-                    format!("schema[{}] must contain exactly one key-value pair", idx),
-                ));
+            _ => {
+                // Doesn't respond to to_h, continue to normal handling
+                RHash::new()
             }
+        }
+    };
 
-
-
+    // Now check if it's a schema hash with a type: :struct field
+    let type_val = schema_hash.get(Symbol::new("type"));
 
-
-
-
-
+    if let Some(type_val) = type_val {
+        // If it has a type: :struct, it's the new DSL format
+        // Use parse_string_or_symbol to handle both String and Symbol values
+        let ttype = parse_string_or_symbol(&ruby, type_val)?;
+        if let Some(ref type_str) = ttype {
+            if type_str == "struct" {
+                // Parse using the new schema approach
+                let schema_node = crate::parse_schema_node(&ruby, schema_value)?;
 
-
-            let key = String::try_convert(key)?;
-            match key.as_str() {
-                "type" => type_str = Some(value),
-                "format" => format_str = Some(String::try_convert(value)?),
-                _ => {
-                    return Err(MagnusError::new(
-                        magnus::exception::type_error(),
-                        format!("Unknown key '{}' in type definition", key),
-                    ))
-                }
-            }
-        }
+                validate_schema_node(&ruby, &schema_node)?;
 
-
+                return Ok(ParquetWriteArgs {
+                    read_from,
+                    write_to: kwargs.required.1,
+                    schema: schema_node,
+                    batch_size: kwargs.optional.0.flatten(),
+                    flush_threshold: kwargs.optional.1.flatten(),
+                    compression: kwargs.optional.2.flatten(),
+                    sample_size: kwargs.optional.3.flatten(),
+                    logger: kwargs.optional.4.flatten(),
+                });
+            }
+        }
+    }
+
+    // If it's not a hash with type: :struct, handle as legacy format
+    let schema_fields = if schema_value.is_nil()
+        || (schema_value.is_kind_of(ruby.class_array())
+            && RArray::from_value(schema_value)
+                .ok_or_else(|| {
                     MagnusError::new(
                         magnus::exception::type_error(),
-                        "
+                        "Schema fields must be an array",
                     )
-                })
+                })?
+                .len()
+                == 0)
+    {
+        // If schema is nil or an empty array, we need to peek at the first value to determine column count
+        let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
+        // Default to nullable:true for auto-inferred fields
+        crate::infer_schema_from_first_row(&ruby, first_value, true)?
+    } else {
+        // Legacy array format - use our centralized parser
+        crate::parse_legacy_schema(&ruby, schema_value)?
+    };
 
-
-
-            (ParquetSchemaType::try_convert(type_value.clone())?, None)
-        };
-
-        schema.push(SchemaField {
-            name,
-            type_,
-            format,
-        });
-    }
+    // Convert the legacy schema fields to SchemaNode (DSL format)
+    let schema_node = crate::legacy_schema_to_dsl(&ruby, schema_fields)?;
 
-
-    };
+    validate_schema_node(&ruby, &schema_node)?;
 
     Ok(ParquetWriteArgs {
         read_from,
         write_to: kwargs.required.1,
-        schema,
+        schema: schema_node,
         batch_size: kwargs.optional.0.flatten(),
         flush_threshold: kwargs.optional.1.flatten(),
         compression: kwargs.optional.2.flatten(),
         sample_size: kwargs.optional.3.flatten(),
+        logger: kwargs.optional.4.flatten(),
     })
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+// Validates a SchemaNode to ensure it meets Parquet schema requirements
+// Currently checks for duplicate field names at the root level, which would
+// cause problems when writing Parquet files. Additional validation rules
+// could be added here in the future.
+//
+// This validation is important because schema errors are difficult to debug
+// once they reach the Parquet/Arrow layer, so we check proactively before
+// any data processing begins.
+fn validate_schema_node(ruby: &Ruby, schema_node: &SchemaNode) -> Result<(), MagnusError> {
+    if let SchemaNode::Struct { fields, .. } = &schema_node {
+        // if any root level schema fields have the same name, we raise an error
+        let field_names = fields
+            .iter()
+            .map(|f| match f {
+                SchemaNode::Struct { name, .. } => name.as_str(),
+                SchemaNode::List { name, .. } => name.as_str(),
+                SchemaNode::Map { name, .. } => name.as_str(),
+                SchemaNode::Primitive { name, .. } => name.as_str(),
+            })
+            .collect::<Vec<_>>();
+        let unique_field_names = field_names.iter().unique().collect::<Vec<_>>();
+        if field_names.len() != unique_field_names.len() {
+            return Err(MagnusError::new(
+                ruby.exception_arg_error(),
+                format!(
+                    "Duplicate field names in root level schema: {:?}",
+                    field_names
+                ),
+            ));
+        }
+    }
+    Ok(())
+}
+
+// Processes a single data row and adds values to the corresponding column collectors
+// This function is called for each row of input data when writing in row-wise mode.
+// It performs important validation to ensure the row structure matches the schema:
+// - Verifies that the number of columns in the row matches the schema
+// - Distributes each value to the appropriate ColumnCollector
+//
+// Each ColumnCollector handles type conversion and accumulation for its specific column,
+// allowing this function to focus on row-level validation and distribution.
+fn process_row(
+    ruby: &Ruby,
+    row: Value,
+    column_collectors: &mut [ColumnCollector],
+) -> Result<(), MagnusError> {
+    let row_array = RArray::from_value(row)
+        .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
+
+    // Validate row length matches schema
+    if row_array.len() != column_collectors.len() {
+        return Err(MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!(
+                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                row_array.len(),
+                column_collectors.len(),
+                column_collectors
+                    .iter()
+                    .map(|c| c.name.as_str())
+                    .collect::<Vec<_>>()
+            ),
+        ));
+    }
+
+    // Process each value in the row
+    for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+        collector.push_value(value)?;
     }
-
+
+    Ok(())
+}
+
+// Dynamically calculates an optimal batch size based on estimated row sizes
+// and memory constraints. This function enables the writer to adapt to different
+// data characteristics for optimal performance.
+//
+// The algorithm:
+// 1. Requires a minimum number of samples to make a reliable estimate
+// 2. Calculates the average row size from the samples
+// 3. Determines a batch size that would consume approximately the target memory threshold
+// 4. Ensures the batch size doesn't go below a minimum value for efficiency
+//
+// This approach balances memory usage with processing efficiency by targeting
+// a specific memory footprint per batch.
+fn update_batch_size(
+    size_samples: &[usize],
+    flush_threshold: usize,
+    min_batch_size: usize,
+) -> usize {
+    if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
+        return min_batch_size;
+    }
+
+    let total_size = size_samples.iter().sum::<usize>();
+    // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+    let avg_row_size = total_size as f64 / size_samples.len() as f64;
+    let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+    let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
+    suggested_batch_size.max(min_batch_size)
 }
 
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+    write_rows_impl(args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })?;
+    Ok(())
+}
+
+#[inline]
+fn write_rows_impl(args: &[Value]) -> Result<(), ReaderError> {
     let ruby = unsafe { Ruby::get_unchecked() };
 
     let ParquetWriteArgs {
@@ -208,59 +486,27 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         compression,
         flush_threshold,
         sample_size: user_sample_size,
+        logger,
     } = parse_parquet_write_args(args)?;
 
+    let logger = RubyLogger::new(&ruby, logger)?;
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
 
-    //
-    let
-
-
-
-
-
-                ParquetSchemaType::Int8 => DataType::Int8,
-                ParquetSchemaType::Int16 => DataType::Int16,
-                ParquetSchemaType::Int32 => DataType::Int32,
-                ParquetSchemaType::Int64 => DataType::Int64,
-                ParquetSchemaType::UInt8 => DataType::UInt8,
-                ParquetSchemaType::UInt16 => DataType::UInt16,
-                ParquetSchemaType::UInt32 => DataType::UInt32,
-                ParquetSchemaType::UInt64 => DataType::UInt64,
-                ParquetSchemaType::Float => DataType::Float32,
-                ParquetSchemaType::Double => DataType::Float64,
-                ParquetSchemaType::String => DataType::Utf8,
-                ParquetSchemaType::Binary => DataType::Binary,
-                ParquetSchemaType::Boolean => DataType::Boolean,
-                ParquetSchemaType::Date32 => DataType::Date32,
-                ParquetSchemaType::TimestampMillis => {
-                    DataType::Timestamp(TimeUnit::Millisecond, None)
-                }
-                ParquetSchemaType::TimestampMicros => {
-                    DataType::Timestamp(TimeUnit::Microsecond, None)
-                }
-                ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
-                ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
-            },
-            true,
-        )
-    })
-    .collect();
-    let arrow_schema = Arc::new(Schema::new(arrow_fields));
+    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to build Arrow schema from DSL schema: {}", e),
+        )
+    })?;
 
     // Create the writer
     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
-        //
-        let mut column_collectors
-
-            .map(|field| {
-                // Clone the type to avoid moving from a reference
-                let type_clone = field.type_.clone();
-                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
-            })
-            .collect();
+        // Build column collectors - we only have DSL schema now
+        let mut column_collectors =
+            build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;
 
         let mut rows_in_batch = 0;
         let mut total_rows = 0;
@@ -272,48 +518,33 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         loop {
             match read_from.funcall::<_, _, Value>("next", ()) {
                 Ok(row) => {
-
-
-                    })?;
+                    // Process the row
+                    process_row(&ruby, row, &mut column_collectors)?;
 
-                    //
-                    if row_array.len() != column_collectors.len() {
-                        return Err(MagnusError::new(
-                            magnus::exception::type_error(),
-                            format!(
-                                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                                row_array.len(),
-                                column_collectors.len(),
-                                column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
-                            ),
-                        ));
-                    }
-
-                    // Sample row sizes using reservoir sampling
+                    // Update row sampling for dynamic batch sizing
                     if size_samples.len() < sample_size {
-
-
-
-
-
-
-
-
-
+                        // estimate row size
+                        let row_array = RArray::from_value(row).ok_or_else(|| {
+                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                        })?;
+                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                        size_samples.push(row_size);
+                    } else if rng.random_range(0..=total_rows) < sample_size as usize {
+                        let idx = rng.random_range(0..sample_size as usize);
+                        let row_array = RArray::from_value(row).ok_or_else(|| {
+                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                        })?;
+                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                        size_samples[idx] = row_size;
                     }
 
                     rows_in_batch += 1;
                     total_rows += 1;
 
                     // Calculate batch size progressively once we have minimum samples
-                    if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE
-
-
-                        let avg_row_size = total_size as f64 / size_samples.len() as f64;
-                        let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
-                        let suggested_batch_size =
-                            (flush_threshold as f64 / avg_row_size).floor() as usize;
-                        current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
+                    if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
+                        current_batch_size =
+                            update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
                     }
 
                     // When we reach batch size, write the batch
@@ -330,19 +561,19 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                         }
                         break;
                     }
-                    return Err(e)
+                    return Err(e)?;
                 }
             }
         }
     } else {
         return Err(MagnusError::new(
             magnus::exception::type_error(),
-            "read_from must be an Enumerator",
-        ))
+            "read_from must be an Enumerator".to_string(),
+        ))?;
     }
 
     // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close()
+    if let Some(temp_file) = writer.close()? {
         // If we got a temp file back, we need to copy its contents to the IO-like object
         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
     }
@@ -352,6 +583,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
 #[inline]
 pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+    write_columns_impl(args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })?;
+    Ok(())
+}
+
+#[inline]
+fn write_columns_impl(args: &[Value]) -> Result<(), ReaderError> {
     let ruby = unsafe { Ruby::get_unchecked() };
 
     let ParquetWriteArgs {
@@ -362,45 +602,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         compression,
         flush_threshold,
         sample_size: _,
+        logger,
     } = parse_parquet_write_args(args)?;
 
+    let logger = RubyLogger::new(&ruby, logger)?;
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
 
-    //
-    let
-
-
-
-
-
-                ParquetSchemaType::Int8 => DataType::Int8,
-                ParquetSchemaType::Int16 => DataType::Int16,
-                ParquetSchemaType::Int32 => DataType::Int32,
-                ParquetSchemaType::Int64 => DataType::Int64,
-                ParquetSchemaType::UInt8 => DataType::UInt8,
-                ParquetSchemaType::UInt16 => DataType::UInt16,
-                ParquetSchemaType::UInt32 => DataType::UInt32,
-                ParquetSchemaType::UInt64 => DataType::UInt64,
-                ParquetSchemaType::Float => DataType::Float32,
-                ParquetSchemaType::Double => DataType::Float64,
-                ParquetSchemaType::String => DataType::Utf8,
-                ParquetSchemaType::Binary => DataType::Binary,
-                ParquetSchemaType::Boolean => DataType::Boolean,
-                ParquetSchemaType::Date32 => DataType::Date32,
-                ParquetSchemaType::TimestampMillis => {
-                    DataType::Timestamp(TimeUnit::Millisecond, None)
-                }
-                ParquetSchemaType::TimestampMicros => {
-                    DataType::Timestamp(TimeUnit::Microsecond, None)
-                }
-                ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
-                ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
-            },
-            true,
-        )
-    })
-    .collect();
-    let arrow_schema = Arc::new(Schema::new(arrow_fields));
+    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to build Arrow schema from DSL schema: {}", e),
+        )
+    })?;
 
     // Create the writer
     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
@@ -422,36 +636,111 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     })?;
 
                     // Validate batch length matches schema
-
+                    // Get schema length and field names - we only have DSL schema now
+                    let (schema_len, field_names): (usize, Vec<&str>) = {
+                        let fields = match &schema {
+                            SchemaNode::Struct { fields, .. } => fields,
+                            _ => {
+                                return Err(MagnusError::new(
+                                    magnus::exception::type_error(),
+                                    "Root schema node must be a struct type",
+                                ))?
+                            }
+                        };
+                        (
+                            fields.len(),
+                            fields
+                                .iter()
+                                .map(|f| match f {
+                                    SchemaNode::Primitive { name, .. } => name.as_str(),
+                                    SchemaNode::List { name, .. } => name.as_str(),
+                                    SchemaNode::Map { name, .. } => name.as_str(),
+                                    SchemaNode::Struct { name, .. } => name.as_str(),
+                                })
+                                .to_owned()
+                                .collect(),
+                        )
+                    };
+
+                    if batch_array.len() != schema_len {
                         return Err(MagnusError::new(
                             magnus::exception::type_error(),
                             format!(
                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
                                 batch_array.len(),
-
-
+                                schema_len,
+                                field_names
                             ),
-                        ))
+                        ))?;
                     }
 
                     // Convert each column in the batch to Arrow arrays
-                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> =
-
-                        .
-
-
+                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
+                        // Process each field in the DSL schema
+                        let fields = arrow_schema.fields();
+                        let top_fields =
+                            match &schema {
+                                SchemaNode::Struct { fields, .. } => fields,
+                                _ => return Err(MagnusError::new(
+                                    magnus::exception::runtime_error(),
+                                    "Top-level DSL schema must be a struct for columns approach",
+                                ))?,
+                            };
+                        if top_fields.len() != fields.len() {
+                            return Err(MagnusError::new(
+                                magnus::exception::runtime_error(),
+                                "Mismatch top-level DSL fields vs Arrow fields",
+                            ))?;
+                        }
+
+                        let mut out = vec![];
+                        for ((arrow_f, dsl_f), col_val) in
+                            fields.iter().zip(top_fields.iter()).zip(batch_array)
+                        {
+                            let col_arr = RArray::from_value(col_val).ok_or_else(|| {
                                 MagnusError::new(
                                     magnus::exception::type_error(),
-                                    format!("Column '{}' must be an array",
+                                    format!("Column '{}' must be an array", arrow_f.name()),
                                 )
                             })?;
-
-
-
-
-
-
-
+                            // Get appropriate parquet_type
+                            let ptype = match dsl_f {
+                                SchemaNode::Primitive {
+                                    parquet_type,
+                                    // Format is handled internally now
+                                    ..
+                                } => match parquet_type {
+                                    &PrimitiveType::Int8 => PST::Int8,
+                                    &PrimitiveType::Int16 => PST::Int16,
+                                    &PrimitiveType::Int32 => PST::Int32,
+                                    &PrimitiveType::Int64 => PST::Int64,
+                                    &PrimitiveType::UInt8 => PST::UInt8,
+                                    &PrimitiveType::UInt16 => PST::UInt16,
+                                    &PrimitiveType::UInt32 => PST::UInt32,
+                                    &PrimitiveType::UInt64 => PST::UInt64,
+                                    &PrimitiveType::Float32 => PST::Float,
+                                    &PrimitiveType::Float64 => PST::Double,
+                                    &PrimitiveType::String => PST::String,
+                                    &PrimitiveType::Binary => PST::Binary,
+                                    &PrimitiveType::Boolean => PST::Boolean,
+                                    &PrimitiveType::Date32 => PST::Date32,
+                                    &PrimitiveType::TimestampMillis => PST::TimestampMillis,
+                                    &PrimitiveType::TimestampMicros => PST::TimestampMicros,
+                                },
+                                SchemaNode::List { .. }
+                                | SchemaNode::Map { .. }
+                                | SchemaNode::Struct { .. } => {
+                                    // For nested, we just do a single "column" as well
+                                    arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
+                                }
+                            };
+                            out.push((
+                                arrow_f.name().clone(),
+                                convert_ruby_array_to_arrow(col_arr, &ptype)?,
+                            ));
+                        }
+                        out
+                    };
 
                     // Create and write record batch
                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
@@ -461,14 +750,12 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                         )
                     })?;
 
-                    writer
-                        .write(&record_batch)
-                        .map_err(|e| ParquetErrorWrapper(e))?;
+                    writer.write(&record_batch)?;
 
                     match &mut writer {
                         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
                             if w.in_progress_size() >= flush_threshold {
-                                w.flush()
+                                w.flush()?;
                             }
                         }
                     }
@@ -477,19 +764,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
                         break;
                     }
-                    return Err(e)
+                    return Err(e)?;
                 }
             }
         }
     } else {
         return Err(MagnusError::new(
             magnus::exception::type_error(),
-            "read_from must be an Enumerator",
-        ))
+            "read_from must be an Enumerator".to_string(),
+        ))?;
    }
 
     // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close()
+    if let Some(temp_file) = writer.close()? {
         // If we got a temp file back, we need to copy its contents to the IO-like object
         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
     }
@@ -497,12 +784,23 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     Ok(())
 }
 
+// Creates an appropriate Parquet writer based on the output target and compression settings
+// This function handles two main output scenarios:
+// 1. Writing directly to a file path (string)
+// 2. Writing to a Ruby IO-like object (using a temporary file as an intermediate buffer)
+//
+// For IO-like objects, the function creates a temporary file that is later copied to the
+// IO object when writing is complete. This approach is necessary because Parquet requires
+// random file access to write its footer after the data.
+//
+// The function also configures compression based on the user's preferences, with
+// several options available (none, snappy, gzip, lz4, zstd).
 fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
     compression: Option<String>,
-) -> Result<WriterOutput,
+) -> Result<WriterOutput, ReaderError> {
     // Create writer properties with compression based on the option
     let props = WriterProperties::builder()
         .set_compression(match compression.as_deref() {
@@ -517,9 +815,8 @@ fn create_writer(
 
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
-        let file: Box<dyn SendableWrite> = Box::new(File::create(path)
-        let writer =
-            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
+        let file: Box<dyn SendableWrite> = Box::new(File::create(path)?);
+        let writer = ArrowWriter::try_new(file, schema, Some(props))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -535,13 +832,22 @@ fn create_writer(
                 format!("Failed to reopen temporary file: {}", e),
             )
         })?);
-        let writer =
-            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
+        let writer = ArrowWriter::try_new(file, schema, Some(props))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
 
-//
+// Copies the contents of a temporary file to a Ruby IO-like object
+// This function is necessary because Parquet writing requires random file access
+// (especially for writing the footer after all data), but Ruby IO objects may not
+// support seeking. The solution is to:
+//
+// 1. Write the entire Parquet file to a temporary file first
+// 2. Once writing is complete, copy the entire contents to the Ruby IO object
+//
+// This approach enables support for a wide range of Ruby IO objects like StringIO,
+// network streams, etc., but does require enough disk space for the temporary file
+// and involves a second full-file read/write operation at the end.
 fn copy_temp_file_to_io_like(
     temp_file: NamedTempFile,
     io_like: IoLikeValue,
@@ -565,36 +871,278 @@ fn copy_temp_file_to_io_like(
     Ok(())
 }
 
+// Estimates the memory size of a single row by examining each value
+// This is used for dynamic batch sizing to optimize memory usage during writes
+// by adapting batch sizes based on the actual data being processed.
+pub fn estimate_single_row_size(
+    row_array: &RArray,
+    collectors: &[ColumnCollector],
+) -> Result<usize, MagnusError> {
+    let mut size = 0;
+    for (idx, val) in row_array.into_iter().enumerate() {
+        let col_type = &collectors[idx].type_;
+        // Calculate size based on the type-specific estimation
+        size += estimate_value_size(val, col_type)?;
+    }
+    Ok(size)
+}
+
+// Estimates the memory footprint of a single value based on its schema type
+// This provides type-specific size estimates that help with dynamic batch sizing
+// For complex types like lists, maps, and structs, we use reasonable approximations
+pub fn estimate_value_size(
+    value: Value,
+    schema_type: &ParquetSchemaType,
+) -> Result<usize, MagnusError> {
+    use ParquetSchemaType as PST;
+    if value.is_nil() {
+        return Ok(0); // nil => minimal
+    }
+    match schema_type {
+        PST::Int8 | PST::UInt8 => Ok(1),
+        PST::Int16 | PST::UInt16 => Ok(2),
+        PST::Int32 | PST::UInt32 | PST::Float => Ok(4),
+        PST::Int64 | PST::UInt64 | PST::Double => Ok(8),
+        PST::Boolean => Ok(1),
+        PST::Date32 | PST::TimestampMillis | PST::TimestampMicros => Ok(8),
+        PST::String | PST::Binary => {
+            if let Ok(s) = String::try_convert(value) {
+                // Account for string length plus Rust String's capacity+pointer overhead
+                Ok(s.len() + std::mem::size_of::<usize>() * 3)
+            } else {
+                // Try to convert the value to a string using to_s for non-string types
+                // This handles numeric values that will be converted to strings later
+                let _ruby = unsafe { Ruby::get_unchecked() };
+                match value.funcall::<_, _, Value>("to_s", ()) {
+                    Ok(str_val) => {
+                        if let Ok(s) = String::try_convert(str_val) {
+                            Ok(s.len() + std::mem::size_of::<usize>() * 3)
+                        } else {
+                            // If to_s conversion fails, just use a reasonable default
+                            Ok(8) // Reasonable size estimate for small values
+                        }
+                    }
+                    Err(_) => {
+                        // If to_s method fails, use a default size
+                        Ok(8) // Reasonable size estimate for small values
+                    }
+                }
+            }
+        }
+        PST::List(item_type) => {
+            if let Ok(arr) = RArray::try_convert(value) {
+                let len = arr.len();
+
+                // Base overhead for the array structure (pointer, length, capacity)
+                let base_size = std::mem::size_of::<usize>() * 3;
+
+                // If empty, just return the base size
+                if len == 0 {
+                    return Ok(base_size);
+                }
+
+                // Sample up to 5 elements to get average element size
+                let sample_count = std::cmp::min(len, 5);
+                let mut total_sample_size = 0;
+
+                for i in 0..sample_count {
+                    let element = arr.entry(i as isize)?;
+                    let element_size = estimate_value_size(element, &item_type.item_type)?;
+                    total_sample_size += element_size;
+                }
+
+                // If we couldn't sample any elements properly, that's an error
+                if sample_count > 0 && total_sample_size == 0 {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to estimate size of list elements",
+                    ));
+                }
+
+                // Calculate average element size from samples
+                let avg_element_size = if sample_count > 0 {
+                    total_sample_size as f64 / sample_count as f64
+                } else {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample list elements for size estimation",
+                    ));
+                };
+
+                // Estimate total size based on average element size * length + base overhead
+                Ok(base_size + (avg_element_size as usize * len))
+            } else {
+                // Instead of assuming it's a small list, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected array for List type but got: {:?}", value),
+                ))
+            }
+        }
+        PST::Map(map_field) => {
+            if let Ok(hash) = RHash::try_convert(value) {
+                let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
+
+                // Base overhead for the hash structure
+                let base_size = std::mem::size_of::<usize>() * 4;
+
+                // If empty, just return the base size
+                if size_estimate == 0 {
+                    return Ok(base_size);
+                }
+
+                // Sample up to 5 key-value pairs to estimate average sizes
+                let mut key_sample_size = 0;
+                let mut value_sample_size = 0;
+                let mut sample_count = 0;
+
+                // Get an enumerator for the hash
+                let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
+
+                // Sample up to 5 entries
+                for _ in 0..std::cmp::min(size_estimate, 5) {
+                    match enumerator.funcall::<_, _, Value>("next", ()) {
+                        Ok(pair) => {
+                            if let Ok(pair_array) = RArray::try_convert(pair) {
+                                if pair_array.len() == 2 {
+                                    let key = pair_array.entry(0)?;
+                                    let val = pair_array.entry(1)?;
+
+                                    key_sample_size +=
+                                        estimate_value_size(key, &map_field.key_type)?;
+                                    value_sample_size +=
+                                        estimate_value_size(val, &map_field.value_type)?;
+                                    sample_count += 1;
+                                }
+                            }
+                        }
+                        Err(_) => break, // Stop if we reach the end
+                    }
+                }
+
+                // If we couldn't sample any pairs, return an error
+                if size_estimate > 0 && sample_count == 0 {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample map entries for size estimation",
+                    ));
+                }
+
+                // Calculate average key and value sizes
+                let (avg_key_size, avg_value_size) = if sample_count > 0 {
+                    (
+                        key_sample_size as f64 / sample_count as f64,
+                        value_sample_size as f64 / sample_count as f64,
+                    )
+                } else {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample hash key-value pairs for size estimation",
+                    ));
+                };
+
+                // Each entry has overhead (node pointers, etc.) in a hash map
+                let entry_overhead = std::mem::size_of::<usize>() * 2;
+
+                // Estimate total size:
+                // base size + (key_size + value_size + entry_overhead) * count
+                Ok(base_size
+                    + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
+                        * size_estimate))
+            } else {
+                // Instead of assuming a small map, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected hash for Map type but got: {:?}", value),
+                ))
+            }
+        }
+        PST::Struct(struct_field) => {
+            if let Ok(hash) = RHash::try_convert(value) {
+                // Base overhead for the struct
+                let base_size = std::mem::size_of::<usize>() * 3;
+
+                // Estimate size for each field
+                let mut total_fields_size = 0;
+
+                for field in &struct_field.fields {
+                    // Try to get the field value from the hash
+                    match hash.get(Symbol::new(&field.name)) {
+                        Some(field_value) => {
+                            total_fields_size += estimate_value_size(field_value, &field.type_)?;
+                        }
+                        None => {
+                            if let Some(field_value) = hash.get(&*field.name) {
+                                total_fields_size +=
+                                    estimate_value_size(field_value, &field.type_)?;
+                            } else {
+                                if field.nullable {
+                                    total_fields_size += 0;
+                                } else {
+                                    return Err(MagnusError::new(
+                                        magnus::exception::runtime_error(),
+                                        format!("Missing field: {} in hash {:?}", field.name, hash),
+                                    ));
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // We no longer error on missing fields during size estimation
+                Ok(base_size + total_fields_size)
+            } else {
+                // Instead of trying instance_variables or assuming a default, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected hash for Struct type but got: {:?}", value),
+                ))
+            }
+        }
+    }
+}
+
+// Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
+// and writes it to the Parquet file/output. This is a crucial function that bridges
+// between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
+//
+// The function:
+// 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
+// 2. Creates a RecordBatch from these arrays (column-oriented data format)
+// 3. Writes the batch to the ParquetWriter
+// 4. Flushes the writer if the accumulated memory exceeds the threshold
+//
+// This approach enables efficient batch-wise writing while controlling memory usage.
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
     flush_threshold: usize,
-) -> Result<(),
+) -> Result<(), ReaderError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
         .iter_mut()
-        .map(|
-
+        .map(|c| {
+            let arr = c.take_array()?;
+            Ok((c.name.clone(), arr))
+        })
+        .collect::<Result<_, ReaderError>>()?;
 
-
-    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+    let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
         MagnusError::new(
             magnus::exception::runtime_error(),
-            format!("Failed to create
+            format!("Failed to create RecordBatch: {}", e),
         )
     })?;
 
-    writer
-        .write(&record_batch)
-        .map_err(|e| ParquetErrorWrapper(e))?;
+    writer.write(&record_batch)?;
 
+    // Check if we need to flush based on memory usage thresholds
     match writer {
         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
             if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
-                w.flush()
+                w.flush()?;
             }
         }
     }
-
     Ok(())
 }