parquet 0.4.1 → 0.5.0

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -1,16 +1,16 @@
 use std::{
     fs::File,
     io::{self, BufReader, BufWriter},
-    mem,
     sync::Arc,
 };

 use arrow_array::{Array, RecordBatch};
-use arrow_schema::{DataType, Field, Schema, TimeUnit};
+use arrow_schema::{DataType, Schema, TimeUnit};
+use itertools::Itertools;
 use magnus::{
     scan_args::{get_kwargs, scan_args},
     value::ReprValue,
-    Error as MagnusError, RArray, Ruby, TryConvert, Value,
+    Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
 };
 use parquet::{
     arrow::ArrowWriter,
@@ -22,18 +22,210 @@ use tempfile::NamedTempFile;

 use crate::{
     convert_ruby_array_to_arrow,
-    types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
-    IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
+    logger::RubyLogger,
+    reader::ReaderError,
+    types::{
+        schema_node::build_arrow_schema, // ADDED - we need to reference the DSL's build_arrow_schema
+        ColumnCollector,
+        ParquetSchemaType,
+        WriterOutput,
+    },
+    utils::parse_string_or_symbol,
+    IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
 };
+use crate::{types::PrimitiveType, SchemaNode}; // ADDED - ensure we import SchemaNode

-const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
-const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
-const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
-const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
-
-// Maximum memory usage per batch (64MB by default)
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
+const SAMPLE_SIZE: usize = 100;
+const MIN_BATCH_SIZE: usize = 10;
+const INITIAL_BATCH_SIZE: usize = 100;
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;

+// -----------------------------------------------------------------------------
+// HELPER to invert arrow DataType back to our ParquetSchemaType
+// Converts Arrow DataType to our internal ParquetSchemaType representation.
+// This is essential for mapping Arrow types back to our schema representation
+// when working with column collections and schema validation.
+// -----------------------------------------------------------------------------
+fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchemaType, MagnusError> {
+    match dt {
+        DataType::Boolean => Ok(PST::Boolean),
+        DataType::Int8 => Ok(PST::Int8),
+        DataType::Int16 => Ok(PST::Int16),
+        DataType::Int32 => Ok(PST::Int32),
+        DataType::Int64 => Ok(PST::Int64),
+        DataType::UInt8 => Ok(PST::UInt8),
+        DataType::UInt16 => Ok(PST::UInt16),
+        DataType::UInt32 => Ok(PST::UInt32),
+        DataType::UInt64 => Ok(PST::UInt64),
+        DataType::Float16 => {
+            // We do not have a direct ParquetSchemaType::Float16, we treat it as Float
+            Ok(PST::Float)
+        }
+        DataType::Float32 => Ok(PST::Float),
+        DataType::Float64 => Ok(PST::Double),
+        DataType::Date32 => Ok(PST::Date32),
+        DataType::Date64 => {
+            // Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
+            // We can store it as PST::Date64 if we want. If we don't have that, consider PST::Date32 or an error.
+            // If your existing code only handles Date32, you can error. But let's do PST::Date32 as fallback:
+            // Or define a new variant if you have one in your code. We'll show a fallback approach:
+            Err(MagnusError::new(
+                magnus::exception::runtime_error(),
+                "Arrow Date64 not directly supported in current ParquetSchemaType (use date32?).",
+            ))
+        }
+        DataType::Timestamp(TimeUnit::Second, _tz) => {
+            // We'll treat this as PST::TimestampMillis, or define PST::TimestampSecond
+            // For simplicity, let's map "second" to PST::TimestampMillis with a note:
+            Ok(PST::TimestampMillis)
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, _tz) => Ok(PST::TimestampMillis),
+        DataType::Timestamp(TimeUnit::Microsecond, _tz) => Ok(PST::TimestampMicros),
+        DataType::Timestamp(TimeUnit::Nanosecond, _tz) => {
+            // If you have a PST::TimestampNanos variant, use it. Otherwise, degrade to micros
+            // for demonstration:
+            Err(MagnusError::new(
+                magnus::exception::runtime_error(),
+                "TimestampNanos not supported, please adjust your schema or code.",
+            ))
+        }
+        DataType::Utf8 => Ok(PST::String),
+        DataType::Binary => Ok(PST::Binary),
+        DataType::LargeUtf8 => {
+            // If not supported, degrade or error. We'll degrade to PST::String
+            Ok(PST::String)
+        }
+        DataType::LargeBinary => Ok(PST::Binary),
+        DataType::List(child_field) => {
+            // Recursively handle the item type
+            let child_type = arrow_data_type_to_parquet_schema_type(child_field.data_type())?;
+            Ok(PST::List(Box::new(crate::types::ListField {
+                item_type: child_type,
+                format: None,
+                nullable: true,
+            })))
+        }
+        DataType::Map(entry_field, _keys_sorted) => {
+            // Arrow's Map -> a struct<key, value> inside
+            let entry_type = entry_field.data_type();
+            if let DataType::Struct(fields) = entry_type {
+                if fields.len() == 2 {
+                    let key_type = arrow_data_type_to_parquet_schema_type(fields[0].data_type())?;
+                    let value_type = arrow_data_type_to_parquet_schema_type(fields[1].data_type())?;
+                    Ok(PST::Map(Box::new(crate::types::MapField {
+                        key_type,
+                        value_type,
+                        key_format: None,
+                        value_format: None,
+                        value_nullable: true,
+                    })))
+                } else {
+                    Err(MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Map field must have exactly 2 child fields (key, value)",
+                    ))
+                }
+            } else {
+                Err(MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Map field is not a struct? Unexpected Arrow schema layout",
+                ))
+            }
+        }
+        DataType::Struct(arrow_fields) => {
+            // We treat this as PST::Struct. We'll recursively handle subfields
+            // but for top-level collecting we only store them as one column
+            // so the user data must pass a Ruby Hash or something for that field.
+            let mut schema_fields = vec![];
+            for f in arrow_fields {
+                let sub_type = arrow_data_type_to_parquet_schema_type(f.data_type())?;
+                schema_fields.push(SchemaField {
+                    name: f.name().clone(),
+                    type_: sub_type,
+                    format: None, // We can't see the 'format' from Arrow
+                    nullable: f.is_nullable(),
+                });
+            }
+            Ok(PST::Struct(Box::new(crate::types::StructField {
+                fields: schema_fields,
+            })))
+        }
+        _ => Err(MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Unsupported or unhandled Arrow DataType: {:?}", dt),
+        )),
+    }
+}
+
+// -----------------------------------------------------------------------------
+// HELPER to build ColumnCollectors for the DSL variant
+// This function converts a SchemaNode (from our DSL) into a collection of ColumnCollectors
+// that can accumulate values for each column in the schema.
+// - arrow_schema: The Arrow schema corresponding to our DSL schema
+// - root_node: The root SchemaNode (expected to be a Struct node) from which to build collectors
+// -----------------------------------------------------------------------------
+fn build_column_collectors_from_dsl<'a>(
+    ruby: &'a Ruby,
+    arrow_schema: &'a Arc<Schema>,
+    root_node: &'a SchemaNode,
+) -> Result<Vec<ColumnCollector<'a>>, MagnusError> {
+    // We expect the top-level schema node to be a Struct so that arrow_schema
+    // lines up with root_node.fields. If the user gave a top-level primitive, it would be 1 field, but
+    // our code calls build_arrow_schema under the assumption "top-level must be Struct."
+    let fields = match root_node {
+        SchemaNode::Struct { fields, .. } => fields,
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Top-level schema for DSL must be a struct",
+            ))
+        }
+    };
+
+    if fields.len() != arrow_schema.fields().len() {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Mismatch between DSL field count ({}) and Arrow fields ({})",
+                fields.len(),
+                arrow_schema.fields().len()
+            ),
+        ));
+    }
+
+    let mut collectors = Vec::with_capacity(fields.len());
+    for (arrow_field, schema_field_node) in arrow_schema.fields().iter().zip(fields) {
+        let name = arrow_field.name().clone();
+        let parquet_type = arrow_data_type_to_parquet_schema_type(arrow_field.data_type())?;
+
+        // Extract the optional format from the schema node
+        let format = extract_format_from_schema_node(schema_field_node);
+
+        // Build the ColumnCollector
+        collectors.push(ColumnCollector::new(
+            name,
+            parquet_type,
+            format,
+            arrow_field.is_nullable(),
+        ));
+    }
+    Ok(collectors)
+}
+
+// Helper to extract the format from a SchemaNode if available
+fn extract_format_from_schema_node(node: &SchemaNode) -> Option<String> {
+    match node {
+        SchemaNode::Primitive {
+            format: f,
+            parquet_type: _,
+            ..
+        } => f.clone(),
+        // For struct, list, map, etc. there's no single "format." We ignore it.
+        _ => None,
+    }
+}
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
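
The inversion helper added above is intentionally lossy: Float16 widens to Float, LargeUtf8 degrades to String, second-precision timestamps collapse into TimestampMillis, while Date64 and nanosecond timestamps are rejected outright. The following is a minimal standalone sketch of that decision table, using simplified stand-in enums (Dt, Pst, invert are illustrative names, not the gem's types):

    // Illustrative sketch only; mirrors the fallback choices documented above.
    #[derive(Debug)]
    enum Dt { Float16, Float32, LargeUtf8, TimestampSecond, Date64 }
    #[derive(Debug, PartialEq)]
    enum Pst { Float, String, TimestampMillis }

    fn invert(dt: &Dt) -> Result<Pst, String> {
        match dt {
            Dt::Float16 | Dt::Float32 => Ok(Pst::Float),     // no Float16 variant, widen to Float
            Dt::LargeUtf8 => Ok(Pst::String),                // degrade LargeUtf8 to String
            Dt::TimestampSecond => Ok(Pst::TimestampMillis), // seconds collapse into millis
            Dt::Date64 => Err("Date64 not supported; use date32".into()),
        }
    }

    fn main() {
        assert_eq!(invert(&Dt::Float16).unwrap(), Pst::Float);
        assert_eq!(invert(&Dt::LargeUtf8).unwrap(), Pst::String);
        assert_eq!(invert(&Dt::TimestampSecond).unwrap(), Pst::TimestampMillis);
        assert!(invert(&Dt::Date64).is_err());
    }
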
@@ -48,6 +240,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             Option<Option<usize>>,
             Option<Option<String>>,
             Option<Option<usize>>,
+            Option<Option<Value>>,
         ),
         (),
     >(
@@ -58,129 +251,231 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             "flush_threshold",
             "compression",
             "sample_size",
+            "logger",
         ],
     )?;

-    let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
-        MagnusError::new(
-            magnus::exception::type_error(),
-            "schema must be an array of hashes",
-        )
-    })?;
+    // The schema value could be one of:
+    // 1. An array of hashes (legacy format)
+    // 2. A hash with type: :struct (new DSL format)
+    // 3. nil (infer from data)
+    let schema_value = kwargs.required.0;

-    let mut schema = Vec::with_capacity(schema_array.len());
+    // Check if it's the new DSL format (a hash with type: :struct)
+    // We need to handle both direct hash objects and objects created via Parquet::Schema.define

-    for (idx, field_hash) in schema_array.into_iter().enumerate() {
-        if !field_hash.is_kind_of(ruby.class_hash()) {
-            return Err(MagnusError::new(
-                magnus::exception::type_error(),
-                format!("schema[{}] must be a hash", idx),
-            ));
+    // First, try to convert it to a Hash if it's not already a Hash
+    // This handles the case where schema_value is a Schema object from Parquet::Schema.define
+    let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
+        RHash::from_value(schema_value).ok_or_else(|| {
+            MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
+        })?
+    } else {
+        // Try to convert the object to a hash with to_h
+        match schema_value.respond_to("to_h", false) {
+            Ok(true) => {
+                match schema_value.funcall::<_, _, Value>("to_h", ()) {
+                    Ok(hash_val) => match RHash::from_value(hash_val) {
+                        Some(hash) => hash,
+                        None => {
+                            // Not a hash, continue to normal handling
+                            RHash::new()
+                        }
+                    },
+                    Err(_) => {
+                        // couldn't call to_h, continue to normal handling
+                        RHash::new()
+                    }
+                }
+            }
+            _ => {
+                // Doesn't respond to to_h, continue to normal handling
+                RHash::new()
+            }
         }
+    };

-        let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
-        if entries.len() != 1 {
-            return Err(MagnusError::new(
-                magnus::exception::type_error(),
-                format!("schema[{}] must contain exactly one key-value pair", idx),
-            ));
-        }
+    // Now check if it's a schema hash with a type: :struct field
+    let type_val = schema_hash.get(Symbol::new("type"));

-        let (name, type_value) = &entries[0];
-        let name = String::try_convert(name.clone())?;
+    if let Some(type_val) = type_val {
+        // If it has a type: :struct, it's the new DSL format
+        // Use parse_string_or_symbol to handle both String and Symbol values
+        let ttype = parse_string_or_symbol(&ruby, type_val)?;
+        if let Some(ref type_str) = ttype {
+            if type_str == "struct" {
+                // Parse using the new schema approach
+                let schema_node = crate::parse_schema_node(&ruby, schema_value)?;

-        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
-            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
-            let mut type_str = None;
-            let mut format_str = None;
+                validate_schema_node(&ruby, &schema_node)?;

-            for (key, value) in type_hash {
-                let key = String::try_convert(key)?;
-                match key.as_str() {
-                    "type" => type_str = Some(value),
-                    "format" => format_str = Some(String::try_convert(value)?),
-                    _ => {
-                        return Err(MagnusError::new(
-                            magnus::exception::type_error(),
-                            format!("Unknown key '{}' in type definition", key),
-                        ))
-                    }
-                }
+                return Ok(ParquetWriteArgs {
+                    read_from,
+                    write_to: kwargs.required.1,
+                    schema: schema_node,
+                    batch_size: kwargs.optional.0.flatten(),
+                    flush_threshold: kwargs.optional.1.flatten(),
+                    compression: kwargs.optional.2.flatten(),
+                    sample_size: kwargs.optional.3.flatten(),
+                    logger: kwargs.optional.4.flatten(),
+                });
             }
+        }
+    }

-            let type_str = type_str.ok_or_else(|| {
-                MagnusError::new(
-                    magnus::exception::type_error(),
-                    "Missing 'type' in type definition",
-                )
-            })?;
+    // If it's not a hash with type: :struct, handle as legacy format
+    let schema_fields = if schema_value.is_nil()
+        || (schema_value.is_kind_of(ruby.class_array())
+            && RArray::from_value(schema_value)
+                .ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Schema fields must be an array",
+                    )
+                })?
+                .len()
+                == 0)
+    {
+        // If schema is nil or an empty array, we need to peek at the first value to determine column count
+        let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
+        // Default to nullable:true for auto-inferred fields
+        crate::infer_schema_from_first_row(&ruby, first_value, true)?
+    } else {
+        // Legacy array format - use our centralized parser
+        crate::parse_legacy_schema(&ruby, schema_value)?
+    };

-            (ParquetSchemaType::try_convert(type_str)?, format_str)
-        } else {
-            (ParquetSchemaType::try_convert(type_value.clone())?, None)
-        };
+    // Convert the legacy schema fields to SchemaNode (DSL format)
+    let schema_node = crate::legacy_schema_to_dsl(&ruby, schema_fields)?;

-        schema.push(SchemaField {
-            name,
-            type_,
-            format,
-        });
-    }
+    validate_schema_node(&ruby, &schema_node)?;

     Ok(ParquetWriteArgs {
         read_from,
         write_to: kwargs.required.1,
-        schema,
+        schema: schema_node,
         batch_size: kwargs.optional.0.flatten(),
         flush_threshold: kwargs.optional.1.flatten(),
         compression: kwargs.optional.2.flatten(),
         sample_size: kwargs.optional.3.flatten(),
+        logger: kwargs.optional.4.flatten(),
     })
 }

-/// Estimate the size of a row
-fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
-    let mut row_size = 0;
-    for (field, value) in schema.iter().zip(row.into_iter()) {
-        // Estimate size based on type and value
-        row_size += match &field.type_ {
-            // Use reference to avoid moving
-            ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
-            ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
-            ParquetSchemaType::Int32
-            | ParquetSchemaType::UInt32
-            | ParquetSchemaType::Float
-            | ParquetSchemaType::Date32 => 4,
-            ParquetSchemaType::Int64
-            | ParquetSchemaType::UInt64
-            | ParquetSchemaType::Double
-            | ParquetSchemaType::TimestampMillis
-            | ParquetSchemaType::TimestampMicros => 8,
-            ParquetSchemaType::String => {
-                if let Ok(s) = String::try_convert(value) {
-                    s.len() + mem::size_of::<usize>() // account for length prefix
-                } else {
-                    16 // default estimate for string
-                }
-            }
-            ParquetSchemaType::Binary => {
-                if let Ok(bytes) = Vec::<u8>::try_convert(value) {
-                    bytes.len() + mem::size_of::<usize>() // account for length prefix
-                } else {
-                    16 // default estimate for binary
-                }
-            }
-            ParquetSchemaType::Boolean => 1,
-            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
-                32 // rough estimate for complex types
-            }
-        };
+// Validates a SchemaNode to ensure it meets Parquet schema requirements
+// Currently checks for duplicate field names at the root level, which would
+// cause problems when writing Parquet files. Additional validation rules
+// could be added here in the future.
+//
+// This validation is important because schema errors are difficult to debug
+// once they reach the Parquet/Arrow layer, so we check proactively before
+// any data processing begins.
+fn validate_schema_node(ruby: &Ruby, schema_node: &SchemaNode) -> Result<(), MagnusError> {
+    if let SchemaNode::Struct { fields, .. } = &schema_node {
+        // if any root level schema fields have the same name, we raise an error
+        let field_names = fields
+            .iter()
+            .map(|f| match f {
+                SchemaNode::Struct { name, .. } => name.as_str(),
+                SchemaNode::List { name, .. } => name.as_str(),
+                SchemaNode::Map { name, .. } => name.as_str(),
+                SchemaNode::Primitive { name, .. } => name.as_str(),
+            })
+            .collect::<Vec<_>>();
+        let unique_field_names = field_names.iter().unique().collect::<Vec<_>>();
+        if field_names.len() != unique_field_names.len() {
+            return Err(MagnusError::new(
+                ruby.exception_arg_error(),
+                format!(
+                    "Duplicate field names in root level schema: {:?}",
+                    field_names
+                ),
+            ));
+        }
     }
-    Ok(row_size)
+    Ok(())
+}
+
+// Processes a single data row and adds values to the corresponding column collectors
+// This function is called for each row of input data when writing in row-wise mode.
+// It performs important validation to ensure the row structure matches the schema:
+// - Verifies that the number of columns in the row matches the schema
+// - Distributes each value to the appropriate ColumnCollector
+//
+// Each ColumnCollector handles type conversion and accumulation for its specific column,
+// allowing this function to focus on row-level validation and distribution.
+fn process_row(
+    ruby: &Ruby,
+    row: Value,
+    column_collectors: &mut [ColumnCollector],
+) -> Result<(), MagnusError> {
+    let row_array = RArray::from_value(row)
+        .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
+
+    // Validate row length matches schema
+    if row_array.len() != column_collectors.len() {
+        return Err(MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!(
+                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                row_array.len(),
+                column_collectors.len(),
+                column_collectors
+                    .iter()
+                    .map(|c| c.name.as_str())
+                    .collect::<Vec<_>>()
+            ),
+        ));
+    }
+
+    // Process each value in the row
+    for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+        collector.push_value(value)?;
+    }
+
+    Ok(())
+}
+
+// Dynamically calculates an optimal batch size based on estimated row sizes
+// and memory constraints. This function enables the writer to adapt to different
+// data characteristics for optimal performance.
+//
+// The algorithm:
+// 1. Requires a minimum number of samples to make a reliable estimate
+// 2. Calculates the average row size from the samples
+// 3. Determines a batch size that would consume approximately the target memory threshold
+// 4. Ensures the batch size doesn't go below a minimum value for efficiency
+//
+// This approach balances memory usage with processing efficiency by targeting
+// a specific memory footprint per batch.
+fn update_batch_size(
+    size_samples: &[usize],
+    flush_threshold: usize,
+    min_batch_size: usize,
+) -> usize {
+    if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
+        return min_batch_size;
+    }
+
+    let total_size = size_samples.iter().sum::<usize>();
+    // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+    let avg_row_size = total_size as f64 / size_samples.len() as f64;
+    let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+    let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
+    suggested_batch_size.max(min_batch_size)
 }

 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+    write_rows_impl(args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })?;
+    Ok(())
+}
+
+#[inline]
+fn write_rows_impl(args: &[Value]) -> Result<(), ReaderError> {
     let ruby = unsafe { Ruby::get_unchecked() };

     let ParquetWriteArgs {
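
The adaptive batching introduced in this hunk reduces to simple arithmetic: once at least MIN_SAMPLES_FOR_ESTIMATE row sizes have been sampled, the batch size becomes the flush threshold divided by the average sampled row size, clamped below by MIN_BATCH_SIZE. A standalone sketch of that calculation (illustrative only, with the constants inlined; suggested_batch_size is a hypothetical helper name):

    // Illustrative sketch of the batch-size heuristic, not the gem's code.
    fn suggested_batch_size(samples: &[usize], flush_threshold: usize, min_batch: usize) -> usize {
        if samples.len() < 10 {
            // fewer than MIN_SAMPLES_FOR_ESTIMATE samples: keep the fallback batch size
            return min_batch;
        }
        let avg = (samples.iter().sum::<usize>() as f64 / samples.len() as f64).max(1.0);
        ((flush_threshold as f64 / avg).floor() as usize).max(min_batch)
    }

    fn main() {
        // 64 MiB threshold and ~256-byte rows => roughly 262_144 rows per batch.
        let samples = vec![256; 10];
        assert_eq!(suggested_batch_size(&samples, 64 * 1024 * 1024, 10), 262_144);
    }
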
@@ -191,59 +486,27 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         compression,
         flush_threshold,
         sample_size: user_sample_size,
+        logger,
     } = parse_parquet_write_args(args)?;

+    let logger = RubyLogger::new(&ruby, logger)?;
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);

-    // Convert schema to Arrow schema
-    let arrow_fields: Vec<Field> = schema
-        .iter()
-        .map(|field| {
-            Field::new(
-                &field.name,
-                match field.type_ {
-                    ParquetSchemaType::Int8 => DataType::Int8,
-                    ParquetSchemaType::Int16 => DataType::Int16,
-                    ParquetSchemaType::Int32 => DataType::Int32,
-                    ParquetSchemaType::Int64 => DataType::Int64,
-                    ParquetSchemaType::UInt8 => DataType::UInt8,
-                    ParquetSchemaType::UInt16 => DataType::UInt16,
-                    ParquetSchemaType::UInt32 => DataType::UInt32,
-                    ParquetSchemaType::UInt64 => DataType::UInt64,
-                    ParquetSchemaType::Float => DataType::Float32,
-                    ParquetSchemaType::Double => DataType::Float64,
-                    ParquetSchemaType::String => DataType::Utf8,
-                    ParquetSchemaType::Binary => DataType::Binary,
-                    ParquetSchemaType::Boolean => DataType::Boolean,
-                    ParquetSchemaType::Date32 => DataType::Date32,
-                    ParquetSchemaType::TimestampMillis => {
-                        DataType::Timestamp(TimeUnit::Millisecond, None)
-                    }
-                    ParquetSchemaType::TimestampMicros => {
-                        DataType::Timestamp(TimeUnit::Microsecond, None)
-                    }
-                    ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
-                    ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
-                },
-                true,
-            )
-        })
-        .collect();
-    let arrow_schema = Arc::new(Schema::new(arrow_fields));
+    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to build Arrow schema from DSL schema: {}", e),
+        )
+    })?;

     // Create the writer
     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
-        // Create collectors for each column
-        let mut column_collectors: Vec<ColumnCollector> = schema
-            .iter()
-            .map(|field| {
-                // Clone the type to avoid moving from a reference
-                let type_clone = field.type_.clone();
-                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
-            })
-            .collect();
+        // Build column collectors - we only have DSL schema now
+        let mut column_collectors =
+            build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;

         let mut rows_in_batch = 0;
         let mut total_rows = 0;
@@ -255,48 +518,33 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         loop {
             match read_from.funcall::<_, _, Value>("next", ()) {
                 Ok(row) => {
-                    let row_array = RArray::from_value(row).ok_or_else(|| {
-                        MagnusError::new(ruby.exception_type_error(), "Row must be an array")
-                    })?;
-
-                    // Validate row length matches schema
-                    if row_array.len() != column_collectors.len() {
-                        return Err(MagnusError::new(
-                            magnus::exception::type_error(),
-                            format!(
-                                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                                row_array.len(),
-                                column_collectors.len(),
-                                column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
-                            ),
-                        ));
-                    }
+                    // Process the row
+                    process_row(&ruby, row, &mut column_collectors)?;

-                    // Sample row sizes using reservoir sampling
+                    // Update row sampling for dynamic batch sizing
                     if size_samples.len() < sample_size {
-                        size_samples.push(estimate_single_row_size(&row_array, &schema)?);
-                    } else if rng.random_range(0..=total_rows) < sample_size {
-                        let idx = rng.random_range(0..sample_size);
-                        size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
-                    }
-
-                    // Process each value in the row
-                    for (collector, value) in column_collectors.iter_mut().zip(row_array) {
-                        collector.push_value(value)?;
+                        // estimate row size
+                        let row_array = RArray::from_value(row).ok_or_else(|| {
+                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                        })?;
+                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                        size_samples.push(row_size);
+                    } else if rng.random_range(0..=total_rows) < sample_size as usize {
+                        let idx = rng.random_range(0..sample_size as usize);
+                        let row_array = RArray::from_value(row).ok_or_else(|| {
+                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                        })?;
+                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                        size_samples[idx] = row_size;
                     }

                     rows_in_batch += 1;
                     total_rows += 1;

                     // Calculate batch size progressively once we have minimum samples
-                    if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
-                        let total_size = size_samples.iter().sum::<usize>();
-                        // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
-                        let avg_row_size = total_size as f64 / size_samples.len() as f64;
-                        let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
-                        let suggested_batch_size =
-                            (flush_threshold as f64 / avg_row_size).floor() as usize;
-                        current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
+                    if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
+                        current_batch_size =
+                            update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
                     }

                     // When we reach batch size, write the batch
@@ -313,19 +561,19 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                         }
                         break;
                     }
-                    return Err(e);
+                    return Err(e)?;
                 }
             }
         }
     } else {
         return Err(MagnusError::new(
             magnus::exception::type_error(),
-            "read_from must be an Enumerator",
-        ));
+            "read_from must be an Enumerator".to_string(),
+        ))?;
     }

     // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+    if let Some(temp_file) = writer.close()? {
         // If we got a temp file back, we need to copy its contents to the IO-like object
         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
     }
@@ -335,6 +583,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {

 #[inline]
 pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+    write_columns_impl(args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })?;
+    Ok(())
+}
+
+#[inline]
+fn write_columns_impl(args: &[Value]) -> Result<(), ReaderError> {
     let ruby = unsafe { Ruby::get_unchecked() };

     let ParquetWriteArgs {
@@ -345,45 +602,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         compression,
         flush_threshold,
         sample_size: _,
+        logger,
     } = parse_parquet_write_args(args)?;

+    let logger = RubyLogger::new(&ruby, logger)?;
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);

-    // Convert schema to Arrow schema
-    let arrow_fields: Vec<Field> = schema
-        .iter()
-        .map(|field| {
-            Field::new(
-                &field.name,
-                match field.type_ {
-                    ParquetSchemaType::Int8 => DataType::Int8,
-                    ParquetSchemaType::Int16 => DataType::Int16,
-                    ParquetSchemaType::Int32 => DataType::Int32,
-                    ParquetSchemaType::Int64 => DataType::Int64,
-                    ParquetSchemaType::UInt8 => DataType::UInt8,
-                    ParquetSchemaType::UInt16 => DataType::UInt16,
-                    ParquetSchemaType::UInt32 => DataType::UInt32,
-                    ParquetSchemaType::UInt64 => DataType::UInt64,
-                    ParquetSchemaType::Float => DataType::Float32,
-                    ParquetSchemaType::Double => DataType::Float64,
-                    ParquetSchemaType::String => DataType::Utf8,
-                    ParquetSchemaType::Binary => DataType::Binary,
-                    ParquetSchemaType::Boolean => DataType::Boolean,
-                    ParquetSchemaType::Date32 => DataType::Date32,
-                    ParquetSchemaType::TimestampMillis => {
-                        DataType::Timestamp(TimeUnit::Millisecond, None)
-                    }
-                    ParquetSchemaType::TimestampMicros => {
-                        DataType::Timestamp(TimeUnit::Microsecond, None)
-                    }
-                    ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
-                    ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
-                },
-                true,
-            )
-        })
-        .collect();
-    let arrow_schema = Arc::new(Schema::new(arrow_fields));
+    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to build Arrow schema from DSL schema: {}", e),
+        )
+    })?;

     // Create the writer
     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
@@ -396,37 +627,120 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                         MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
                     })?;

+                    // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
+                    batch_array.entry::<RArray>(0).map_err(|_| {
+                        MagnusError::new(
+                            ruby.exception_type_error(),
+                            "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
+                        )
+                    })?;
+
                     // Validate batch length matches schema
-                    if batch_array.len() != schema.len() {
+                    // Get schema length and field names - we only have DSL schema now
+                    let (schema_len, field_names): (usize, Vec<&str>) = {
+                        let fields = match &schema {
+                            SchemaNode::Struct { fields, .. } => fields,
+                            _ => {
+                                return Err(MagnusError::new(
+                                    magnus::exception::type_error(),
+                                    "Root schema node must be a struct type",
+                                ))?
+                            }
+                        };
+                        (
+                            fields.len(),
+                            fields
+                                .iter()
+                                .map(|f| match f {
+                                    SchemaNode::Primitive { name, .. } => name.as_str(),
+                                    SchemaNode::List { name, .. } => name.as_str(),
+                                    SchemaNode::Map { name, .. } => name.as_str(),
+                                    SchemaNode::Struct { name, .. } => name.as_str(),
+                                })
+                                .to_owned()
+                                .collect(),
+                        )
+                    };
+
+                    if batch_array.len() != schema_len {
                         return Err(MagnusError::new(
                             magnus::exception::type_error(),
                             format!(
                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
                                 batch_array.len(),
-                                schema.len(),
-                                schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
+                                schema_len,
+                                field_names
                             ),
-                        ));
+                        ))?;
                     }

                     // Convert each column in the batch to Arrow arrays
-                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
-                        .iter()
-                        .zip(batch_array)
-                        .map(|(field, column)| {
-                            let column_array = RArray::from_value(column).ok_or_else(|| {
+                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
+                        // Process each field in the DSL schema
+                        let fields = arrow_schema.fields();
+                        let top_fields =
+                            match &schema {
+                                SchemaNode::Struct { fields, .. } => fields,
+                                _ => return Err(MagnusError::new(
+                                    magnus::exception::runtime_error(),
+                                    "Top-level DSL schema must be a struct for columns approach",
+                                ))?,
+                            };
+                        if top_fields.len() != fields.len() {
+                            return Err(MagnusError::new(
+                                magnus::exception::runtime_error(),
+                                "Mismatch top-level DSL fields vs Arrow fields",
+                            ))?;
+                        }
+
+                        let mut out = vec![];
+                        for ((arrow_f, dsl_f), col_val) in
+                            fields.iter().zip(top_fields.iter()).zip(batch_array)
+                        {
+                            let col_arr = RArray::from_value(col_val).ok_or_else(|| {
                                 MagnusError::new(
                                     magnus::exception::type_error(),
-                                    format!("Column '{}' must be an array", field.name),
+                                    format!("Column '{}' must be an array", arrow_f.name()),
                                 )
                             })?;
-
-                            Ok((
-                                field.name.clone(),
-                                convert_ruby_array_to_arrow(column_array, &field.type_)?,
-                            ))
-                        })
-                        .collect::<Result<_, MagnusError>>()?;
+                            // Get appropriate parquet_type
+                            let ptype = match dsl_f {
+                                SchemaNode::Primitive {
+                                    parquet_type,
+                                    // Format is handled internally now
+                                    ..
+                                } => match parquet_type {
+                                    &PrimitiveType::Int8 => PST::Int8,
+                                    &PrimitiveType::Int16 => PST::Int16,
+                                    &PrimitiveType::Int32 => PST::Int32,
+                                    &PrimitiveType::Int64 => PST::Int64,
+                                    &PrimitiveType::UInt8 => PST::UInt8,
+                                    &PrimitiveType::UInt16 => PST::UInt16,
+                                    &PrimitiveType::UInt32 => PST::UInt32,
+                                    &PrimitiveType::UInt64 => PST::UInt64,
+                                    &PrimitiveType::Float32 => PST::Float,
+                                    &PrimitiveType::Float64 => PST::Double,
+                                    &PrimitiveType::String => PST::String,
+                                    &PrimitiveType::Binary => PST::Binary,
+                                    &PrimitiveType::Boolean => PST::Boolean,
+                                    &PrimitiveType::Date32 => PST::Date32,
+                                    &PrimitiveType::TimestampMillis => PST::TimestampMillis,
+                                    &PrimitiveType::TimestampMicros => PST::TimestampMicros,
+                                },
+                                SchemaNode::List { .. }
+                                | SchemaNode::Map { .. }
+                                | SchemaNode::Struct { .. } => {
+                                    // For nested, we just do a single "column" as well
+                                    arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
+                                }
+                            };
+                            out.push((
+                                arrow_f.name().clone(),
+                                convert_ruby_array_to_arrow(col_arr, &ptype)?,
+                            ));
+                        }
+                        out
+                    };

                     // Create and write record batch
                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
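
The new entry::<RArray>(0) check above enforces the column-wise input shape described in its error message: each element yielded by the enumerator is one batch, and each batch is an array of column arrays whose count must equal the number of schema fields. A plain-Rust sketch of that shape, with Vec standing in for the Ruby arrays (column names and values are hypothetical):

    // Shape sketch only: Vec<Vec<i64>> stands in for the Ruby batches-of-columns input.
    fn main() {
        let schema_len = 2; // e.g. columns "id" and "score"
        // Two batches, each carrying one array per column.
        let batches: Vec<Vec<Vec<i64>>> = vec![
            vec![vec![1, 2, 3], vec![10, 20, 30]], // batch 1: col1, col2
            vec![vec![4, 5], vec![40, 50]],        // batch 2: col1, col2
        ];
        for batch in &batches {
            assert_eq!(batch.len(), schema_len, "batch column count must match schema");
        }
    }
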
@@ -436,14 +750,12 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                         )
                     })?;

-                    writer
-                        .write(&record_batch)
-                        .map_err(|e| ParquetErrorWrapper(e))?;
+                    writer.write(&record_batch)?;

                     match &mut writer {
                         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
                             if w.in_progress_size() >= flush_threshold {
-                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                                w.flush()?;
                             }
                         }
                     }
@@ -452,19 +764,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
                         break;
                     }
-                    return Err(e);
+                    return Err(e)?;
                 }
             }
         }
     } else {
         return Err(MagnusError::new(
             magnus::exception::type_error(),
-            "read_from must be an Enumerator",
-        ));
+            "read_from must be an Enumerator".to_string(),
+        ))?;
     }

     // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+    if let Some(temp_file) = writer.close()? {
         // If we got a temp file back, we need to copy its contents to the IO-like object
         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
     }
@@ -472,12 +784,23 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     Ok(())
 }

+// Creates an appropriate Parquet writer based on the output target and compression settings
+// This function handles two main output scenarios:
+// 1. Writing directly to a file path (string)
+// 2. Writing to a Ruby IO-like object (using a temporary file as an intermediate buffer)
+//
+// For IO-like objects, the function creates a temporary file that is later copied to the
+// IO object when writing is complete. This approach is necessary because Parquet requires
+// random file access to write its footer after the data.
+//
+// The function also configures compression based on the user's preferences, with
+// several options available (none, snappy, gzip, lz4, zstd).
 fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
     compression: Option<String>,
-) -> Result<WriterOutput, MagnusError> {
+) -> Result<WriterOutput, ReaderError> {
     // Create writer properties with compression based on the option
     let props = WriterProperties::builder()
         .set_compression(match compression.as_deref() {
@@ -492,9 +815,8 @@ fn create_writer(

     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
-        let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
-        let writer =
-            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
+        let file: Box<dyn SendableWrite> = Box::new(File::create(path)?);
+        let writer = ArrowWriter::try_new(file, schema, Some(props))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -510,13 +832,22 @@ fn create_writer(
                 format!("Failed to reopen temporary file: {}", e),
             )
         })?);
-        let writer =
-            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
+        let writer = ArrowWriter::try_new(file, schema, Some(props))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }

-// Helper function to copy temp file contents to IoLikeValue
+// Copies the contents of a temporary file to a Ruby IO-like object
+// This function is necessary because Parquet writing requires random file access
+// (especially for writing the footer after all data), but Ruby IO objects may not
+// support seeking. The solution is to:
+//
+// 1. Write the entire Parquet file to a temporary file first
+// 2. Once writing is complete, copy the entire contents to the Ruby IO object
+//
+// This approach enables support for a wide range of Ruby IO objects like StringIO,
+// network streams, etc., but does require enough disk space for the temporary file
+// and involves a second full-file read/write operation at the end.
 fn copy_temp_file_to_io_like(
     temp_file: NamedTempFile,
     io_like: IoLikeValue,
@@ -540,36 +871,278 @@ fn copy_temp_file_to_io_like(
     Ok(())
 }

+// Estimates the memory size of a single row by examining each value
+// This is used for dynamic batch sizing to optimize memory usage during writes
+// by adapting batch sizes based on the actual data being processed.
+pub fn estimate_single_row_size(
+    row_array: &RArray,
+    collectors: &[ColumnCollector],
+) -> Result<usize, MagnusError> {
+    let mut size = 0;
+    for (idx, val) in row_array.into_iter().enumerate() {
+        let col_type = &collectors[idx].type_;
+        // Calculate size based on the type-specific estimation
+        size += estimate_value_size(val, col_type)?;
+    }
+    Ok(size)
+}
+
+// Estimates the memory footprint of a single value based on its schema type
+// This provides type-specific size estimates that help with dynamic batch sizing
+// For complex types like lists, maps, and structs, we use reasonable approximations
+pub fn estimate_value_size(
+    value: Value,
+    schema_type: &ParquetSchemaType,
+) -> Result<usize, MagnusError> {
+    use ParquetSchemaType as PST;
+    if value.is_nil() {
+        return Ok(0); // nil => minimal
+    }
+    match schema_type {
+        PST::Int8 | PST::UInt8 => Ok(1),
+        PST::Int16 | PST::UInt16 => Ok(2),
+        PST::Int32 | PST::UInt32 | PST::Float => Ok(4),
+        PST::Int64 | PST::UInt64 | PST::Double => Ok(8),
+        PST::Boolean => Ok(1),
+        PST::Date32 | PST::TimestampMillis | PST::TimestampMicros => Ok(8),
+        PST::String | PST::Binary => {
+            if let Ok(s) = String::try_convert(value) {
+                // Account for string length plus Rust String's capacity+pointer overhead
+                Ok(s.len() + std::mem::size_of::<usize>() * 3)
+            } else {
+                // Try to convert the value to a string using to_s for non-string types
+                // This handles numeric values that will be converted to strings later
+                let _ruby = unsafe { Ruby::get_unchecked() };
+                match value.funcall::<_, _, Value>("to_s", ()) {
+                    Ok(str_val) => {
+                        if let Ok(s) = String::try_convert(str_val) {
+                            Ok(s.len() + std::mem::size_of::<usize>() * 3)
+                        } else {
+                            // If to_s conversion fails, just use a reasonable default
+                            Ok(8) // Reasonable size estimate for small values
+                        }
+                    }
+                    Err(_) => {
+                        // If to_s method fails, use a default size
+                        Ok(8) // Reasonable size estimate for small values
+                    }
+                }
+            }
+        }
+        PST::List(item_type) => {
+            if let Ok(arr) = RArray::try_convert(value) {
+                let len = arr.len();
+
+                // Base overhead for the array structure (pointer, length, capacity)
+                let base_size = std::mem::size_of::<usize>() * 3;
+
+                // If empty, just return the base size
+                if len == 0 {
+                    return Ok(base_size);
+                }
+
+                // Sample up to 5 elements to get average element size
+                let sample_count = std::cmp::min(len, 5);
+                let mut total_sample_size = 0;
+
+                for i in 0..sample_count {
+                    let element = arr.entry(i as isize)?;
+                    let element_size = estimate_value_size(element, &item_type.item_type)?;
+                    total_sample_size += element_size;
+                }
+
+                // If we couldn't sample any elements properly, that's an error
+                if sample_count > 0 && total_sample_size == 0 {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to estimate size of list elements",
+                    ));
+                }
+
+                // Calculate average element size from samples
+                let avg_element_size = if sample_count > 0 {
+                    total_sample_size as f64 / sample_count as f64
+                } else {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample list elements for size estimation",
+                    ));
+                };
+
+                // Estimate total size based on average element size * length + base overhead
+                Ok(base_size + (avg_element_size as usize * len))
+            } else {
+                // Instead of assuming it's a small list, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected array for List type but got: {:?}", value),
+                ))
+            }
+        }
+        PST::Map(map_field) => {
+            if let Ok(hash) = RHash::try_convert(value) {
+                let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
+
+                // Base overhead for the hash structure
+                let base_size = std::mem::size_of::<usize>() * 4;
+
+                // If empty, just return the base size
+                if size_estimate == 0 {
+                    return Ok(base_size);
+                }
+
+                // Sample up to 5 key-value pairs to estimate average sizes
+                let mut key_sample_size = 0;
+                let mut value_sample_size = 0;
+                let mut sample_count = 0;
+
+                // Get an enumerator for the hash
+                let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
+
+                // Sample up to 5 entries
+                for _ in 0..std::cmp::min(size_estimate, 5) {
+                    match enumerator.funcall::<_, _, Value>("next", ()) {
+                        Ok(pair) => {
+                            if let Ok(pair_array) = RArray::try_convert(pair) {
+                                if pair_array.len() == 2 {
+                                    let key = pair_array.entry(0)?;
+                                    let val = pair_array.entry(1)?;
+
+                                    key_sample_size +=
+                                        estimate_value_size(key, &map_field.key_type)?;
+                                    value_sample_size +=
+                                        estimate_value_size(val, &map_field.value_type)?;
+                                    sample_count += 1;
+                                }
+                            }
+                        }
+                        Err(_) => break, // Stop if we reach the end
+                    }
+                }
+
+                // If we couldn't sample any pairs, return an error
+                if size_estimate > 0 && sample_count == 0 {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample map entries for size estimation",
+                    ));
+                }
+
+                // Calculate average key and value sizes
+                let (avg_key_size, avg_value_size) = if sample_count > 0 {
+                    (
+                        key_sample_size as f64 / sample_count as f64,
+                        value_sample_size as f64 / sample_count as f64,
+                    )
+                } else {
+                    return Err(MagnusError::new(
+                        magnus::exception::runtime_error(),
+                        "Failed to sample hash key-value pairs for size estimation",
+                    ));
+                };
+
+                // Each entry has overhead (node pointers, etc.) in a hash map
+                let entry_overhead = std::mem::size_of::<usize>() * 2;
+
+                // Estimate total size:
+                // base size + (key_size + value_size + entry_overhead) * count
+                Ok(base_size
+                    + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
+                        * size_estimate))
+            } else {
+                // Instead of assuming a small map, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected hash for Map type but got: {:?}", value),
+                ))
+            }
+        }
+        PST::Struct(struct_field) => {
+            if let Ok(hash) = RHash::try_convert(value) {
+                // Base overhead for the struct
+                let base_size = std::mem::size_of::<usize>() * 3;
+
+                // Estimate size for each field
+                let mut total_fields_size = 0;
+
+                for field in &struct_field.fields {
+                    // Try to get the field value from the hash
+                    match hash.get(Symbol::new(&field.name)) {
+                        Some(field_value) => {
+                            total_fields_size += estimate_value_size(field_value, &field.type_)?;
+                        }
+                        None => {
+                            if let Some(field_value) = hash.get(&*field.name) {
+                                total_fields_size +=
+                                    estimate_value_size(field_value, &field.type_)?;
+                            } else {
+                                if field.nullable {
+                                    total_fields_size += 0;
+                                } else {
+                                    return Err(MagnusError::new(
+                                        magnus::exception::runtime_error(),
+                                        format!("Missing field: {} in hash {:?}", field.name, hash),
+                                    ));
+                                }
+                            }
+                        }
+                    }
+                }
+
+                // We no longer error on missing fields during size estimation
+                Ok(base_size + total_fields_size)
+            } else {
+                // Instead of trying instance_variables or assuming a default, return an error
+                Err(MagnusError::new(
+                    magnus::exception::runtime_error(),
+                    format!("Expected hash for Struct type but got: {:?}", value),
+                ))
+            }
+        }
+    }
+}
+
+// Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
+// and writes it to the Parquet file/output. This is a crucial function that bridges
+// between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
+//
+// The function:
+// 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
+// 2. Creates a RecordBatch from these arrays (column-oriented data format)
+// 3. Writes the batch to the ParquetWriter
+// 4. Flushes the writer if the accumulated memory exceeds the threshold
+//
+// This approach enables efficient batch-wise writing while controlling memory usage.
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
     flush_threshold: usize,
-) -> Result<(), MagnusError> {
+) -> Result<(), ReaderError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
         .iter_mut()
-        .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
-        .collect::<Result<_, MagnusError>>()?;
+        .map(|c| {
+            let arr = c.take_array()?;
+            Ok((c.name.clone(), arr))
+        })
+        .collect::<Result<_, ReaderError>>()?;

-    // Create and write record batch
-    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+    let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
         MagnusError::new(
             magnus::exception::runtime_error(),
-            format!("Failed to create record batch: {}", e),
+            format!("Failed to create RecordBatch: {}", e),
         )
     })?;

-    writer
-        .write(&record_batch)
-        .map_err(|e| ParquetErrorWrapper(e))?;
+    writer.write(&record_batch)?;

+    // Check if we need to flush based on memory usage thresholds
     match writer {
         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
             if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
-                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                w.flush()?;
             }
         }
     }
-
     Ok(())
 }
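
The per-value estimates added in this hunk are heuristics, not exact measurements: scalars use their fixed widths, strings count their bytes plus three machine words of overhead, and lists and maps extrapolate from a sample of at most five elements. On a 64-bit target, a row of (Int64, 13-byte String, Boolean) is therefore estimated at 8 + (13 + 24) + 1 = 46 bytes. A minimal sketch of that arithmetic (illustrative values only):

    // Illustrative arithmetic only; mirrors the per-type estimates used above.
    fn main() {
        let word = std::mem::size_of::<usize>(); // 8 on 64-bit targets
        let int64 = 8;
        let string = "hello parquet".len() + word * 3; // 13 + 24 = 37
        let boolean = 1;
        assert_eq!(int64 + string + boolean, 46); // assumes a 64-bit target
    }
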