parquet 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,16 @@
1
1
  use std::{
2
2
  fs::File,
3
3
  io::{self, BufReader, BufWriter},
4
- mem,
5
4
  sync::Arc,
6
5
  };
7
6
 
8
7
  use arrow_array::{Array, RecordBatch};
9
- use arrow_schema::{DataType, Field, Schema, TimeUnit};
8
+ use arrow_schema::{DataType, Schema, TimeUnit};
9
+ use itertools::Itertools;
10
10
  use magnus::{
11
11
  scan_args::{get_kwargs, scan_args},
12
12
  value::ReprValue,
13
- Error as MagnusError, RArray, Ruby, TryConvert, Value,
13
+ Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
14
14
  };
15
15
  use parquet::{
16
16
  arrow::ArrowWriter,
@@ -22,18 +22,210 @@ use tempfile::NamedTempFile;
22
22
 
23
23
  use crate::{
24
24
  convert_ruby_array_to_arrow,
25
- types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
26
- IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
25
+ logger::RubyLogger,
26
+ reader::ReaderError,
27
+ types::{
28
+ schema_node::build_arrow_schema, // converts the DSL schema into an Arrow schema
29
+ ColumnCollector,
30
+ ParquetSchemaType,
31
+ WriterOutput,
32
+ },
33
+ utils::parse_string_or_symbol,
34
+ IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
27
35
  };
36
+ use crate::{types::PrimitiveType, SchemaNode}; // PrimitiveType and SchemaNode describe the new DSL schema
28
37
 
29
- const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
30
- const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
31
- const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
32
- const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
33
-
34
- // Maximum memory usage per batch (64MB by default)
38
+ const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
39
+ const SAMPLE_SIZE: usize = 100;
40
+ const MIN_BATCH_SIZE: usize = 10;
41
+ const INITIAL_BATCH_SIZE: usize = 100;
35
42
  const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
36
43
 
44
+ // -----------------------------------------------------------------------------
45
+ // HELPER: invert an Arrow DataType back to our ParquetSchemaType.
46
+ // This mapping is needed when building column collectors and validating
47
+ // schemas, where only the Arrow schema is available but our own type enum
48
+ // is required. Nested types (List, Map, Struct) are handled recursively.
49
+ // -----------------------------------------------------------------------------
50
+ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchemaType, MagnusError> {
51
+ match dt {
52
+ DataType::Boolean => Ok(PST::Boolean),
53
+ DataType::Int8 => Ok(PST::Int8),
54
+ DataType::Int16 => Ok(PST::Int16),
55
+ DataType::Int32 => Ok(PST::Int32),
56
+ DataType::Int64 => Ok(PST::Int64),
57
+ DataType::UInt8 => Ok(PST::UInt8),
58
+ DataType::UInt16 => Ok(PST::UInt16),
59
+ DataType::UInt32 => Ok(PST::UInt32),
60
+ DataType::UInt64 => Ok(PST::UInt64),
61
+ DataType::Float16 => {
62
+ // There is no ParquetSchemaType::Float16, so we widen half floats to Float (f32)
63
+ Ok(PST::Float)
64
+ }
65
+ DataType::Float32 => Ok(PST::Float),
66
+ DataType::Float64 => Ok(PST::Double),
67
+ DataType::Date32 => Ok(PST::Date32),
68
+ DataType::Date64 => {
69
+ // Arrow's Date64 has no direct ParquetSchemaType counterpart in this crate.
70
+ // Rather than silently downcasting to Date32, we surface an explicit error
71
+ // so the caller can adjust the schema (or a Date64 variant can be added
72
+ // here later).
73
+ Err(MagnusError::new(
74
+ magnus::exception::runtime_error(),
75
+ "Arrow Date64 not directly supported in current ParquetSchemaType (use date32?).",
76
+ ))
77
+ }
78
+ DataType::Timestamp(TimeUnit::Second, _tz) => {
79
+ // There is no second-precision variant, so second-resolution timestamps
80
+ // are represented as PST::TimestampMillis:
81
+ Ok(PST::TimestampMillis)
82
+ }
83
+ DataType::Timestamp(TimeUnit::Millisecond, _tz) => Ok(PST::TimestampMillis),
84
+ DataType::Timestamp(TimeUnit::Microsecond, _tz) => Ok(PST::TimestampMicros),
85
+ DataType::Timestamp(TimeUnit::Nanosecond, _tz) => {
86
+ // There is no PST::TimestampNanos variant, and silently degrading to micros
87
+ // would lose precision, so we report an error instead:
88
+ Err(MagnusError::new(
89
+ magnus::exception::runtime_error(),
90
+ "TimestampNanos not supported, please adjust your schema or code.",
91
+ ))
92
+ }
93
+ DataType::Utf8 => Ok(PST::String),
94
+ DataType::Binary => Ok(PST::Binary),
95
+ DataType::LargeUtf8 => {
96
+ // There is no dedicated large-string variant, so LargeUtf8 degrades to PST::String
97
+ Ok(PST::String)
98
+ }
99
+ DataType::LargeBinary => Ok(PST::Binary),
100
+ DataType::List(child_field) => {
101
+ // Recursively handle the item type
102
+ let child_type = arrow_data_type_to_parquet_schema_type(child_field.data_type())?;
103
+ Ok(PST::List(Box::new(crate::types::ListField {
104
+ item_type: child_type,
105
+ format: None,
106
+ nullable: true,
107
+ })))
108
+ }
109
+ DataType::Map(entry_field, _keys_sorted) => {
110
+ // Arrow's Map stores its entries as a struct<key, value>; unpack that entry type
111
+ let entry_type = entry_field.data_type();
112
+ if let DataType::Struct(fields) = entry_type {
113
+ if fields.len() == 2 {
114
+ let key_type = arrow_data_type_to_parquet_schema_type(fields[0].data_type())?;
115
+ let value_type = arrow_data_type_to_parquet_schema_type(fields[1].data_type())?;
116
+ Ok(PST::Map(Box::new(crate::types::MapField {
117
+ key_type,
118
+ value_type,
119
+ key_format: None,
120
+ value_format: None,
121
+ value_nullable: true,
122
+ })))
123
+ } else {
124
+ Err(MagnusError::new(
125
+ magnus::exception::type_error(),
126
+ "Map field must have exactly 2 child fields (key, value)",
127
+ ))
128
+ }
129
+ } else {
130
+ Err(MagnusError::new(
131
+ magnus::exception::type_error(),
132
+ "Map field is not a struct? Unexpected Arrow schema layout",
133
+ ))
134
+ }
135
+ }
136
+ DataType::Struct(arrow_fields) => {
137
+ // We treat this as PST::Struct and recursively convert each subfield,
137
+ // but for top-level collecting the struct is stored as a single column,
138
+ // so the corresponding Ruby values must be Hashes for that field.
140
+ let mut schema_fields = vec![];
141
+ for f in arrow_fields {
142
+ let sub_type = arrow_data_type_to_parquet_schema_type(f.data_type())?;
143
+ schema_fields.push(SchemaField {
144
+ name: f.name().clone(),
145
+ type_: sub_type,
146
+ format: None, // We can't see the 'format' from Arrow
147
+ nullable: f.is_nullable(),
148
+ });
149
+ }
150
+ Ok(PST::Struct(Box::new(crate::types::StructField {
151
+ fields: schema_fields,
152
+ })))
153
+ }
154
+ _ => Err(MagnusError::new(
155
+ magnus::exception::runtime_error(),
156
+ format!("Unsupported or unhandled Arrow DataType: {:?}", dt),
157
+ )),
158
+ }
159
+ }
160
+
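
The List arm above relies on an Arrow detail worth spelling out: the element type is carried by the List's child Field, which is why the helper recurses into child_field.data_type(). A minimal standalone sketch, assuming an arrow-schema version where DataType::List wraps an Arc<Field> (as the code above does):

    use std::sync::Arc;
    use arrow_schema::{DataType, Field};

    fn main() {
        // A list<int32> column: the item type lives in the nested child Field.
        let list_ty = DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));
        if let DataType::List(child) = &list_ty {
            assert_eq!(child.data_type(), &DataType::Int32);
            assert!(child.is_nullable());
        }
    }
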
161
+ // -----------------------------------------------------------------------------
162
+ // HELPER to build ColumnCollectors for the DSL variant
163
+ // This function converts a SchemaNode (from our DSL) into a collection of ColumnCollectors
164
+ // that can accumulate values for each column in the schema.
165
+ // - arrow_schema: The Arrow schema corresponding to our DSL schema
166
+ // - root_node: The root SchemaNode (expected to be a Struct node) from which to build collectors
167
+ // -----------------------------------------------------------------------------
168
+ fn build_column_collectors_from_dsl<'a>(
169
+ ruby: &'a Ruby,
170
+ arrow_schema: &'a Arc<Schema>,
171
+ root_node: &'a SchemaNode,
172
+ ) -> Result<Vec<ColumnCollector<'a>>, MagnusError> {
173
+ // The top-level schema node must be a Struct so that arrow_schema.fields()
174
+ // lines up one-to-one with root_node's fields; build_arrow_schema is called
175
+ // under the same "top level must be a Struct" assumption.
176
+ let fields = match root_node {
177
+ SchemaNode::Struct { fields, .. } => fields,
178
+ _ => {
179
+ return Err(MagnusError::new(
180
+ ruby.exception_runtime_error(),
181
+ "Top-level schema for DSL must be a struct",
182
+ ))
183
+ }
184
+ };
185
+
186
+ if fields.len() != arrow_schema.fields().len() {
187
+ return Err(MagnusError::new(
188
+ ruby.exception_runtime_error(),
189
+ format!(
190
+ "Mismatch between DSL field count ({}) and Arrow fields ({})",
191
+ fields.len(),
192
+ arrow_schema.fields().len()
193
+ ),
194
+ ));
195
+ }
196
+
197
+ let mut collectors = Vec::with_capacity(fields.len());
198
+ for (arrow_field, schema_field_node) in arrow_schema.fields().iter().zip(fields) {
199
+ let name = arrow_field.name().clone();
200
+ let parquet_type = arrow_data_type_to_parquet_schema_type(arrow_field.data_type())?;
201
+
202
+ // Extract the optional format from the schema node
203
+ let format = extract_format_from_schema_node(schema_field_node);
204
+
205
+ // Build the ColumnCollector
206
+ collectors.push(ColumnCollector::new(
207
+ name,
208
+ parquet_type,
209
+ format,
210
+ arrow_field.is_nullable(),
211
+ ));
212
+ }
213
+ Ok(collectors)
214
+ }
215
+
216
+ // Helper to extract the format from a SchemaNode if available
217
+ fn extract_format_from_schema_node(node: &SchemaNode) -> Option<String> {
218
+ match node {
219
+ SchemaNode::Primitive {
220
+ format: f,
221
+ parquet_type: _,
222
+ ..
223
+ } => f.clone(),
224
+ // For struct, list, map, etc. there's no single "format." We ignore it.
225
+ _ => None,
226
+ }
227
+ }
228
+
37
229
  /// Parse arguments for Parquet writing
38
230
  pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
39
231
  let ruby = unsafe { Ruby::get_unchecked() };
@@ -42,12 +234,13 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
42
234
 
43
235
  let kwargs = get_kwargs::<
44
236
  _,
45
- (Option<RArray>, Value),
237
+ (Value, Value),
46
238
  (
47
239
  Option<Option<usize>>,
48
240
  Option<Option<usize>>,
49
241
  Option<Option<String>>,
50
242
  Option<Option<usize>>,
243
+ Option<Option<Value>>,
51
244
  ),
52
245
  (),
53
246
  >(
@@ -58,146 +251,231 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
58
251
  "flush_threshold",
59
252
  "compression",
60
253
  "sample_size",
254
+ "logger",
61
255
  ],
62
256
  )?;
63
257
 
64
- let schema = if kwargs.required.0.is_none() || kwargs.required.0.unwrap().is_empty() {
65
- // If schema is nil, we need to peek at the first value to determine column count
66
- let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
67
- let array = RArray::from_value(first_value).ok_or_else(|| {
68
- MagnusError::new(
69
- magnus::exception::type_error(),
70
- "First value must be an array when schema is not provided",
71
- )
72
- })?;
73
-
74
- // Generate field names f0, f1, f2, etc.
75
- (0..array.len())
76
- .map(|i| SchemaField {
77
- name: format!("f{}", i),
78
- type_: ParquetSchemaType::String,
79
- format: None,
80
- })
81
- .collect()
82
- } else {
83
- let schema_array = kwargs.required.0.unwrap();
258
+ // The schema value could be one of:
259
+ // 1. An array of hashes (legacy format)
260
+ // 2. A hash with type: :struct (new DSL format)
261
+ // 3. nil (infer from data)
262
+ let schema_value = kwargs.required.0;
84
263
 
85
- let mut schema = Vec::with_capacity(schema_array.len());
264
+ // Check if it's the new DSL format (a hash with type: :struct)
265
+ // We need to handle both direct hash objects and objects created via Parquet::Schema.define
86
266
 
87
- for (idx, field_hash) in schema_array.into_iter().enumerate() {
88
- if !field_hash.is_kind_of(ruby.class_hash()) {
89
- return Err(MagnusError::new(
90
- magnus::exception::type_error(),
91
- format!("schema[{}] must be a hash", idx),
92
- ));
267
+ // First, try to convert it to a Hash if it's not already a Hash
268
+ // This handles the case where schema_value is a Schema object from Parquet::Schema.define
269
+ let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
270
+ RHash::from_value(schema_value).ok_or_else(|| {
271
+ MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
272
+ })?
273
+ } else {
274
+ // Try to convert the object to a hash with to_h
275
+ match schema_value.respond_to("to_h", false) {
276
+ Ok(true) => {
277
+ match schema_value.funcall::<_, _, Value>("to_h", ()) {
278
+ Ok(hash_val) => match RHash::from_value(hash_val) {
279
+ Some(hash) => hash,
280
+ None => {
281
+ // Not a hash, continue to normal handling
282
+ RHash::new()
283
+ }
284
+ },
285
+ Err(_) => {
286
+ // couldn't call to_h, continue to normal handling
287
+ RHash::new()
288
+ }
289
+ }
93
290
  }
94
-
95
- let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
96
- if entries.len() != 1 {
97
- return Err(MagnusError::new(
98
- magnus::exception::type_error(),
99
- format!("schema[{}] must contain exactly one key-value pair", idx),
100
- ));
291
+ _ => {
292
+ // Doesn't respond to to_h, continue to normal handling
293
+ RHash::new()
101
294
  }
295
+ }
296
+ };
102
297
 
103
- let (name, type_value) = &entries[0];
104
- let name = String::try_convert(name.clone())?;
298
+ // Now check if it's a schema hash with a type: :struct field
299
+ let type_val = schema_hash.get(Symbol::new("type"));
105
300
 
106
- let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
107
- let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
108
- let mut type_str = None;
109
- let mut format_str = None;
301
+ if let Some(type_val) = type_val {
302
+ // If it has a type: :struct, it's the new DSL format
303
+ // Use parse_string_or_symbol to handle both String and Symbol values
304
+ let ttype = parse_string_or_symbol(&ruby, type_val)?;
305
+ if let Some(ref type_str) = ttype {
306
+ if type_str == "struct" {
307
+ // Parse using the new schema approach
308
+ let schema_node = crate::parse_schema_node(&ruby, schema_value)?;
110
309
 
111
- for (key, value) in type_hash {
112
- let key = String::try_convert(key)?;
113
- match key.as_str() {
114
- "type" => type_str = Some(value),
115
- "format" => format_str = Some(String::try_convert(value)?),
116
- _ => {
117
- return Err(MagnusError::new(
118
- magnus::exception::type_error(),
119
- format!("Unknown key '{}' in type definition", key),
120
- ))
121
- }
122
- }
123
- }
310
+ validate_schema_node(&ruby, &schema_node)?;
124
311
 
125
- let type_str = type_str.ok_or_else(|| {
312
+ return Ok(ParquetWriteArgs {
313
+ read_from,
314
+ write_to: kwargs.required.1,
315
+ schema: schema_node,
316
+ batch_size: kwargs.optional.0.flatten(),
317
+ flush_threshold: kwargs.optional.1.flatten(),
318
+ compression: kwargs.optional.2.flatten(),
319
+ sample_size: kwargs.optional.3.flatten(),
320
+ logger: kwargs.optional.4.flatten(),
321
+ });
322
+ }
323
+ }
324
+ }
325
+
326
+ // If it's not a hash with type: :struct, handle as legacy format
327
+ let schema_fields = if schema_value.is_nil()
328
+ || (schema_value.is_kind_of(ruby.class_array())
329
+ && RArray::from_value(schema_value)
330
+ .ok_or_else(|| {
126
331
  MagnusError::new(
127
332
  magnus::exception::type_error(),
128
- "Missing 'type' in type definition",
333
+ "Schema fields must be an array",
129
334
  )
130
- })?;
335
+ })?
336
+ .len()
337
+ == 0)
338
+ {
339
+ // If schema is nil or an empty array, we need to peek at the first value to determine column count
340
+ let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
341
+ // Default to nullable:true for auto-inferred fields
342
+ crate::infer_schema_from_first_row(&ruby, first_value, true)?
343
+ } else {
344
+ // Legacy array format - use our centralized parser
345
+ crate::parse_legacy_schema(&ruby, schema_value)?
346
+ };
131
347
 
132
- (ParquetSchemaType::try_convert(type_str)?, format_str)
133
- } else {
134
- (ParquetSchemaType::try_convert(type_value.clone())?, None)
135
- };
136
-
137
- schema.push(SchemaField {
138
- name,
139
- type_,
140
- format,
141
- });
142
- }
348
+ // Convert the legacy schema fields to SchemaNode (DSL format)
349
+ let schema_node = crate::legacy_schema_to_dsl(&ruby, schema_fields)?;
143
350
 
144
- schema
145
- };
351
+ validate_schema_node(&ruby, &schema_node)?;
146
352
 
147
353
  Ok(ParquetWriteArgs {
148
354
  read_from,
149
355
  write_to: kwargs.required.1,
150
- schema,
356
+ schema: schema_node,
151
357
  batch_size: kwargs.optional.0.flatten(),
152
358
  flush_threshold: kwargs.optional.1.flatten(),
153
359
  compression: kwargs.optional.2.flatten(),
154
360
  sample_size: kwargs.optional.3.flatten(),
361
+ logger: kwargs.optional.4.flatten(),
155
362
  })
156
363
  }
157
364
 
158
- /// Estimate the size of a row
159
- fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
160
- let mut row_size = 0;
161
- for (field, value) in schema.iter().zip(row.into_iter()) {
162
- // Estimate size based on type and value
163
- row_size += match &field.type_ {
164
- // Use reference to avoid moving
165
- ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
166
- ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
167
- ParquetSchemaType::Int32
168
- | ParquetSchemaType::UInt32
169
- | ParquetSchemaType::Float
170
- | ParquetSchemaType::Date32 => 4,
171
- ParquetSchemaType::Int64
172
- | ParquetSchemaType::UInt64
173
- | ParquetSchemaType::Double
174
- | ParquetSchemaType::TimestampMillis
175
- | ParquetSchemaType::TimestampMicros => 8,
176
- ParquetSchemaType::String => {
177
- if let Ok(s) = String::try_convert(value) {
178
- s.len() + mem::size_of::<usize>() // account for length prefix
179
- } else {
180
- 16 // default estimate for string
181
- }
182
- }
183
- ParquetSchemaType::Binary => {
184
- if let Ok(bytes) = Vec::<u8>::try_convert(value) {
185
- bytes.len() + mem::size_of::<usize>() // account for length prefix
186
- } else {
187
- 16 // default estimate for binary
188
- }
189
- }
190
- ParquetSchemaType::Boolean => 1,
191
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
192
- 32 // rough estimate for complex types
193
- }
194
- };
365
+ // Validates a SchemaNode to ensure it meets Parquet schema requirements
366
+ // Currently checks for duplicate field names at the root level, which would
367
+ // cause problems when writing Parquet files. Additional validation rules
368
+ // could be added here in the future.
369
+ //
370
+ // This validation is important because schema errors are difficult to debug
371
+ // once they reach the Parquet/Arrow layer, so we check proactively before
372
+ // any data processing begins.
373
+ fn validate_schema_node(ruby: &Ruby, schema_node: &SchemaNode) -> Result<(), MagnusError> {
374
+ if let SchemaNode::Struct { fields, .. } = &schema_node {
375
+ // If any root-level schema fields share a name, raise an error
376
+ let field_names = fields
377
+ .iter()
378
+ .map(|f| match f {
379
+ SchemaNode::Struct { name, .. } => name.as_str(),
380
+ SchemaNode::List { name, .. } => name.as_str(),
381
+ SchemaNode::Map { name, .. } => name.as_str(),
382
+ SchemaNode::Primitive { name, .. } => name.as_str(),
383
+ })
384
+ .collect::<Vec<_>>();
385
+ let unique_field_names = field_names.iter().unique().collect::<Vec<_>>();
386
+ if field_names.len() != unique_field_names.len() {
387
+ return Err(MagnusError::new(
388
+ ruby.exception_arg_error(),
389
+ format!(
390
+ "Duplicate field names in root level schema: {:?}",
391
+ field_names
392
+ ),
393
+ ));
394
+ }
395
+ }
396
+ Ok(())
397
+ }
398
+
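
The duplicate check above reduces to comparing the name list with its deduplicated form, which is what the newly imported itertools adaptor provides. A minimal standalone sketch of that check:

    use itertools::Itertools;

    // True when at least one name occurs more than once.
    fn has_duplicates(names: &[&str]) -> bool {
        names.iter().unique().count() != names.len()
    }

    fn main() {
        assert!(has_duplicates(&["id", "name", "id"]));
        assert!(!has_duplicates(&["id", "name"]));
    }
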
399
+ // Processes a single data row and adds values to the corresponding column collectors
400
+ // This function is called for each row of input data when writing in row-wise mode.
401
+ // It performs important validation to ensure the row structure matches the schema:
402
+ // - Verifies that the number of columns in the row matches the schema
403
+ // - Distributes each value to the appropriate ColumnCollector
404
+ //
405
+ // Each ColumnCollector handles type conversion and accumulation for its specific column,
406
+ // allowing this function to focus on row-level validation and distribution.
407
+ fn process_row(
408
+ ruby: &Ruby,
409
+ row: Value,
410
+ column_collectors: &mut [ColumnCollector],
411
+ ) -> Result<(), MagnusError> {
412
+ let row_array = RArray::from_value(row)
413
+ .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
414
+
415
+ // Validate row length matches schema
416
+ if row_array.len() != column_collectors.len() {
417
+ return Err(MagnusError::new(
418
+ magnus::exception::runtime_error(),
419
+ format!(
420
+ "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
421
+ row_array.len(),
422
+ column_collectors.len(),
423
+ column_collectors
424
+ .iter()
425
+ .map(|c| c.name.as_str())
426
+ .collect::<Vec<_>>()
427
+ ),
428
+ ));
429
+ }
430
+
431
+ // Process each value in the row
432
+ for (collector, value) in column_collectors.iter_mut().zip(row_array) {
433
+ collector.push_value(value)?;
195
434
  }
196
- Ok(row_size)
435
+
436
+ Ok(())
437
+ }
438
+
439
+ // Dynamically calculates an optimal batch size based on estimated row sizes
440
+ // and memory constraints. This function enables the writer to adapt to different
441
+ // data characteristics for optimal performance.
442
+ //
443
+ // The algorithm:
444
+ // 1. Requires a minimum number of samples to make a reliable estimate
445
+ // 2. Calculates the average row size from the samples
446
+ // 3. Determines a batch size that would consume approximately the target memory threshold
447
+ // 4. Ensures the batch size doesn't go below a minimum value for efficiency
448
+ //
449
+ // This approach balances memory usage with processing efficiency by targeting
450
+ // a specific memory footprint per batch.
451
+ fn update_batch_size(
452
+ size_samples: &[usize],
453
+ flush_threshold: usize,
454
+ min_batch_size: usize,
455
+ ) -> usize {
456
+ if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
457
+ return min_batch_size;
458
+ }
459
+
460
+ let total_size = size_samples.iter().sum::<usize>();
461
+ // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
462
+ let avg_row_size = total_size as f64 / size_samples.len() as f64;
463
+ let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
464
+ let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
465
+ suggested_batch_size.max(min_batch_size)
197
466
  }
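
A worked example of the sizing rule above, as a standalone sketch that restates the same arithmetic: with the default 64 MiB threshold and ten sampled rows averaging 512 bytes, the suggested batch grows to 131,072 rows, and it never drops below the minimum.

    // Standalone restatement of update_batch_size, for illustration only.
    fn suggested_batch(size_samples: &[usize], flush_threshold: usize, min_batch_size: usize) -> usize {
        const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
        if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
            return min_batch_size;
        }
        let avg = (size_samples.iter().sum::<usize>() as f64 / size_samples.len() as f64).max(1.0);
        ((flush_threshold as f64 / avg).floor() as usize).max(min_batch_size)
    }

    fn main() {
        let samples = vec![512usize; 10];                                     // ten ~512-byte rows
        assert_eq!(suggested_batch(&samples, 64 * 1024 * 1024, 10), 131_072); // 64 MiB / 512 B
        assert_eq!(suggested_batch(&samples[..5], 64 * 1024 * 1024, 10), 10); // too few samples
    }
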
198
467
 
199
468
  #[inline]
200
469
  pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
470
+ write_rows_impl(args).map_err(|e| {
471
+ let z: MagnusError = e.into();
472
+ z
473
+ })?;
474
+ Ok(())
475
+ }
476
+
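
The wrapper above leans on From conversions between ReaderError and MagnusError: e.into() converts outward at the public boundary, while the Err(e)? calls inside the impl convert in the other direction. A minimal standalone sketch of the boundary half of that pattern, with stand-in error types (the names here are illustrative, not the crate's):

    #[derive(Debug)]
    struct InnerError(String); // stand-in for ReaderError

    #[derive(Debug)]
    struct OuterError(String); // stand-in for MagnusError

    impl From<InnerError> for OuterError {
        fn from(e: InnerError) -> Self {
            OuterError(format!("wrapped: {}", e.0))
        }
    }

    fn do_work() -> Result<(), InnerError> {
        Err(InnerError("read failed".into()))
    }

    // Public entry point converts the error type at the boundary,
    // mirroring write_rows -> write_rows_impl above.
    fn public_entry() -> Result<(), OuterError> {
        do_work()?; // `?` applies the From conversion automatically
        Ok(())
    }

    fn main() {
        assert!(public_entry().is_err());
    }
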
477
+ #[inline]
478
+ fn write_rows_impl(args: &[Value]) -> Result<(), ReaderError> {
201
479
  let ruby = unsafe { Ruby::get_unchecked() };
202
480
 
203
481
  let ParquetWriteArgs {
@@ -208,59 +486,27 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
208
486
  compression,
209
487
  flush_threshold,
210
488
  sample_size: user_sample_size,
489
+ logger,
211
490
  } = parse_parquet_write_args(args)?;
212
491
 
492
+ let logger = RubyLogger::new(&ruby, logger)?;
213
493
  let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
214
494
 
215
- // Convert schema to Arrow schema
216
- let arrow_fields: Vec<Field> = schema
217
- .iter()
218
- .map(|field| {
219
- Field::new(
220
- &field.name,
221
- match field.type_ {
222
- ParquetSchemaType::Int8 => DataType::Int8,
223
- ParquetSchemaType::Int16 => DataType::Int16,
224
- ParquetSchemaType::Int32 => DataType::Int32,
225
- ParquetSchemaType::Int64 => DataType::Int64,
226
- ParquetSchemaType::UInt8 => DataType::UInt8,
227
- ParquetSchemaType::UInt16 => DataType::UInt16,
228
- ParquetSchemaType::UInt32 => DataType::UInt32,
229
- ParquetSchemaType::UInt64 => DataType::UInt64,
230
- ParquetSchemaType::Float => DataType::Float32,
231
- ParquetSchemaType::Double => DataType::Float64,
232
- ParquetSchemaType::String => DataType::Utf8,
233
- ParquetSchemaType::Binary => DataType::Binary,
234
- ParquetSchemaType::Boolean => DataType::Boolean,
235
- ParquetSchemaType::Date32 => DataType::Date32,
236
- ParquetSchemaType::TimestampMillis => {
237
- DataType::Timestamp(TimeUnit::Millisecond, None)
238
- }
239
- ParquetSchemaType::TimestampMicros => {
240
- DataType::Timestamp(TimeUnit::Microsecond, None)
241
- }
242
- ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
243
- ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
244
- },
245
- true,
246
- )
247
- })
248
- .collect();
249
- let arrow_schema = Arc::new(Schema::new(arrow_fields));
495
+ // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
496
+ let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
497
+ MagnusError::new(
498
+ magnus::exception::runtime_error(),
499
+ format!("Failed to build Arrow schema from DSL schema: {}", e),
500
+ )
501
+ })?;
250
502
 
251
503
  // Create the writer
252
504
  let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
253
505
 
254
506
  if read_from.is_kind_of(ruby.class_enumerator()) {
255
- // Create collectors for each column
256
- let mut column_collectors: Vec<ColumnCollector> = schema
257
- .iter()
258
- .map(|field| {
259
- // Clone the type to avoid moving from a reference
260
- let type_clone = field.type_.clone();
261
- ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
262
- })
263
- .collect();
507
+ // Build column collectors - we only have DSL schema now
508
+ let mut column_collectors =
509
+ build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;
264
510
 
265
511
  let mut rows_in_batch = 0;
266
512
  let mut total_rows = 0;
@@ -272,48 +518,33 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
272
518
  loop {
273
519
  match read_from.funcall::<_, _, Value>("next", ()) {
274
520
  Ok(row) => {
275
- let row_array = RArray::from_value(row).ok_or_else(|| {
276
- MagnusError::new(ruby.exception_type_error(), "Row must be an array")
277
- })?;
521
+ // Process the row
522
+ process_row(&ruby, row, &mut column_collectors)?;
278
523
 
279
- // Validate row length matches schema
280
- if row_array.len() != column_collectors.len() {
281
- return Err(MagnusError::new(
282
- magnus::exception::type_error(),
283
- format!(
284
- "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
285
- row_array.len(),
286
- column_collectors.len(),
287
- column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
288
- ),
289
- ));
290
- }
291
-
292
- // Sample row sizes using reservoir sampling
524
+ // Update row sampling for dynamic batch sizing
293
525
  if size_samples.len() < sample_size {
294
- size_samples.push(estimate_single_row_size(&row_array, &schema)?);
295
- } else if rng.random_range(0..=total_rows) < sample_size {
296
- let idx = rng.random_range(0..sample_size);
297
- size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
298
- }
299
-
300
- // Process each value in the row
301
- for (collector, value) in column_collectors.iter_mut().zip(row_array) {
302
- collector.push_value(value)?;
526
+ // estimate row size
527
+ let row_array = RArray::from_value(row).ok_or_else(|| {
528
+ MagnusError::new(ruby.exception_type_error(), "Row must be an array")
529
+ })?;
530
+ let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
531
+ size_samples.push(row_size);
532
+ } else if rng.random_range(0..=total_rows) < sample_size as usize {
533
+ let idx = rng.random_range(0..sample_size as usize);
534
+ let row_array = RArray::from_value(row).ok_or_else(|| {
535
+ MagnusError::new(ruby.exception_type_error(), "Row must be an array")
536
+ })?;
537
+ let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
538
+ size_samples[idx] = row_size;
303
539
  }
304
540
 
305
541
  rows_in_batch += 1;
306
542
  total_rows += 1;
307
543
 
308
544
  // Calculate batch size progressively once we have minimum samples
309
- if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
310
- let total_size = size_samples.iter().sum::<usize>();
311
- // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
312
- let avg_row_size = total_size as f64 / size_samples.len() as f64;
313
- let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
314
- let suggested_batch_size =
315
- (flush_threshold as f64 / avg_row_size).floor() as usize;
316
- current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
545
+ if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
546
+ current_batch_size =
547
+ update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
317
548
  }
318
549
 
319
550
  // When we reach batch size, write the batch
@@ -330,19 +561,19 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
330
561
  }
331
562
  break;
332
563
  }
333
- return Err(e);
564
+ return Err(e)?;
334
565
  }
335
566
  }
336
567
  }
337
568
  } else {
338
569
  return Err(MagnusError::new(
339
570
  magnus::exception::type_error(),
340
- "read_from must be an Enumerator",
341
- ));
571
+ "read_from must be an Enumerator".to_string(),
572
+ ))?;
342
573
  }
343
574
 
344
575
  // Ensure everything is written and get the temp file if it exists
345
- if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
576
+ if let Some(temp_file) = writer.close()? {
346
577
  // If we got a temp file back, we need to copy its contents to the IO-like object
347
578
  copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
348
579
  }
@@ -352,6 +583,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
352
583
 
353
584
  #[inline]
354
585
  pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
586
+ write_columns_impl(args).map_err(|e| {
587
+ let z: MagnusError = e.into();
588
+ z
589
+ })?;
590
+ Ok(())
591
+ }
592
+
593
+ #[inline]
594
+ fn write_columns_impl(args: &[Value]) -> Result<(), ReaderError> {
355
595
  let ruby = unsafe { Ruby::get_unchecked() };
356
596
 
357
597
  let ParquetWriteArgs {
@@ -362,45 +602,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
362
602
  compression,
363
603
  flush_threshold,
364
604
  sample_size: _,
605
+ logger,
365
606
  } = parse_parquet_write_args(args)?;
366
607
 
608
+ let logger = RubyLogger::new(&ruby, logger)?;
367
609
  let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
368
610
 
369
- // Convert schema to Arrow schema
370
- let arrow_fields: Vec<Field> = schema
371
- .iter()
372
- .map(|field| {
373
- Field::new(
374
- &field.name,
375
- match field.type_ {
376
- ParquetSchemaType::Int8 => DataType::Int8,
377
- ParquetSchemaType::Int16 => DataType::Int16,
378
- ParquetSchemaType::Int32 => DataType::Int32,
379
- ParquetSchemaType::Int64 => DataType::Int64,
380
- ParquetSchemaType::UInt8 => DataType::UInt8,
381
- ParquetSchemaType::UInt16 => DataType::UInt16,
382
- ParquetSchemaType::UInt32 => DataType::UInt32,
383
- ParquetSchemaType::UInt64 => DataType::UInt64,
384
- ParquetSchemaType::Float => DataType::Float32,
385
- ParquetSchemaType::Double => DataType::Float64,
386
- ParquetSchemaType::String => DataType::Utf8,
387
- ParquetSchemaType::Binary => DataType::Binary,
388
- ParquetSchemaType::Boolean => DataType::Boolean,
389
- ParquetSchemaType::Date32 => DataType::Date32,
390
- ParquetSchemaType::TimestampMillis => {
391
- DataType::Timestamp(TimeUnit::Millisecond, None)
392
- }
393
- ParquetSchemaType::TimestampMicros => {
394
- DataType::Timestamp(TimeUnit::Microsecond, None)
395
- }
396
- ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
397
- ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
398
- },
399
- true,
400
- )
401
- })
402
- .collect();
403
- let arrow_schema = Arc::new(Schema::new(arrow_fields));
611
+ // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
612
+ let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
613
+ MagnusError::new(
614
+ magnus::exception::runtime_error(),
615
+ format!("Failed to build Arrow schema from DSL schema: {}", e),
616
+ )
617
+ })?;
404
618
 
405
619
  // Create the writer
406
620
  let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
@@ -422,36 +636,111 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
422
636
  })?;
423
637
 
424
638
  // Validate batch length matches schema
425
- if batch_array.len() != schema.len() {
639
+ // Get schema length and field names - we only have DSL schema now
640
+ let (schema_len, field_names): (usize, Vec<&str>) = {
641
+ let fields = match &schema {
642
+ SchemaNode::Struct { fields, .. } => fields,
643
+ _ => {
644
+ return Err(MagnusError::new(
645
+ magnus::exception::type_error(),
646
+ "Root schema node must be a struct type",
647
+ ))?
648
+ }
649
+ };
650
+ (
651
+ fields.len(),
652
+ fields
653
+ .iter()
654
+ .map(|f| match f {
655
+ SchemaNode::Primitive { name, .. } => name.as_str(),
656
+ SchemaNode::List { name, .. } => name.as_str(),
657
+ SchemaNode::Map { name, .. } => name.as_str(),
658
+ SchemaNode::Struct { name, .. } => name.as_str(),
659
+ })
660
+ .to_owned()
661
+ .collect(),
662
+ )
663
+ };
664
+
665
+ if batch_array.len() != schema_len {
426
666
  return Err(MagnusError::new(
427
667
  magnus::exception::type_error(),
428
668
  format!(
429
669
  "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
430
670
  batch_array.len(),
431
- schema.len(),
432
- schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
671
+ schema_len,
672
+ field_names
433
673
  ),
434
- ));
674
+ ))?;
435
675
  }
436
676
 
437
677
  // Convert each column in the batch to Arrow arrays
438
- let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
439
- .iter()
440
- .zip(batch_array)
441
- .map(|(field, column)| {
442
- let column_array = RArray::from_value(column).ok_or_else(|| {
678
+ let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
679
+ // Process each field in the DSL schema
680
+ let fields = arrow_schema.fields();
681
+ let top_fields =
682
+ match &schema {
683
+ SchemaNode::Struct { fields, .. } => fields,
684
+ _ => return Err(MagnusError::new(
685
+ magnus::exception::runtime_error(),
686
+ "Top-level DSL schema must be a struct for columns approach",
687
+ ))?,
688
+ };
689
+ if top_fields.len() != fields.len() {
690
+ return Err(MagnusError::new(
691
+ magnus::exception::runtime_error(),
692
+ "Mismatch top-level DSL fields vs Arrow fields",
693
+ ))?;
694
+ }
695
+
696
+ let mut out = vec![];
697
+ for ((arrow_f, dsl_f), col_val) in
698
+ fields.iter().zip(top_fields.iter()).zip(batch_array)
699
+ {
700
+ let col_arr = RArray::from_value(col_val).ok_or_else(|| {
443
701
  MagnusError::new(
444
702
  magnus::exception::type_error(),
445
- format!("Column '{}' must be an array", field.name),
703
+ format!("Column '{}' must be an array", arrow_f.name()),
446
704
  )
447
705
  })?;
448
-
449
- Ok((
450
- field.name.clone(),
451
- convert_ruby_array_to_arrow(column_array, &field.type_)?,
452
- ))
453
- })
454
- .collect::<Result<_, MagnusError>>()?;
706
+ // Get appropriate parquet_type
707
+ let ptype = match dsl_f {
708
+ SchemaNode::Primitive {
709
+ parquet_type,
710
+ // Format is handled internally now
711
+ ..
712
+ } => match parquet_type {
713
+ &PrimitiveType::Int8 => PST::Int8,
714
+ &PrimitiveType::Int16 => PST::Int16,
715
+ &PrimitiveType::Int32 => PST::Int32,
716
+ &PrimitiveType::Int64 => PST::Int64,
717
+ &PrimitiveType::UInt8 => PST::UInt8,
718
+ &PrimitiveType::UInt16 => PST::UInt16,
719
+ &PrimitiveType::UInt32 => PST::UInt32,
720
+ &PrimitiveType::UInt64 => PST::UInt64,
721
+ &PrimitiveType::Float32 => PST::Float,
722
+ &PrimitiveType::Float64 => PST::Double,
723
+ &PrimitiveType::String => PST::String,
724
+ &PrimitiveType::Binary => PST::Binary,
725
+ &PrimitiveType::Boolean => PST::Boolean,
726
+ &PrimitiveType::Date32 => PST::Date32,
727
+ &PrimitiveType::TimestampMillis => PST::TimestampMillis,
728
+ &PrimitiveType::TimestampMicros => PST::TimestampMicros,
729
+ },
730
+ SchemaNode::List { .. }
731
+ | SchemaNode::Map { .. }
732
+ | SchemaNode::Struct { .. } => {
733
+ // Nested types (list/map/struct) are still converted as a single column
734
+ arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
735
+ }
736
+ };
737
+ out.push((
738
+ arrow_f.name().clone(),
739
+ convert_ruby_array_to_arrow(col_arr, &ptype)?,
740
+ ));
741
+ }
742
+ out
743
+ };
455
744
 
456
745
  // Create and write record batch
457
746
  let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
@@ -461,14 +750,12 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
461
750
  )
462
751
  })?;
463
752
 
464
- writer
465
- .write(&record_batch)
466
- .map_err(|e| ParquetErrorWrapper(e))?;
753
+ writer.write(&record_batch)?;
467
754
 
468
755
  match &mut writer {
469
756
  WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
470
757
  if w.in_progress_size() >= flush_threshold {
471
- w.flush().map_err(|e| ParquetErrorWrapper(e))?;
758
+ w.flush()?;
472
759
  }
473
760
  }
474
761
  }
@@ -477,19 +764,19 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
477
764
  if e.is_kind_of(ruby.exception_stop_iteration()) {
478
765
  break;
479
766
  }
480
- return Err(e);
767
+ return Err(e)?;
481
768
  }
482
769
  }
483
770
  }
484
771
  } else {
485
772
  return Err(MagnusError::new(
486
773
  magnus::exception::type_error(),
487
- "read_from must be an Enumerator",
488
- ));
774
+ "read_from must be an Enumerator".to_string(),
775
+ ))?;
489
776
  }
490
777
 
491
778
  // Ensure everything is written and get the temp file if it exists
492
- if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
779
+ if let Some(temp_file) = writer.close()? {
493
780
  // If we got a temp file back, we need to copy its contents to the IO-like object
494
781
  copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
495
782
  }
@@ -497,12 +784,23 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
497
784
  Ok(())
498
785
  }
499
786
 
787
+ // Creates an appropriate Parquet writer based on the output target and compression settings
788
+ // This function handles two main output scenarios:
789
+ // 1. Writing directly to a file path (string)
790
+ // 2. Writing to a Ruby IO-like object (using a temporary file as an intermediate buffer)
791
+ //
792
+ // For IO-like objects, the function creates a temporary file that is later copied to the
793
+ // IO object when writing is complete. This approach is necessary because Parquet requires
794
+ // random file access to write its footer after the data.
795
+ //
796
+ // The function also configures compression based on the user's preferences, with
797
+ // several options available (none, snappy, gzip, lz4, zstd).
500
798
  fn create_writer(
501
799
  ruby: &Ruby,
502
800
  write_to: &Value,
503
801
  schema: Arc<Schema>,
504
802
  compression: Option<String>,
505
- ) -> Result<WriterOutput, MagnusError> {
803
+ ) -> Result<WriterOutput, ReaderError> {
506
804
  // Create writer properties with compression based on the option
507
805
  let props = WriterProperties::builder()
508
806
  .set_compression(match compression.as_deref() {
@@ -517,9 +815,8 @@ fn create_writer(
517
815
 
518
816
  if write_to.is_kind_of(ruby.class_string()) {
519
817
  let path = write_to.to_r_string()?.to_string()?;
520
- let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
521
- let writer =
522
- ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
818
+ let file: Box<dyn SendableWrite> = Box::new(File::create(path)?);
819
+ let writer = ArrowWriter::try_new(file, schema, Some(props))?;
523
820
  Ok(WriterOutput::File(writer))
524
821
  } else {
525
822
  // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -535,13 +832,22 @@ fn create_writer(
535
832
  format!("Failed to reopen temporary file: {}", e),
536
833
  )
537
834
  })?);
538
- let writer =
539
- ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
835
+ let writer = ArrowWriter::try_new(file, schema, Some(props))?;
540
836
  Ok(WriterOutput::TempFile(writer, temp_file))
541
837
  }
542
838
  }
543
839
 
544
- // Helper function to copy temp file contents to IoLikeValue
840
+ // Copies the contents of a temporary file to a Ruby IO-like object
841
+ // This function is necessary because Parquet writing requires random file access
842
+ // (especially for writing the footer after all data), but Ruby IO objects may not
843
+ // support seeking. The solution is to:
844
+ //
845
+ // 1. Write the entire Parquet file to a temporary file first
846
+ // 2. Once writing is complete, copy the entire contents to the Ruby IO object
847
+ //
848
+ // This approach enables support for a wide range of Ruby IO objects like StringIO,
849
+ // network streams, etc., but does require enough disk space for the temporary file
850
+ // and involves a second full-file read/write operation at the end.
545
851
  fn copy_temp_file_to_io_like(
546
852
  temp_file: NamedTempFile,
547
853
  io_like: IoLikeValue,
@@ -565,36 +871,278 @@ fn copy_temp_file_to_io_like(
565
871
  Ok(())
566
872
  }
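
A standalone sketch of the buffer-and-copy strategy described above, using only std; the crate's version wraps a Ruby IO object on the write side and its own error types, but the shape is the same:

    use std::fs::File;
    use std::io::{self, BufReader, BufWriter, Write};

    // Copy a finished temporary file into any destination writer in buffered
    // chunks, so the destination never needs to support seeking.
    fn copy_file_to_writer<W: Write>(src: File, dst: W) -> io::Result<u64> {
        let mut reader = BufReader::new(src);
        let mut writer = BufWriter::new(dst);
        let bytes = io::copy(&mut reader, &mut writer)?;
        writer.flush()?;
        Ok(bytes)
    }

    fn main() -> io::Result<()> {
        let tmp = std::env::temp_dir().join("parquet_copy_demo.bin");
        std::fs::write(&tmp, b"parquet bytes")?;
        let copied = copy_file_to_writer(File::open(&tmp)?, io::sink())?;
        assert_eq!(copied, 13);
        std::fs::remove_file(&tmp)?;
        Ok(())
    }
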
567
873
 
874
+ // Estimates the memory size of a single row by examining each value
875
+ // This is used for dynamic batch sizing to optimize memory usage during writes
876
+ // by adapting batch sizes based on the actual data being processed.
877
+ pub fn estimate_single_row_size(
878
+ row_array: &RArray,
879
+ collectors: &[ColumnCollector],
880
+ ) -> Result<usize, MagnusError> {
881
+ let mut size = 0;
882
+ for (idx, val) in row_array.into_iter().enumerate() {
883
+ let col_type = &collectors[idx].type_;
884
+ // Calculate size based on the type-specific estimation
885
+ size += estimate_value_size(val, col_type)?;
886
+ }
887
+ Ok(size)
888
+ }
889
+
890
+ // Estimates the memory footprint of a single value based on its schema type
891
+ // This provides type-specific size estimates that help with dynamic batch sizing
892
+ // For complex types like lists, maps, and structs, we use reasonable approximations
893
+ pub fn estimate_value_size(
894
+ value: Value,
895
+ schema_type: &ParquetSchemaType,
896
+ ) -> Result<usize, MagnusError> {
897
+ use ParquetSchemaType as PST;
898
+ if value.is_nil() {
899
+ return Ok(0); // nil => minimal
900
+ }
901
+ match schema_type {
902
+ PST::Int8 | PST::UInt8 => Ok(1),
903
+ PST::Int16 | PST::UInt16 => Ok(2),
904
+ PST::Int32 | PST::UInt32 | PST::Float => Ok(4),
905
+ PST::Int64 | PST::UInt64 | PST::Double => Ok(8),
906
+ PST::Boolean => Ok(1),
907
+ PST::Date32 | PST::TimestampMillis | PST::TimestampMicros => Ok(8),
908
+ PST::String | PST::Binary => {
909
+ if let Ok(s) = String::try_convert(value) {
910
+ // Account for string length plus Rust String's capacity+pointer overhead
911
+ Ok(s.len() + std::mem::size_of::<usize>() * 3)
912
+ } else {
913
+ // Try to convert the value to a string using to_s for non-string types
914
+ // This handles numeric values that will be converted to strings later
915
+ let _ruby = unsafe { Ruby::get_unchecked() };
916
+ match value.funcall::<_, _, Value>("to_s", ()) {
917
+ Ok(str_val) => {
918
+ if let Ok(s) = String::try_convert(str_val) {
919
+ Ok(s.len() + std::mem::size_of::<usize>() * 3)
920
+ } else {
921
+ // If to_s conversion fails, just use a reasonable default
922
+ Ok(8) // Reasonable size estimate for small values
923
+ }
924
+ }
925
+ Err(_) => {
926
+ // If to_s method fails, use a default size
927
+ Ok(8) // Reasonable size estimate for small values
928
+ }
929
+ }
930
+ }
931
+ }
932
+ PST::List(item_type) => {
933
+ if let Ok(arr) = RArray::try_convert(value) {
934
+ let len = arr.len();
935
+
936
+ // Base overhead for the array structure (pointer, length, capacity)
937
+ let base_size = std::mem::size_of::<usize>() * 3;
938
+
939
+ // If empty, just return the base size
940
+ if len == 0 {
941
+ return Ok(base_size);
942
+ }
943
+
944
+ // Sample up to 5 elements to get average element size
945
+ let sample_count = std::cmp::min(len, 5);
946
+ let mut total_sample_size = 0;
947
+
948
+ for i in 0..sample_count {
949
+ let element = arr.entry(i as isize)?;
950
+ let element_size = estimate_value_size(element, &item_type.item_type)?;
951
+ total_sample_size += element_size;
952
+ }
953
+
954
+ // If we couldn't sample any elements properly, that's an error
955
+ if sample_count > 0 && total_sample_size == 0 {
956
+ return Err(MagnusError::new(
957
+ magnus::exception::runtime_error(),
958
+ "Failed to estimate size of list elements",
959
+ ));
960
+ }
961
+
962
+ // Calculate average element size from samples
963
+ let avg_element_size = if sample_count > 0 {
964
+ total_sample_size as f64 / sample_count as f64
965
+ } else {
966
+ return Err(MagnusError::new(
967
+ magnus::exception::runtime_error(),
968
+ "Failed to sample list elements for size estimation",
969
+ ));
970
+ };
971
+
972
+ // Estimate total size based on average element size * length + base overhead
973
+ Ok(base_size + (avg_element_size as usize * len))
974
+ } else {
975
+ // Instead of assuming it's a small list, return an error
976
+ Err(MagnusError::new(
977
+ magnus::exception::runtime_error(),
978
+ format!("Expected array for List type but got: {:?}", value),
979
+ ))
980
+ }
981
+ }
982
+ PST::Map(map_field) => {
983
+ if let Ok(hash) = RHash::try_convert(value) {
984
+ let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
985
+
986
+ // Base overhead for the hash structure
987
+ let base_size = std::mem::size_of::<usize>() * 4;
988
+
989
+ // If empty, just return the base size
990
+ if size_estimate == 0 {
991
+ return Ok(base_size);
992
+ }
993
+
994
+ // Sample up to 5 key-value pairs to estimate average sizes
995
+ let mut key_sample_size = 0;
996
+ let mut value_sample_size = 0;
997
+ let mut sample_count = 0;
998
+
999
+ // Get an enumerator for the hash
1000
+ let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
1001
+
1002
+ // Sample up to 5 entries
1003
+ for _ in 0..std::cmp::min(size_estimate, 5) {
1004
+ match enumerator.funcall::<_, _, Value>("next", ()) {
1005
+ Ok(pair) => {
1006
+ if let Ok(pair_array) = RArray::try_convert(pair) {
1007
+ if pair_array.len() == 2 {
1008
+ let key = pair_array.entry(0)?;
1009
+ let val = pair_array.entry(1)?;
1010
+
1011
+ key_sample_size +=
1012
+ estimate_value_size(key, &map_field.key_type)?;
1013
+ value_sample_size +=
1014
+ estimate_value_size(val, &map_field.value_type)?;
1015
+ sample_count += 1;
1016
+ }
1017
+ }
1018
+ }
1019
+ Err(_) => break, // Stop if we reach the end
1020
+ }
1021
+ }
1022
+
1023
+ // If we couldn't sample any pairs, return an error
1024
+ if size_estimate > 0 && sample_count == 0 {
1025
+ return Err(MagnusError::new(
1026
+ magnus::exception::runtime_error(),
1027
+ "Failed to sample map entries for size estimation",
1028
+ ));
1029
+ }
1030
+
1031
+ // Calculate average key and value sizes
1032
+ let (avg_key_size, avg_value_size) = if sample_count > 0 {
1033
+ (
1034
+ key_sample_size as f64 / sample_count as f64,
1035
+ value_sample_size as f64 / sample_count as f64,
1036
+ )
1037
+ } else {
1038
+ return Err(MagnusError::new(
1039
+ magnus::exception::runtime_error(),
1040
+ "Failed to sample hash key-value pairs for size estimation",
1041
+ ));
1042
+ };
1043
+
1044
+ // Each entry has overhead (node pointers, etc.) in a hash map
1045
+ let entry_overhead = std::mem::size_of::<usize>() * 2;
1046
+
1047
+ // Estimate total size:
1048
+ // base size + (key_size + value_size + entry_overhead) * count
1049
+ Ok(base_size
1050
+ + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
1051
+ * size_estimate))
1052
+ } else {
1053
+ // Instead of assuming a small map, return an error
1054
+ Err(MagnusError::new(
1055
+ magnus::exception::runtime_error(),
1056
+ format!("Expected hash for Map type but got: {:?}", value),
1057
+ ))
1058
+ }
1059
+ }
1060
+ PST::Struct(struct_field) => {
1061
+ if let Ok(hash) = RHash::try_convert(value) {
1062
+ // Base overhead for the struct
1063
+ let base_size = std::mem::size_of::<usize>() * 3;
1064
+
1065
+ // Estimate size for each field
1066
+ let mut total_fields_size = 0;
1067
+
1068
+ for field in &struct_field.fields {
1069
+ // Try to get the field value from the hash
1070
+ match hash.get(Symbol::new(&field.name)) {
1071
+ Some(field_value) => {
1072
+ total_fields_size += estimate_value_size(field_value, &field.type_)?;
1073
+ }
1074
+ None => {
1075
+ if let Some(field_value) = hash.get(&*field.name) {
1076
+ total_fields_size +=
1077
+ estimate_value_size(field_value, &field.type_)?;
1078
+ } else {
1079
+ if field.nullable {
1080
+ total_fields_size += 0;
1081
+ } else {
1082
+ return Err(MagnusError::new(
1083
+ magnus::exception::runtime_error(),
1084
+ format!("Missing field: {} in hash {:?}", field.name, hash),
1085
+ ));
1086
+ }
1087
+ }
1088
+ }
1089
+ }
1090
+ }
1091
+
1092
+ // Missing nullable fields add nothing to the estimate; missing required fields error above
1093
+ Ok(base_size + total_fields_size)
1094
+ } else {
1095
+ // Instead of trying instance_variables or assuming a default, return an error
1096
+ Err(MagnusError::new(
1097
+ magnus::exception::runtime_error(),
1098
+ format!("Expected hash for Struct type but got: {:?}", value),
1099
+ ))
1100
+ }
1101
+ }
1102
+ }
1103
+ }
1104
+
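
As a worked example of the per-value rules above (assuming a 64-bit target, where size_of::<usize>() is 8): a row holding an Int64 and the String "hello" is estimated at 8 + (5 + 3 * 8) = 37 bytes. A standalone restatement of just the string rule:

    // String estimate = payload length + 3 usizes of String overhead (ptr/len/cap).
    fn toy_string_estimate(s: &str) -> usize {
        s.len() + std::mem::size_of::<usize>() * 3
    }

    fn main() {
        let row_estimate = 8 + toy_string_estimate("hello"); // Int64 column + String column
        assert_eq!(row_estimate, 37);                        // on a 64-bit target
    }
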
1105
+ // Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
1106
+ // and writes it to the Parquet file/output. This is a crucial function that bridges
1107
+ // between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
1108
+ //
1109
+ // The function:
1110
+ // 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
1111
+ // 2. Creates a RecordBatch from these arrays (column-oriented data format)
1112
+ // 3. Writes the batch to the ParquetWriter
1113
+ // 4. Flushes the writer if the accumulated memory exceeds the threshold
1114
+ //
1115
+ // This approach enables efficient batch-wise writing while controlling memory usage.
568
1116
  fn write_batch(
569
1117
  writer: &mut WriterOutput,
570
1118
  collectors: &mut [ColumnCollector],
571
1119
  flush_threshold: usize,
572
- ) -> Result<(), MagnusError> {
1120
+ ) -> Result<(), ReaderError> {
573
1121
  // Convert columns to Arrow arrays
574
1122
  let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
575
1123
  .iter_mut()
576
- .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
577
- .collect::<Result<_, MagnusError>>()?;
1124
+ .map(|c| {
1125
+ let arr = c.take_array()?;
1126
+ Ok((c.name.clone(), arr))
1127
+ })
1128
+ .collect::<Result<_, ReaderError>>()?;
578
1129
 
579
- // Create and write record batch
580
- let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
1130
+ let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
581
1131
  MagnusError::new(
582
1132
  magnus::exception::runtime_error(),
583
- format!("Failed to create record batch: {}", e),
1133
+ format!("Failed to create RecordBatch: {}", e),
584
1134
  )
585
1135
  })?;
586
1136
 
587
- writer
588
- .write(&record_batch)
589
- .map_err(|e| ParquetErrorWrapper(e))?;
1137
+ writer.write(&record_batch)?;
590
1138
 
1139
+ // Check if we need to flush based on memory usage thresholds
591
1140
  match writer {
592
1141
  WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
593
1142
  if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
594
- w.flush().map_err(|e| ParquetErrorWrapper(e))?;
1143
+ w.flush()?;
595
1144
  }
596
1145
  }
597
1146
  }
598
-
599
1147
  Ok(())
600
1148
  }
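
For reference, the (name, array) pairs assembled above feed straight into arrow's RecordBatch::try_from_iter. A minimal standalone sketch of that call with hand-built arrays standing in for the ColumnCollectors' output:

    use std::sync::Arc;
    use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray};

    fn main() {
        // One (name, array) pair per column, mirroring what write_batch receives
        // after the collectors convert their buffered Ruby values.
        let cols = vec![
            ("id", Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef),
            ("name", Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef),
        ];
        let batch = RecordBatch::try_from_iter(cols).expect("schema-consistent columns");
        assert_eq!(batch.num_rows(), 3);
        assert_eq!(batch.num_columns(), 2);
    }
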