parquet 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,33 @@
-use std::{
-    fs::File,
-    io::{self, BufReader, BufWriter},
-    sync::Arc,
-};
+mod write_columns;
+mod write_rows;
 
-use arrow_array::{Array, RecordBatch};
 use arrow_schema::{DataType, Schema, TimeUnit};
 use itertools::Itertools;
 use magnus::{
     scan_args::{get_kwargs, scan_args},
     value::ReprValue,
-    Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
+    Error as MagnusError, RArray, RHash, Ruby, Symbol, Value,
 };
 use parquet::{
     arrow::ArrowWriter,
     basic::{Compression, GzipLevel, ZstdLevel},
     file::properties::WriterProperties,
 };
-use rand::Rng;
+use std::{
+    fs::File,
+    io::{self, BufReader, BufWriter},
+    sync::Arc,
+};
 use tempfile::NamedTempFile;
+pub use write_columns::write_columns;
+pub use write_rows::write_rows;
 
+use crate::{types::PrimitiveType, SchemaNode};
 use crate::{
-    convert_ruby_array_to_arrow,
-    logger::RubyLogger,
-    reader::ReaderError,
-    types::{
-        schema_node::build_arrow_schema, // ADDED - we need to reference the DSL's build_arrow_schema
-        ColumnCollector,
-        ParquetSchemaType,
-        WriterOutput,
-    },
+    types::{ColumnCollector, ParquetGemError, ParquetSchemaType, WriterOutput},
     utils::parse_string_or_symbol,
     IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
 };
-use crate::{types::PrimitiveType, SchemaNode}; // ADDED - ensure we import SchemaNode
 
 const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
 const SAMPLE_SIZE: usize = 100;
@@ -41,6 +35,144 @@ const MIN_BATCH_SIZE: usize = 10;
 const INITIAL_BATCH_SIZE: usize = 100;
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
 
+/// Parse arguments for Parquet writing
+pub fn parse_parquet_write_args(
+    ruby: &Ruby,
+    args: &[Value],
+) -> Result<ParquetWriteArgs, MagnusError> {
+    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+    let (read_from,) = parsed_args.required;
+
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+            Option<Option<usize>>,
+            Option<Option<Value>>,
+        ),
+        (),
+    >(
+        parsed_args.keywords,
+        &["schema", "write_to"],
+        &[
+            "batch_size",
+            "flush_threshold",
+            "compression",
+            "sample_size",
+            "logger",
+        ],
+    )?;
+
+    // The schema value could be one of:
+    // 1. An array of hashes (legacy format)
+    // 2. A hash with type: :struct (new DSL format)
+    // 3. nil (infer from data)
+    let schema_value = kwargs.required.0;
+
+    // Check if it's the new DSL format (a hash with type: :struct)
+    // We need to handle both direct hash objects and objects created via Parquet::Schema.define
+
+    // First, try to convert it to a Hash if it's not already a Hash
+    // This handles the case where schema_value is a Schema object from Parquet::Schema.define
+    let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
+        RHash::from_value(schema_value).ok_or_else(|| {
+            MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
+        })?
+    } else {
+        // Try to convert the object to a hash with to_h
+        match schema_value.respond_to("to_h", false) {
+            Ok(true) => {
+                match schema_value.funcall::<_, _, Value>("to_h", ()) {
+                    Ok(hash_val) => match RHash::from_value(hash_val) {
+                        Some(hash) => hash,
+                        None => {
+                            // Not a hash, continue to normal handling
+                            RHash::new()
+                        }
+                    },
+                    Err(_) => {
+                        // couldn't call to_h, continue to normal handling
+                        RHash::new()
+                    }
+                }
+            }
+            _ => {
+                // Doesn't respond to to_h, continue to normal handling
+                RHash::new()
+            }
+        }
+    };
+
+    // Now check if it's a schema hash with a type: :struct field
+    let type_val = schema_hash.get(Symbol::new("type"));
+
+    if let Some(type_val) = type_val {
+        // If it has a type: :struct, it's the new DSL format
+        // Use parse_string_or_symbol to handle both String and Symbol values
+        let ttype = parse_string_or_symbol(&ruby, type_val)?;
+        if let Some(ref type_str) = ttype {
+            if type_str == "struct" {
+                // Parse using the new schema approach
+                let schema_node = crate::parse_schema_node(&ruby, schema_value)?;
+
+                validate_schema_node(&ruby, &schema_node)?;
+
+                return Ok(ParquetWriteArgs {
+                    read_from,
+                    write_to: kwargs.required.1,
+                    schema: schema_node,
+                    batch_size: kwargs.optional.0.flatten(),
+                    flush_threshold: kwargs.optional.1.flatten(),
+                    compression: kwargs.optional.2.flatten(),
+                    sample_size: kwargs.optional.3.flatten(),
+                    logger: kwargs.optional.4.flatten(),
+                });
+            }
+        }
+    }
+
+    // If it's not a hash with type: :struct, handle as legacy format
+    let schema_fields = if schema_value.is_nil()
+        || (schema_value.is_kind_of(ruby.class_array())
+            && RArray::from_value(schema_value)
+                .ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        "Schema fields must be an array",
+                    )
+                })?
+                .len()
+                == 0)
+    {
+        // If schema is nil or an empty array, we need to peek at the first value to determine column count
+        let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
+        // Default to nullable:true for auto-inferred fields
+        crate::infer_schema_from_first_row(&ruby, first_value, true)?
+    } else {
+        // Legacy array format - use our centralized parser
+        crate::parse_legacy_schema(&ruby, schema_value)?
+    };
+
+    // Convert the legacy schema fields to SchemaNode (DSL format)
+    let schema_node = crate::legacy_schema_to_dsl(&ruby, schema_fields)?;
+
+    validate_schema_node(&ruby, &schema_node)?;
+
+    Ok(ParquetWriteArgs {
+        read_from,
+        write_to: kwargs.required.1,
+        schema: schema_node,
+        batch_size: kwargs.optional.0.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
+        sample_size: kwargs.optional.3.flatten(),
+        logger: kwargs.optional.4.flatten(),
+    })
+}
+
 // -----------------------------------------------------------------------------
 // HELPER to invert arrow DataType back to our ParquetSchemaType
 // Converts Arrow DataType to our internal ParquetSchemaType representation.
@@ -49,22 +181,22 @@ const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
 // -----------------------------------------------------------------------------
 fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchemaType, MagnusError> {
     match dt {
-        DataType::Boolean => Ok(PST::Boolean),
-        DataType::Int8 => Ok(PST::Int8),
-        DataType::Int16 => Ok(PST::Int16),
-        DataType::Int32 => Ok(PST::Int32),
-        DataType::Int64 => Ok(PST::Int64),
-        DataType::UInt8 => Ok(PST::UInt8),
-        DataType::UInt16 => Ok(PST::UInt16),
-        DataType::UInt32 => Ok(PST::UInt32),
-        DataType::UInt64 => Ok(PST::UInt64),
+        DataType::Boolean => Ok(PST::Primitive(PrimitiveType::Boolean)),
+        DataType::Int8 => Ok(PST::Primitive(PrimitiveType::Int8)),
+        DataType::Int16 => Ok(PST::Primitive(PrimitiveType::Int16)),
+        DataType::Int32 => Ok(PST::Primitive(PrimitiveType::Int32)),
+        DataType::Int64 => Ok(PST::Primitive(PrimitiveType::Int64)),
+        DataType::UInt8 => Ok(PST::Primitive(PrimitiveType::UInt8)),
+        DataType::UInt16 => Ok(PST::Primitive(PrimitiveType::UInt16)),
+        DataType::UInt32 => Ok(PST::Primitive(PrimitiveType::UInt32)),
+        DataType::UInt64 => Ok(PST::Primitive(PrimitiveType::UInt64)),
         DataType::Float16 => {
             // We do not have a direct ParquetSchemaType::Float16, we treat it as Float
-            Ok(PST::Float)
+            Ok(PST::Primitive(PrimitiveType::Float32))
         }
-        DataType::Float32 => Ok(PST::Float),
-        DataType::Float64 => Ok(PST::Double),
-        DataType::Date32 => Ok(PST::Date32),
+        DataType::Float32 => Ok(PST::Primitive(PrimitiveType::Float32)),
+        DataType::Float64 => Ok(PST::Primitive(PrimitiveType::Float64)),
+        DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
         DataType::Date64 => {
             // Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
             // We can store it as PST::Date64 if we want. If we don't have that, consider PST::Date32 or an error.
@@ -78,10 +210,14 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
         DataType::Timestamp(TimeUnit::Second, _tz) => {
             // We'll treat this as PST::TimestampMillis, or define PST::TimestampSecond
             // For simplicity, let's map "second" to PST::TimestampMillis with a note:
-            Ok(PST::TimestampMillis)
+            Ok(PST::Primitive(PrimitiveType::TimestampMillis))
+        }
+        DataType::Timestamp(TimeUnit::Millisecond, _tz) => {
+            Ok(PST::Primitive(PrimitiveType::TimestampMillis))
+        }
+        DataType::Timestamp(TimeUnit::Microsecond, _tz) => {
+            Ok(PST::Primitive(PrimitiveType::TimestampMicros))
         }
-        DataType::Timestamp(TimeUnit::Millisecond, _tz) => Ok(PST::TimestampMillis),
-        DataType::Timestamp(TimeUnit::Microsecond, _tz) => Ok(PST::TimestampMicros),
         DataType::Timestamp(TimeUnit::Nanosecond, _tz) => {
             // If you have a PST::TimestampNanos variant, use it. Otherwise, degrade to micros
             // for demonstration:
@@ -90,13 +226,13 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
                 "TimestampNanos not supported, please adjust your schema or code.",
             ))
         }
-        DataType::Utf8 => Ok(PST::String),
-        DataType::Binary => Ok(PST::Binary),
+        DataType::Utf8 => Ok(PST::Primitive(PrimitiveType::String)),
+        DataType::Binary => Ok(PST::Primitive(PrimitiveType::Binary)),
         DataType::LargeUtf8 => {
             // If not supported, degrade or error. We'll degrade to PST::String
-            Ok(PST::String)
+            Ok(PST::Primitive(PrimitiveType::String))
         }
-        DataType::LargeBinary => Ok(PST::Binary),
+        DataType::LargeBinary => Ok(PST::Primitive(PrimitiveType::Binary)),
         DataType::List(child_field) => {
             // Recursively handle the item type
             let child_type = arrow_data_type_to_parquet_schema_type(child_field.data_type())?;
@@ -204,6 +340,7 @@ fn build_column_collectors_from_dsl<'a>(
 
         // Build the ColumnCollector
         collectors.push(ColumnCollector::new(
+            ruby,
             name,
             parquet_type,
             format,
@@ -226,142 +363,6 @@ fn extract_format_from_schema_node(node: &SchemaNode) -> Option<String> {
     }
 }
 
-/// Parse arguments for Parquet writing
-pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
-    let (read_from,) = parsed_args.required;
-
-    let kwargs = get_kwargs::<
-        _,
-        (Value, Value),
-        (
-            Option<Option<usize>>,
-            Option<Option<usize>>,
-            Option<Option<String>>,
-            Option<Option<usize>>,
-            Option<Option<Value>>,
-        ),
-        (),
-    >(
-        parsed_args.keywords,
-        &["schema", "write_to"],
-        &[
-            "batch_size",
-            "flush_threshold",
-            "compression",
-            "sample_size",
-            "logger",
-        ],
-    )?;
-
-    // The schema value could be one of:
-    // 1. An array of hashes (legacy format)
-    // 2. A hash with type: :struct (new DSL format)
-    // 3. nil (infer from data)
-    let schema_value = kwargs.required.0;
-
-    // Check if it's the new DSL format (a hash with type: :struct)
-    // We need to handle both direct hash objects and objects created via Parquet::Schema.define
-
-    // First, try to convert it to a Hash if it's not already a Hash
-    // This handles the case where schema_value is a Schema object from Parquet::Schema.define
-    let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
-        RHash::from_value(schema_value).ok_or_else(|| {
-            MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
-        })?
-    } else {
-        // Try to convert the object to a hash with to_h
-        match schema_value.respond_to("to_h", false) {
-            Ok(true) => {
-                match schema_value.funcall::<_, _, Value>("to_h", ()) {
-                    Ok(hash_val) => match RHash::from_value(hash_val) {
-                        Some(hash) => hash,
-                        None => {
-                            // Not a hash, continue to normal handling
-                            RHash::new()
-                        }
-                    },
-                    Err(_) => {
-                        // couldn't call to_h, continue to normal handling
-                        RHash::new()
-                    }
-                }
-            }
-            _ => {
-                // Doesn't respond to to_h, continue to normal handling
-                RHash::new()
-            }
-        }
-    };
-
-    // Now check if it's a schema hash with a type: :struct field
-    let type_val = schema_hash.get(Symbol::new("type"));
-
-    if let Some(type_val) = type_val {
-        // If it has a type: :struct, it's the new DSL format
-        // Use parse_string_or_symbol to handle both String and Symbol values
-        let ttype = parse_string_or_symbol(&ruby, type_val)?;
-        if let Some(ref type_str) = ttype {
-            if type_str == "struct" {
-                // Parse using the new schema approach
-                let schema_node = crate::parse_schema_node(&ruby, schema_value)?;
-
-                validate_schema_node(&ruby, &schema_node)?;
-
-                return Ok(ParquetWriteArgs {
-                    read_from,
-                    write_to: kwargs.required.1,
-                    schema: schema_node,
-                    batch_size: kwargs.optional.0.flatten(),
-                    flush_threshold: kwargs.optional.1.flatten(),
-                    compression: kwargs.optional.2.flatten(),
-                    sample_size: kwargs.optional.3.flatten(),
-                    logger: kwargs.optional.4.flatten(),
-                });
-            }
-        }
-    }
-
-    // If it's not a hash with type: :struct, handle as legacy format
-    let schema_fields = if schema_value.is_nil()
-        || (schema_value.is_kind_of(ruby.class_array())
-            && RArray::from_value(schema_value)
-                .ok_or_else(|| {
-                    MagnusError::new(
-                        magnus::exception::type_error(),
-                        "Schema fields must be an array",
-                    )
-                })?
-                .len()
-                == 0)
-    {
-        // If schema is nil or an empty array, we need to peek at the first value to determine column count
-        let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
-        // Default to nullable:true for auto-inferred fields
-        crate::infer_schema_from_first_row(&ruby, first_value, true)?
-    } else {
-        // Legacy array format - use our centralized parser
-        crate::parse_legacy_schema(&ruby, schema_value)?
-    };
-
-    // Convert the legacy schema fields to SchemaNode (DSL format)
-    let schema_node = crate::legacy_schema_to_dsl(&ruby, schema_fields)?;
-
-    validate_schema_node(&ruby, &schema_node)?;
-
-    Ok(ParquetWriteArgs {
-        read_from,
-        write_to: kwargs.required.1,
-        schema: schema_node,
-        batch_size: kwargs.optional.0.flatten(),
-        flush_threshold: kwargs.optional.1.flatten(),
-        compression: kwargs.optional.2.flatten(),
-        sample_size: kwargs.optional.3.flatten(),
-        logger: kwargs.optional.4.flatten(),
-    })
-}
-
 // Validates a SchemaNode to ensure it meets Parquet schema requirements
 // Currently checks for duplicate field names at the root level, which would
 // cause problems when writing Parquet files. Additional validation rules
@@ -396,394 +397,6 @@ fn validate_schema_node(ruby: &Ruby, schema_node: &SchemaNode) -> Result<(), Mag
     Ok(())
 }
 
-// Processes a single data row and adds values to the corresponding column collectors
-// This function is called for each row of input data when writing in row-wise mode.
-// It performs important validation to ensure the row structure matches the schema:
-// - Verifies that the number of columns in the row matches the schema
-// - Distributes each value to the appropriate ColumnCollector
-//
-// Each ColumnCollector handles type conversion and accumulation for its specific column,
-// allowing this function to focus on row-level validation and distribution.
-fn process_row(
-    ruby: &Ruby,
-    row: Value,
-    column_collectors: &mut [ColumnCollector],
-) -> Result<(), MagnusError> {
-    let row_array = RArray::from_value(row)
-        .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
-
-    // Validate row length matches schema
-    if row_array.len() != column_collectors.len() {
-        return Err(MagnusError::new(
-            magnus::exception::runtime_error(),
-            format!(
-                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                row_array.len(),
-                column_collectors.len(),
-                column_collectors
-                    .iter()
-                    .map(|c| c.name.as_str())
-                    .collect::<Vec<_>>()
-            ),
-        ));
-    }
-
-    // Process each value in the row
-    for (collector, value) in column_collectors.iter_mut().zip(row_array) {
-        collector.push_value(value)?;
-    }
-
-    Ok(())
-}
-
-// Dynamically calculates an optimal batch size based on estimated row sizes
-// and memory constraints. This function enables the writer to adapt to different
-// data characteristics for optimal performance.
-//
-// The algorithm:
-// 1. Requires a minimum number of samples to make a reliable estimate
-// 2. Calculates the average row size from the samples
-// 3. Determines a batch size that would consume approximately the target memory threshold
-// 4. Ensures the batch size doesn't go below a minimum value for efficiency
-//
-// This approach balances memory usage with processing efficiency by targeting
-// a specific memory footprint per batch.
-fn update_batch_size(
-    size_samples: &[usize],
-    flush_threshold: usize,
-    min_batch_size: usize,
-) -> usize {
-    if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
-        return min_batch_size;
-    }
-
-    let total_size = size_samples.iter().sum::<usize>();
-    // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
-    let avg_row_size = total_size as f64 / size_samples.len() as f64;
-    let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
-    let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
-    suggested_batch_size.max(min_batch_size)
-}
-
-#[inline]
-pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
-    write_rows_impl(args).map_err(|e| {
-        let z: MagnusError = e.into();
-        z
-    })?;
-    Ok(())
-}
-
-#[inline]
-fn write_rows_impl(args: &[Value]) -> Result<(), ReaderError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-
-    let ParquetWriteArgs {
-        read_from,
-        write_to,
-        schema,
-        batch_size: user_batch_size,
-        compression,
-        flush_threshold,
-        sample_size: user_sample_size,
-        logger,
-    } = parse_parquet_write_args(args)?;
-
-    let logger = RubyLogger::new(&ruby, logger)?;
-    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
-
-    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
-    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
-        MagnusError::new(
-            magnus::exception::runtime_error(),
-            format!("Failed to build Arrow schema from DSL schema: {}", e),
-        )
-    })?;
-
-    // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
-
-    if read_from.is_kind_of(ruby.class_enumerator()) {
-        // Build column collectors - we only have DSL schema now
-        let mut column_collectors =
-            build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;
-
-        let mut rows_in_batch = 0;
-        let mut total_rows = 0;
-        let mut rng = rand::rng();
-        let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
-        let mut size_samples = Vec::with_capacity(sample_size);
-        let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
-
-        loop {
-            match read_from.funcall::<_, _, Value>("next", ()) {
-                Ok(row) => {
-                    // Process the row
-                    process_row(&ruby, row, &mut column_collectors)?;
-
-                    // Update row sampling for dynamic batch sizing
-                    if size_samples.len() < sample_size {
-                        // estimate row size
-                        let row_array = RArray::from_value(row).ok_or_else(|| {
-                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
-                        })?;
-                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
-                        size_samples.push(row_size);
-                    } else if rng.random_range(0..=total_rows) < sample_size as usize {
-                        let idx = rng.random_range(0..sample_size as usize);
-                        let row_array = RArray::from_value(row).ok_or_else(|| {
-                            MagnusError::new(ruby.exception_type_error(), "Row must be an array")
-                        })?;
-                        let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
-                        size_samples[idx] = row_size;
-                    }
-
-                    rows_in_batch += 1;
-                    total_rows += 1;
-
-                    // Calculate batch size progressively once we have minimum samples
-                    if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
-                        current_batch_size =
-                            update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
-                    }
-
-                    // When we reach batch size, write the batch
-                    if rows_in_batch >= current_batch_size {
-                        write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
-                        rows_in_batch = 0;
-                    }
-                }
-                Err(e) => {
-                    if e.is_kind_of(ruby.exception_stop_iteration()) {
-                        // Write any remaining rows
-                        if rows_in_batch > 0 {
-                            write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
-                        }
-                        break;
-                    }
-                    return Err(e)?;
-                }
-            }
-        }
-    } else {
-        return Err(MagnusError::new(
-            magnus::exception::type_error(),
-            "read_from must be an Enumerator".to_string(),
-        ))?;
-    }
-
-    // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close()? {
-        // If we got a temp file back, we need to copy its contents to the IO-like object
-        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
-    }
-
-    Ok(())
-}
-
-#[inline]
-pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
-    write_columns_impl(args).map_err(|e| {
-        let z: MagnusError = e.into();
-        z
-    })?;
-    Ok(())
-}
-
-#[inline]
-fn write_columns_impl(args: &[Value]) -> Result<(), ReaderError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-
-    let ParquetWriteArgs {
-        read_from,
-        write_to,
-        schema,
-        batch_size: _,
-        compression,
-        flush_threshold,
-        sample_size: _,
-        logger,
-    } = parse_parquet_write_args(args)?;
-
-    let logger = RubyLogger::new(&ruby, logger)?;
-    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
-
-    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
-    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
-        MagnusError::new(
-            magnus::exception::runtime_error(),
-            format!("Failed to build Arrow schema from DSL schema: {}", e),
-        )
-    })?;
-
-    // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
-
-    if read_from.is_kind_of(ruby.class_enumerator()) {
-        loop {
-            match read_from.funcall::<_, _, Value>("next", ()) {
-                Ok(batch) => {
-                    let batch_array = RArray::from_value(batch).ok_or_else(|| {
-                        MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
-                    })?;
-
-                    // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
-                    batch_array.entry::<RArray>(0).map_err(|_| {
-                        MagnusError::new(
-                            ruby.exception_type_error(),
-                            "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
-                        )
-                    })?;
-
-                    // Validate batch length matches schema
-                    // Get schema length and field names - we only have DSL schema now
-                    let (schema_len, field_names): (usize, Vec<&str>) = {
-                        let fields = match &schema {
-                            SchemaNode::Struct { fields, .. } => fields,
-                            _ => {
-                                return Err(MagnusError::new(
-                                    magnus::exception::type_error(),
-                                    "Root schema node must be a struct type",
-                                ))?
-                            }
-                        };
-                        (
-                            fields.len(),
-                            fields
-                                .iter()
-                                .map(|f| match f {
-                                    SchemaNode::Primitive { name, .. } => name.as_str(),
-                                    SchemaNode::List { name, .. } => name.as_str(),
-                                    SchemaNode::Map { name, .. } => name.as_str(),
-                                    SchemaNode::Struct { name, .. } => name.as_str(),
-                                })
-                                .to_owned()
-                                .collect(),
-                        )
-                    };
-
-                    if batch_array.len() != schema_len {
-                        return Err(MagnusError::new(
-                            magnus::exception::type_error(),
-                            format!(
-                                "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                                batch_array.len(),
-                                schema_len,
-                                field_names
-                            ),
-                        ))?;
-                    }
-
-                    // Convert each column in the batch to Arrow arrays
-                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
-                        // Process each field in the DSL schema
-                        let fields = arrow_schema.fields();
-                        let top_fields =
-                            match &schema {
-                                SchemaNode::Struct { fields, .. } => fields,
-                                _ => return Err(MagnusError::new(
-                                    magnus::exception::runtime_error(),
-                                    "Top-level DSL schema must be a struct for columns approach",
-                                ))?,
-                            };
-                        if top_fields.len() != fields.len() {
-                            return Err(MagnusError::new(
-                                magnus::exception::runtime_error(),
-                                "Mismatch top-level DSL fields vs Arrow fields",
-                            ))?;
-                        }
-
-                        let mut out = vec![];
-                        for ((arrow_f, dsl_f), col_val) in
-                            fields.iter().zip(top_fields.iter()).zip(batch_array)
-                        {
-                            let col_arr = RArray::from_value(col_val).ok_or_else(|| {
-                                MagnusError::new(
-                                    magnus::exception::type_error(),
-                                    format!("Column '{}' must be an array", arrow_f.name()),
-                                )
-                            })?;
-                            // Get appropriate parquet_type
-                            let ptype = match dsl_f {
-                                SchemaNode::Primitive {
-                                    parquet_type,
-                                    // Format is handled internally now
-                                    ..
-                                } => match parquet_type {
-                                    &PrimitiveType::Int8 => PST::Int8,
-                                    &PrimitiveType::Int16 => PST::Int16,
-                                    &PrimitiveType::Int32 => PST::Int32,
-                                    &PrimitiveType::Int64 => PST::Int64,
-                                    &PrimitiveType::UInt8 => PST::UInt8,
-                                    &PrimitiveType::UInt16 => PST::UInt16,
-                                    &PrimitiveType::UInt32 => PST::UInt32,
-                                    &PrimitiveType::UInt64 => PST::UInt64,
-                                    &PrimitiveType::Float32 => PST::Float,
-                                    &PrimitiveType::Float64 => PST::Double,
-                                    &PrimitiveType::String => PST::String,
-                                    &PrimitiveType::Binary => PST::Binary,
-                                    &PrimitiveType::Boolean => PST::Boolean,
-                                    &PrimitiveType::Date32 => PST::Date32,
-                                    &PrimitiveType::TimestampMillis => PST::TimestampMillis,
-                                    &PrimitiveType::TimestampMicros => PST::TimestampMicros,
-                                },
-                                SchemaNode::List { .. }
-                                | SchemaNode::Map { .. }
-                                | SchemaNode::Struct { .. } => {
-                                    // For nested, we just do a single "column" as well
-                                    arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
-                                }
-                            };
-                            out.push((
-                                arrow_f.name().clone(),
-                                convert_ruby_array_to_arrow(col_arr, &ptype)?,
-                            ));
-                        }
-                        out
-                    };
-
-                    // Create and write record batch
-                    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
-                        MagnusError::new(
-                            magnus::exception::runtime_error(),
-                            format!("Failed to create record batch: {}", e),
-                        )
-                    })?;
-
-                    writer.write(&record_batch)?;
-
-                    match &mut writer {
-                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-                            if w.in_progress_size() >= flush_threshold {
-                                w.flush()?;
-                            }
-                        }
-                    }
-                }
-                Err(e) => {
-                    if e.is_kind_of(ruby.exception_stop_iteration()) {
-                        break;
-                    }
-                    return Err(e)?;
-                }
-            }
-        }
-    } else {
-        return Err(MagnusError::new(
-            magnus::exception::type_error(),
-            "read_from must be an Enumerator".to_string(),
-        ))?;
-    }
-
-    // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close()? {
-        // If we got a temp file back, we need to copy its contents to the IO-like object
-        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
-    }
-
-    Ok(())
-}
-
 // Creates an appropriate Parquet writer based on the output target and compression settings
 // This function handles two main output scenarios:
 // 1. Writing directly to a file path (string)
@@ -800,7 +413,7 @@ fn create_writer(
     write_to: &Value,
     schema: Arc<Schema>,
     compression: Option<String>,
-) -> Result<WriterOutput, ReaderError> {
+) -> Result<WriterOutput, ParquetGemError> {
     // Create writer properties with compression based on the option
     let props = WriterProperties::builder()
         .set_compression(match compression.as_deref() {
@@ -870,279 +483,3 @@ fn copy_temp_file_to_io_like(
 
     Ok(())
 }
-
-// Estimates the memory size of a single row by examining each value
-// This is used for dynamic batch sizing to optimize memory usage during writes
-// by adapting batch sizes based on the actual data being processed.
-pub fn estimate_single_row_size(
-    row_array: &RArray,
-    collectors: &[ColumnCollector],
-) -> Result<usize, MagnusError> {
-    let mut size = 0;
-    for (idx, val) in row_array.into_iter().enumerate() {
-        let col_type = &collectors[idx].type_;
-        // Calculate size based on the type-specific estimation
-        size += estimate_value_size(val, col_type)?;
-    }
-    Ok(size)
-}
-
-// Estimates the memory footprint of a single value based on its schema type
-// This provides type-specific size estimates that help with dynamic batch sizing
-// For complex types like lists, maps, and structs, we use reasonable approximations
-pub fn estimate_value_size(
-    value: Value,
-    schema_type: &ParquetSchemaType,
-) -> Result<usize, MagnusError> {
-    use ParquetSchemaType as PST;
-    if value.is_nil() {
-        return Ok(0); // nil => minimal
-    }
-    match schema_type {
-        PST::Int8 | PST::UInt8 => Ok(1),
-        PST::Int16 | PST::UInt16 => Ok(2),
-        PST::Int32 | PST::UInt32 | PST::Float => Ok(4),
-        PST::Int64 | PST::UInt64 | PST::Double => Ok(8),
-        PST::Boolean => Ok(1),
-        PST::Date32 | PST::TimestampMillis | PST::TimestampMicros => Ok(8),
-        PST::String | PST::Binary => {
-            if let Ok(s) = String::try_convert(value) {
-                // Account for string length plus Rust String's capacity+pointer overhead
-                Ok(s.len() + std::mem::size_of::<usize>() * 3)
-            } else {
-                // Try to convert the value to a string using to_s for non-string types
-                // This handles numeric values that will be converted to strings later
-                let _ruby = unsafe { Ruby::get_unchecked() };
-                match value.funcall::<_, _, Value>("to_s", ()) {
-                    Ok(str_val) => {
-                        if let Ok(s) = String::try_convert(str_val) {
-                            Ok(s.len() + std::mem::size_of::<usize>() * 3)
-                        } else {
-                            // If to_s conversion fails, just use a reasonable default
-                            Ok(8) // Reasonable size estimate for small values
-                        }
-                    }
-                    Err(_) => {
-                        // If to_s method fails, use a default size
-                        Ok(8) // Reasonable size estimate for small values
-                    }
-                }
-            }
-        }
-        PST::List(item_type) => {
-            if let Ok(arr) = RArray::try_convert(value) {
-                let len = arr.len();
-
-                // Base overhead for the array structure (pointer, length, capacity)
-                let base_size = std::mem::size_of::<usize>() * 3;
-
-                // If empty, just return the base size
-                if len == 0 {
-                    return Ok(base_size);
-                }
-
-                // Sample up to 5 elements to get average element size
-                let sample_count = std::cmp::min(len, 5);
-                let mut total_sample_size = 0;
-
-                for i in 0..sample_count {
-                    let element = arr.entry(i as isize)?;
-                    let element_size = estimate_value_size(element, &item_type.item_type)?;
-                    total_sample_size += element_size;
-                }
-
-                // If we couldn't sample any elements properly, that's an error
-                if sample_count > 0 && total_sample_size == 0 {
-                    return Err(MagnusError::new(
-                        magnus::exception::runtime_error(),
-                        "Failed to estimate size of list elements",
-                    ));
-                }
-
-                // Calculate average element size from samples
-                let avg_element_size = if sample_count > 0 {
-                    total_sample_size as f64 / sample_count as f64
-                } else {
-                    return Err(MagnusError::new(
-                        magnus::exception::runtime_error(),
-                        "Failed to sample list elements for size estimation",
-                    ));
-                };
-
-                // Estimate total size based on average element size * length + base overhead
-                Ok(base_size + (avg_element_size as usize * len))
-            } else {
-                // Instead of assuming it's a small list, return an error
-                Err(MagnusError::new(
-                    magnus::exception::runtime_error(),
-                    format!("Expected array for List type but got: {:?}", value),
-                ))
-            }
-        }
-        PST::Map(map_field) => {
-            if let Ok(hash) = RHash::try_convert(value) {
-                let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
-
-                // Base overhead for the hash structure
-                let base_size = std::mem::size_of::<usize>() * 4;
-
-                // If empty, just return the base size
-                if size_estimate == 0 {
-                    return Ok(base_size);
-                }
-
-                // Sample up to 5 key-value pairs to estimate average sizes
-                let mut key_sample_size = 0;
-                let mut value_sample_size = 0;
-                let mut sample_count = 0;
-
-                // Get an enumerator for the hash
-                let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
-
-                // Sample up to 5 entries
-                for _ in 0..std::cmp::min(size_estimate, 5) {
-                    match enumerator.funcall::<_, _, Value>("next", ()) {
-                        Ok(pair) => {
-                            if let Ok(pair_array) = RArray::try_convert(pair) {
-                                if pair_array.len() == 2 {
-                                    let key = pair_array.entry(0)?;
-                                    let val = pair_array.entry(1)?;
-
-                                    key_sample_size +=
-                                        estimate_value_size(key, &map_field.key_type)?;
-                                    value_sample_size +=
-                                        estimate_value_size(val, &map_field.value_type)?;
-                                    sample_count += 1;
-                                }
-                            }
-                        }
-                        Err(_) => break, // Stop if we reach the end
-                    }
-                }
-
-                // If we couldn't sample any pairs, return an error
-                if size_estimate > 0 && sample_count == 0 {
-                    return Err(MagnusError::new(
-                        magnus::exception::runtime_error(),
-                        "Failed to sample map entries for size estimation",
-                    ));
-                }
-
-                // Calculate average key and value sizes
-                let (avg_key_size, avg_value_size) = if sample_count > 0 {
-                    (
-                        key_sample_size as f64 / sample_count as f64,
-                        value_sample_size as f64 / sample_count as f64,
-                    )
-                } else {
-                    return Err(MagnusError::new(
-                        magnus::exception::runtime_error(),
-                        "Failed to sample hash key-value pairs for size estimation",
-                    ));
-                };
-
-                // Each entry has overhead (node pointers, etc.) in a hash map
-                let entry_overhead = std::mem::size_of::<usize>() * 2;
-
-                // Estimate total size:
-                // base size + (key_size + value_size + entry_overhead) * count
-                Ok(base_size
-                    + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
-                        * size_estimate))
-            } else {
-                // Instead of assuming a small map, return an error
-                Err(MagnusError::new(
-                    magnus::exception::runtime_error(),
-                    format!("Expected hash for Map type but got: {:?}", value),
-                ))
-            }
-        }
-        PST::Struct(struct_field) => {
-            if let Ok(hash) = RHash::try_convert(value) {
-                // Base overhead for the struct
-                let base_size = std::mem::size_of::<usize>() * 3;
-
-                // Estimate size for each field
-                let mut total_fields_size = 0;
-
-                for field in &struct_field.fields {
-                    // Try to get the field value from the hash
-                    match hash.get(Symbol::new(&field.name)) {
-                        Some(field_value) => {
-                            total_fields_size += estimate_value_size(field_value, &field.type_)?;
-                        }
-                        None => {
-                            if let Some(field_value) = hash.get(&*field.name) {
-                                total_fields_size +=
-                                    estimate_value_size(field_value, &field.type_)?;
-                            } else {
-                                if field.nullable {
-                                    total_fields_size += 0;
-                                } else {
-                                    return Err(MagnusError::new(
-                                        magnus::exception::runtime_error(),
-                                        format!("Missing field: {} in hash {:?}", field.name, hash),
-                                    ));
-                                }
-                            }
-                        }
-                    }
-                }
-
-                // We no longer error on missing fields during size estimation
-                Ok(base_size + total_fields_size)
-            } else {
-                // Instead of trying instance_variables or assuming a default, return an error
-                Err(MagnusError::new(
-                    magnus::exception::runtime_error(),
-                    format!("Expected hash for Struct type but got: {:?}", value),
-                ))
-            }
-        }
-    }
-}
-
-// Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
-// and writes it to the Parquet file/output. This is a crucial function that bridges
-// between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
-//
-// The function:
-// 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
-// 2. Creates a RecordBatch from these arrays (column-oriented data format)
-// 3. Writes the batch to the ParquetWriter
-// 4. Flushes the writer if the accumulated memory exceeds the threshold
-//
-// This approach enables efficient batch-wise writing while controlling memory usage.
-fn write_batch(
-    writer: &mut WriterOutput,
-    collectors: &mut [ColumnCollector],
-    flush_threshold: usize,
-) -> Result<(), ReaderError> {
-    // Convert columns to Arrow arrays
-    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
-        .iter_mut()
-        .map(|c| {
-            let arr = c.take_array()?;
-            Ok((c.name.clone(), arr))
-        })
-        .collect::<Result<_, ReaderError>>()?;
-
-    let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
-        MagnusError::new(
-            magnus::exception::runtime_error(),
-            format!("Failed to create RecordBatch: {}", e),
-        )
-    })?;
-
-    writer.write(&record_batch)?;
-
-    // Check if we need to flush based on memory usage thresholds
-    match writer {
-        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-            if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
-                w.flush()?;
-            }
-        }
-    }
-    Ok(())
-}