parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,625 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{Enumerator, Error as MagnusError, RArray, Ruby, TryConvert, Value};
3
+ use parquet_core::writer::WriterBuilder;
4
+ use parquet_core::Schema;
5
+ use std::io::{BufReader, BufWriter, Write};
6
+ use tempfile::NamedTempFile;
7
+
8
+ use crate::io::RubyIOWriter;
9
+ use crate::types::WriterOutput;
10
+ use crate::utils::parse_compression;
11
+
12
/// Slice length passed to `each_slice` when rows are streamed from an
/// Enumerable and the caller did not supply `batch_size`.
///
/// NOTE(review): intended to equal the core writer's default batch size so the
/// Ruby-side slice and the core row buffer stay comparable in size. That
/// default lives outside this file; confirm the two stay in sync.
const DEFAULT_ROW_SLICE_SIZE: usize = 1_000;
16
+
17
/// Batch-sizing knobs supplied by the caller of the write API.
///
/// The adapter does not interpret these values itself: every field that is
/// `Some` is handed to the core `WriterBuilder`, and a `None` leaves the
/// builder's own default untouched.
#[derive(Clone, Copy, Debug, Default)]
pub struct BatchSizingOptions {
    /// Forwarded to `WriterBuilder::with_batch_size` when set.
    pub batch_size: Option<usize>,
    /// Forwarded to `WriterBuilder::with_memory_threshold` when set.
    pub flush_threshold: Option<usize>,
    /// Forwarded to `WriterBuilder::with_sample_size` when set.
    pub sample_size: Option<usize>,
}
25
+
26
+ /// Create a writer based on the output type (file path or IO object), forwarding
27
+ /// the batch-sizing options to the core writer (the single source of truth).
28
+ pub fn create_writer(
29
+ ruby: &Ruby,
30
+ write_to: Value,
31
+ schema: Schema,
32
+ compression: Option<String>,
33
+ options: BatchSizingOptions,
34
+ ) -> Result<WriterOutput, MagnusError> {
35
+ let mut builder = WriterBuilder::new().with_compression(parse_compression(ruby, compression)?);
36
+ if let Some(size) = options.batch_size {
37
+ builder = builder.with_batch_size(size);
38
+ }
39
+ if let Some(threshold) = options.flush_threshold {
40
+ builder = builder.with_memory_threshold(threshold);
41
+ }
42
+ if let Some(size) = options.sample_size {
43
+ builder = builder.with_sample_size(size);
44
+ }
45
+
46
+ if write_to.is_kind_of(ruby.class_string()) {
47
+ // Direct file path
48
+ let path_str: String = TryConvert::try_convert(write_to)?;
49
+ let file = std::fs::File::create(&path_str)
50
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
51
+ let writer = builder
52
+ .build(file, schema)
53
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
54
+ Ok(WriterOutput::File(writer))
55
+ } else {
56
+ // IO-like object - create temporary file
57
+ let temp_file = NamedTempFile::new().map_err(|e| {
58
+ MagnusError::new(
59
+ ruby.exception_runtime_error(),
60
+ format!("Failed to create temporary file: {}", e),
61
+ )
62
+ })?;
63
+
64
+ // Clone the file handle for the writer
65
+ let file = temp_file.reopen().map_err(|e| {
66
+ MagnusError::new(
67
+ ruby.exception_runtime_error(),
68
+ format!("Failed to reopen temporary file: {}", e),
69
+ )
70
+ })?;
71
+
72
+ let writer = builder
73
+ .build(file, schema)
74
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
75
+
76
+ Ok(WriterOutput::TempFile(writer, temp_file, write_to))
77
+ }
78
+ }
79
+
80
+ /// Finalize the writer and copy temp file to IO if needed
81
+ pub fn finalize_writer(ruby: &Ruby, writer_output: WriterOutput) -> Result<(), MagnusError> {
82
+ match writer_output {
83
+ WriterOutput::File(writer) => writer
84
+ .close()
85
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string())),
86
+ WriterOutput::TempFile(writer, temp_file, io_object) => {
87
+ // Close the writer first
88
+ writer
89
+ .close()
90
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
91
+
92
+ // Copy temp file to IO object
93
+ copy_temp_file_to_io(ruby, temp_file, io_object)
94
+ }
95
+ }
96
+ }
97
+
98
+ /// Copy temporary file contents to Ruby IO object
99
+ fn copy_temp_file_to_io(
100
+ ruby: &Ruby,
101
+ temp_file: NamedTempFile,
102
+ io_object: Value,
103
+ ) -> Result<(), MagnusError> {
104
+ let file = temp_file.reopen().map_err(|e| {
105
+ MagnusError::new(
106
+ ruby.exception_runtime_error(),
107
+ format!("Failed to reopen temporary file: {}", e),
108
+ )
109
+ })?;
110
+
111
+ let mut buf_reader = BufReader::new(file);
112
+ let ruby_io_writer = RubyIOWriter::new(io_object);
113
+ let mut buf_writer = BufWriter::new(ruby_io_writer);
114
+
115
+ std::io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
116
+ MagnusError::new(
117
+ ruby.exception_runtime_error(),
118
+ format!("Failed to copy temp file to IO object: {}", e),
119
+ )
120
+ })?;
121
+
122
+ buf_writer.flush().map_err(|e| {
123
+ MagnusError::new(
124
+ ruby.exception_runtime_error(),
125
+ format!("Failed to flush IO object: {}", e),
126
+ )
127
+ })?;
128
+
129
+ // The temporary file will be automatically deleted when temp_file is dropped
130
+ Ok(())
131
+ }
132
+
133
+ /// The rows to write, either already materialized or pulled lazily from an
134
+ /// Enumerable in bounded slices so the whole input is never resident at once.
135
+ enum RowSource {
136
+ Materialized(RArray),
137
+ Streamed {
138
+ first_slice: Option<RArray>,
139
+ remaining: Enumerator,
140
+ },
141
+ }
142
+
143
+ /// Classify `read_from` without draining it. Arrays are used as-is; anything
144
+ /// else that supports `each_slice` (Enumerator, any Enumerable) is streamed in
145
+ /// bounded slices; objects that only support `to_a` keep the legacy
146
+ /// materialize-first behavior.
147
+ fn row_source(
148
+ ruby: &Ruby,
149
+ read_from: Value,
150
+ batch_size: Option<usize>,
151
+ ) -> Result<RowSource, MagnusError> {
152
+ if read_from.is_kind_of(ruby.class_array()) {
153
+ return Ok(RowSource::Materialized(TryConvert::try_convert(read_from)?));
154
+ }
155
+
156
+ if read_from.respond_to("each_slice", false)? {
157
+ // One fiber switch per slice keeps Enumerator overhead negligible
158
+ // while bounding how many Ruby rows are in flight at once.
159
+ let slice_size = batch_size.unwrap_or(DEFAULT_ROW_SLICE_SIZE);
160
+ let mut remaining = read_from.enumeratorize("each_slice", (slice_size,));
161
+ let first_slice = match remaining.next() {
162
+ Some(slice) => Some(TryConvert::try_convert(slice?)?),
163
+ None => None,
164
+ };
165
+ return Ok(RowSource::Streamed {
166
+ first_slice,
167
+ remaining,
168
+ });
169
+ }
170
+
171
+ if read_from.respond_to("to_a", false)? {
172
+ let array_value: Value = read_from.funcall("to_a", ())?;
173
+ return Ok(RowSource::Materialized(TryConvert::try_convert(
174
+ array_value,
175
+ )?));
176
+ }
177
+
178
+ Err(MagnusError::new(
179
+ ruby.exception_type_error(),
180
+ "data must be an array or respond to 'to_a'",
181
+ ))
182
+ }
183
+
184
+ /// Map a value-conversion failure to the Ruby exception the write API raises:
185
+ /// encoding problems surface as EncodingError, everything else RuntimeError.
186
+ fn conversion_error(ruby: &Ruby, error_msg: String) -> MagnusError {
187
+ if error_msg.contains("EncodingError") || error_msg.contains("invalid utf-8") {
188
+ // Extract the actual encoding error message
189
+ if let Some(pos) = error_msg.find("EncodingError: ") {
190
+ let encoding_msg = error_msg[pos + 15..].to_string();
191
+ MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
192
+ } else {
193
+ MagnusError::new(ruby.exception_encoding_error(), error_msg)
194
+ }
195
+ } else {
196
+ MagnusError::new(ruby.exception_runtime_error(), error_msg)
197
+ }
198
+ }
199
+
200
+ /// Convert one slice of Ruby rows and write them through the core writer.
201
+ /// Returns the number of rows written.
202
+ ///
203
+ /// `release_consumed` should be true only when `rows` is an array this module
204
+ /// created itself (an `each_slice` slice): each element is dropped from the
205
+ /// slice once converted, so large rows become collectable while the rest of
206
+ /// the slice is still being written. User-provided arrays must not be mutated.
207
+ fn write_row_slice(
208
+ ruby: &Ruby,
209
+ writer_output: &mut WriterOutput,
210
+ converter: &mut crate::converter::RubyValueConverter,
211
+ field_schemas: &[parquet_core::SchemaNode],
212
+ rows: RArray,
213
+ release_consumed: bool,
214
+ ) -> Result<u64, MagnusError> {
215
+ let mut rows_written = 0u64;
216
+
217
+ for (row_idx, row_value) in rows.into_iter().enumerate() {
218
+ // Convert Ruby row to ParquetValue vector
219
+ let row = if row_value.is_kind_of(ruby.class_array()) {
220
+ let array: RArray = TryConvert::try_convert(row_value)?;
221
+ let mut values = Vec::with_capacity(array.len());
222
+
223
+ for (idx, item) in array.into_iter().enumerate() {
224
+ let schema_hint = field_schemas.get(idx);
225
+ let pq_value = converter
226
+ .to_parquet_with_schema_hint(item, schema_hint)
227
+ .map_err(|e| conversion_error(ruby, e.to_string()))?;
228
+ values.push(pq_value);
229
+ }
230
+ values
231
+ } else {
232
+ return Err(MagnusError::new(
233
+ ruby.exception_type_error(),
234
+ "each row must be an array",
235
+ ));
236
+ };
237
+
238
+ if release_consumed {
239
+ rows.store(row_idx as isize, ruby.qnil())?;
240
+ }
241
+
242
+ match writer_output {
243
+ WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
244
+ writer
245
+ .write_row(row)
246
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
247
+ }
248
+ }
249
+ rows_written += 1;
250
+ }
251
+
252
+ Ok(rows_written)
253
+ }
254
+
255
+ /// Write data in row format to a parquet file
256
+ pub fn write_rows(
257
+ ruby: &Ruby,
258
+ write_args: crate::types::ParquetWriteArgs,
259
+ ) -> Result<Value, MagnusError> {
260
+ use crate::converter::RubyValueConverter;
261
+ use crate::logger::RubyLogger;
262
+ use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
263
+ use crate::string_cache::StringCache;
264
+
265
+ // Read rows lazily where possible: draining an Enumerator up front would
266
+ // hold the entire dataset in memory for the duration of the write.
267
+ let source = row_source(ruby, write_args.read_from, write_args.batch_size)?;
268
+
269
+ // Schema inference (used when `schema` is nil/empty) only inspects the
270
+ // first row, so the first slice is enough.
271
+ let empty_rows;
272
+ let inference_rows = match &source {
273
+ RowSource::Materialized(rows) => rows,
274
+ RowSource::Streamed {
275
+ first_slice: Some(rows),
276
+ ..
277
+ } => rows,
278
+ RowSource::Streamed {
279
+ first_slice: None, ..
280
+ } => {
281
+ empty_rows = ruby.ary_new();
282
+ &empty_rows
283
+ }
284
+ };
285
+
286
+ // Process schema value
287
+ let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(inference_rows))
288
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
289
+
290
+ // Create schema
291
+ let schema = ruby_schema_to_parquet(schema_hash)
292
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
293
+
294
+ // Extract field schemas for conversion hints
295
+ let field_schemas = extract_field_schemas(&schema);
296
+
297
+ // Create writer. All batch sizing and flushing is owned by the core writer;
298
+ // the user's batch_size/flush_threshold/sample_size are forwarded to it.
299
+ let mut writer_output = create_writer(
300
+ ruby,
301
+ write_args.write_to,
302
+ schema.clone(),
303
+ write_args.compression,
304
+ BatchSizingOptions {
305
+ batch_size: write_args.batch_size,
306
+ flush_threshold: write_args.flush_threshold,
307
+ sample_size: write_args.sample_size,
308
+ },
309
+ )?;
310
+
311
+ // Create logger
312
+ let logger = RubyLogger::new(write_args.logger)?;
313
+ let _ = logger.info(|| "Starting to write parquet file".to_string());
314
+
315
+ // Create converter with string cache if enabled. `string_cache` is the
316
+ // requested capacity (None = disabled).
317
+ let mut converter = if let Some(capacity) = write_args.string_cache {
318
+ let _ = logger.debug(|| format!("String cache enabled (capacity {})", capacity));
319
+ RubyValueConverter::with_string_cache(StringCache::new(capacity))
320
+ } else {
321
+ RubyValueConverter::new()
322
+ };
323
+
324
+ // Stream each row to the core writer, which buffers and flushes internally
325
+ // according to its (now sole) batch-sizing policy. Completed row groups
326
+ // reach the destination while later rows are still being produced.
327
+ let mut total_rows = 0u64;
328
+
329
+ match source {
330
+ RowSource::Materialized(rows) => {
331
+ total_rows += write_row_slice(
332
+ ruby,
333
+ &mut writer_output,
334
+ &mut converter,
335
+ &field_schemas,
336
+ rows,
337
+ false,
338
+ )?;
339
+ }
340
+ RowSource::Streamed {
341
+ first_slice,
342
+ remaining,
343
+ } => {
344
+ if let Some(rows) = first_slice {
345
+ total_rows += write_row_slice(
346
+ ruby,
347
+ &mut writer_output,
348
+ &mut converter,
349
+ &field_schemas,
350
+ rows,
351
+ true,
352
+ )?;
353
+ }
354
+ for slice in remaining {
355
+ let rows: RArray = TryConvert::try_convert(slice?)?;
356
+ total_rows += write_row_slice(
357
+ ruby,
358
+ &mut writer_output,
359
+ &mut converter,
360
+ &field_schemas,
361
+ rows,
362
+ true,
363
+ )?;
364
+ }
365
+ }
366
+ }
367
+
368
+ // The core writer flushes any remaining buffered rows when closed by
369
+ // finalize_writer below.
370
+ let _ = logger.info(|| format!("Finished writing {} rows to parquet file", total_rows));
371
+
372
+ // Log string cache statistics if enabled. `misses` is exact even after the
373
+ // bounded cache fills; exact distinct cardinality would require an unbounded
374
+ // side table, so the log labels it as misses rather than unique strings.
375
+ if let Some(stats) = converter.string_cache_stats() {
376
+ let _ = logger.info(|| {
377
+ format!(
378
+ "String cache stats: {} cache misses, {} hits ({:.1}% hit rate)",
379
+ stats.misses,
380
+ stats.hits,
381
+ stats.hit_rate * 100.0
382
+ )
383
+ });
384
+ }
385
+
386
+ // Finalize the writer
387
+ finalize_writer(ruby, writer_output)?;
388
+
389
+ Ok(ruby.qnil().as_value())
390
+ }
391
+
392
+ /// The column batches to write, either already materialized or pulled lazily
393
+ /// from an Enumerable one batch at a time.
394
+ enum BatchSource {
395
+ Materialized(RArray),
396
+ Streamed {
397
+ first_batch: Option<Value>,
398
+ remaining: Enumerator,
399
+ },
400
+ }
401
+
402
+ /// Classify `read_from` without draining it, mirroring `row_source`. Batches
403
+ /// are chunky (whole columns), so streamed sources are pulled one batch per
404
+ /// fiber switch via `each`.
405
+ fn batch_source(ruby: &Ruby, read_from: Value) -> Result<BatchSource, MagnusError> {
406
+ if read_from.is_kind_of(ruby.class_array()) {
407
+ return Ok(BatchSource::Materialized(TryConvert::try_convert(
408
+ read_from,
409
+ )?));
410
+ }
411
+
412
+ if read_from.respond_to("each", false)? {
413
+ let mut remaining = read_from.enumeratorize("each", ());
414
+ let first_batch = remaining.next().transpose()?;
415
+ return Ok(BatchSource::Streamed {
416
+ first_batch,
417
+ remaining,
418
+ });
419
+ }
420
+
421
+ if read_from.respond_to("to_a", false)? {
422
+ let array_value: Value = read_from.funcall("to_a", ())?;
423
+ return Ok(BatchSource::Materialized(TryConvert::try_convert(
424
+ array_value,
425
+ )?));
426
+ }
427
+
428
+ Err(MagnusError::new(
429
+ ruby.exception_type_error(),
430
+ "data must be an array or respond to 'to_a'",
431
+ ))
432
+ }
433
+
434
+ /// Convert one Ruby batch (an array of per-column value arrays) and write it
435
+ /// through the core writer as a record batch. Returns the batch's row count.
436
+ fn write_column_batch(
437
+ ruby: &Ruby,
438
+ writer_output: &mut WriterOutput,
439
+ field_schemas: &[parquet_core::SchemaNode],
440
+ column_names: &[String],
441
+ batch: Value,
442
+ ) -> Result<usize, MagnusError> {
443
+ use crate::converter::RubyValueConverter;
444
+
445
+ if !batch.is_kind_of(ruby.class_array()) {
446
+ return Err(MagnusError::new(
447
+ ruby.exception_type_error(),
448
+ "each batch must be an array of column values",
449
+ ));
450
+ }
451
+
452
+ let batch_array: RArray = TryConvert::try_convert(batch)?;
453
+
454
+ // Verify batch has the right number of columns
455
+ if batch_array.len() != column_names.len() {
456
+ return Err(MagnusError::new(
457
+ ruby.exception_runtime_error(),
458
+ format!(
459
+ "Batch has {} columns but schema has {}",
460
+ batch_array.len(),
461
+ column_names.len()
462
+ ),
463
+ ));
464
+ }
465
+
466
+ let mut batch_columns: Vec<(String, Vec<parquet_core::ParquetValue>)> =
467
+ Vec::with_capacity(column_names.len());
468
+
469
+ // Process each column in the batch
470
+ for (col_idx, column_values) in batch_array.into_iter().enumerate() {
471
+ if !column_values.is_kind_of(ruby.class_array()) {
472
+ return Err(MagnusError::new(
473
+ ruby.exception_type_error(),
474
+ format!("Column {} values must be an array", col_idx),
475
+ ));
476
+ }
477
+
478
+ let values_array: RArray = TryConvert::try_convert(column_values)?;
479
+
480
+ // Convert and append values
481
+ let mut converter = RubyValueConverter::new();
482
+ let schema_hint = field_schemas.get(col_idx);
483
+
484
+ let mut values = Vec::with_capacity(values_array.len());
485
+ for value in values_array.into_iter() {
486
+ let pq_value = converter
487
+ .to_parquet_with_schema_hint(value, schema_hint)
488
+ .map_err(|e| conversion_error(ruby, e.to_string()))?;
489
+ values.push(pq_value);
490
+ }
491
+ batch_columns.push((column_names[col_idx].clone(), values));
492
+ }
493
+
494
+ let batch_rows = batch_columns
495
+ .first()
496
+ .map(|(_name, values)| values.len())
497
+ .unwrap_or(0);
498
+
499
+ // Write this batch immediately; the core writer flushes completed row
500
+ // groups to the destination once its in-progress buffer exceeds the
501
+ // flush threshold.
502
+ match writer_output {
503
+ WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
504
+ writer
505
+ .write_columns(batch_columns)
506
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
507
+ }
508
+ }
509
+
510
+ Ok(batch_rows)
511
+ }
512
+
513
+ /// Write data in column format to a parquet file
514
+ pub fn write_columns(
515
+ ruby: &Ruby,
516
+ write_args: crate::types::ParquetWriteArgs,
517
+ ) -> Result<Value, MagnusError> {
518
+ use crate::logger::RubyLogger;
519
+ use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
520
+
521
+ let logger = RubyLogger::new(write_args.logger)?;
522
+
523
+ // Read batches lazily where possible: draining an Enumerator up front
524
+ // would hold every batch in memory for the duration of the write.
525
+ let source = batch_source(ruby, write_args.read_from)?;
526
+
527
+ // Schema inference (used when `schema` is nil/empty) only inspects the
528
+ // first batch, so one batch is enough.
529
+ let first_batch_holder;
530
+ let inference_batches = match &source {
531
+ BatchSource::Materialized(batches) => batches,
532
+ BatchSource::Streamed { first_batch, .. } => {
533
+ first_batch_holder = ruby.ary_new();
534
+ if let Some(batch) = first_batch {
535
+ first_batch_holder.push(*batch)?;
536
+ }
537
+ &first_batch_holder
538
+ }
539
+ };
540
+
541
+ // Process schema value
542
+ let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(inference_batches))
543
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
544
+
545
+ // Create schema
546
+ let schema = ruby_schema_to_parquet(schema_hash)
547
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
548
+
549
+ // Extract field schemas for conversion hints
550
+ let field_schemas = extract_field_schemas(&schema);
551
+
552
+ // Create writer. The columnar path writes one record batch per write_columns
553
+ // call, so row batch-sizing options are rejected before this point.
554
+ let mut writer_output = create_writer(
555
+ ruby,
556
+ write_args.write_to,
557
+ schema.clone(),
558
+ write_args.compression,
559
+ BatchSizingOptions {
560
+ batch_size: None,
561
+ flush_threshold: write_args.flush_threshold,
562
+ sample_size: None,
563
+ },
564
+ )?;
565
+ let _ = logger.info(|| "Starting to write parquet file columns".to_string());
566
+
567
+ // Get column names from schema
568
+ let column_names: Vec<String> =
569
+ if let parquet_core::SchemaNode::Struct { fields, .. } = &schema.root {
570
+ fields.iter().map(|f| f.name().to_string()).collect()
571
+ } else {
572
+ return Err(MagnusError::new(
573
+ ruby.exception_runtime_error(),
574
+ "Schema root must be a struct",
575
+ ));
576
+ };
577
+
578
+ // Convert and write each batch as it arrives; completed row groups are
579
+ // flushed to the destination instead of accumulating every batch first.
580
+ let mut total_rows: usize = 0;
581
+
582
+ match source {
583
+ BatchSource::Materialized(batches) => {
584
+ for batch in batches.into_iter() {
585
+ total_rows += write_column_batch(
586
+ ruby,
587
+ &mut writer_output,
588
+ &field_schemas,
589
+ &column_names,
590
+ batch,
591
+ )?;
592
+ }
593
+ }
594
+ BatchSource::Streamed {
595
+ first_batch,
596
+ remaining,
597
+ } => {
598
+ if let Some(batch) = first_batch {
599
+ total_rows += write_column_batch(
600
+ ruby,
601
+ &mut writer_output,
602
+ &field_schemas,
603
+ &column_names,
604
+ batch,
605
+ )?;
606
+ }
607
+ for batch in remaining {
608
+ total_rows += write_column_batch(
609
+ ruby,
610
+ &mut writer_output,
611
+ &field_schemas,
612
+ &column_names,
613
+ batch?,
614
+ )?;
615
+ }
616
+ }
617
+ }
618
+
619
+ let _ = logger.info(|| format!("Finished writing {total_rows} rows to parquet file columns"));
620
+
621
+ // Finalize the writer
622
+ finalize_writer(ruby, writer_output)?;
623
+
624
+ Ok(ruby.qnil().as_value())
625
+ }