parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,1241 @@
1
+ //! Core Parquet writing functionality
2
+
3
+ use crate::{
4
+ arrow_conversion::parquet_values_to_arrow_array, ParquetError, ParquetValue, Result, Schema,
5
+ SchemaNode,
6
+ };
7
+ use arrow::record_batch::RecordBatch;
8
+ use arrow_schema::{DataType, Field};
9
+ use parquet::arrow::ArrowWriter;
10
+ use parquet::basic::Compression;
11
+ use parquet::file::properties::WriterProperties;
12
+ use rand::Rng;
13
+ use std::collections::{hash_map::Entry, HashMap};
14
+ use std::sync::Arc as StdArc;
15
+
16
// ---- Defaults applied when the caller does not configure the writer ----

/// Row-count batch size used when no fixed batch size is configured.
const DEFAULT_BATCH_SIZE: usize = 1000;
/// Flush threshold in bytes (100MB).
const DEFAULT_MEMORY_THRESHOLD: usize = 100 * 1024 * 1024;
/// Number of row-size samples kept for dynamic batch sizing.
const DEFAULT_SAMPLE_SIZE: usize = 100;

// ---- Bounds on batch sizing and sampling ----

/// Lower bound for a dynamically estimated batch size.
const MIN_BATCH_SIZE: usize = 10;
/// Number of samples collected before the first dynamic estimate, unless the
/// configured sample size is smaller.
const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
/// Ceiling for a fixed or dynamically-estimated batch size on a single-column
/// schema. The effective cap is additionally limited by schema width.
pub const MAX_BATCH_SIZE: usize = 1_000_000;
/// `sample_size` also backs an eager `Vec` reservation during writer creation,
/// so user-provided values are capped to avoid an unbounded upfront allocation.
pub const MAX_SAMPLE_SIZE: usize = 10_000;
/// Total slots eagerly reserved across all per-column buffers. This keeps wide
/// schemas from multiplying a row-count cap into an unbounded allocation.
const MAX_BUFFERED_VALUE_SLOTS: usize = 1_000_000;
31
+
32
+ /// Builder for creating a configured Writer
33
+ pub struct WriterBuilder {
34
+ compression: Compression,
35
+ batch_size: Option<usize>,
36
+ memory_threshold: usize,
37
+ sample_size: usize,
38
+ }
39
+
40
+ impl Default for WriterBuilder {
41
+ fn default() -> Self {
42
+ Self {
43
+ compression: Compression::SNAPPY,
44
+ batch_size: None,
45
+ memory_threshold: DEFAULT_MEMORY_THRESHOLD,
46
+ sample_size: DEFAULT_SAMPLE_SIZE,
47
+ }
48
+ }
49
+ }
50
+
51
+ impl WriterBuilder {
52
+ /// Create a new WriterBuilder with default settings
53
+ pub fn new() -> Self {
54
+ Self::default()
55
+ }
56
+
57
+ /// Set the compression algorithm
58
+ pub fn with_compression(mut self, compression: Compression) -> Self {
59
+ self.compression = compression;
60
+ self
61
+ }
62
+
63
+ /// Set a fixed batch size (disables dynamic sizing)
64
+ pub fn with_batch_size(mut self, size: usize) -> Self {
65
+ self.batch_size = Some(size);
66
+ self
67
+ }
68
+
69
+ /// Set the memory threshold for flushing
70
+ pub fn with_memory_threshold(mut self, threshold: usize) -> Self {
71
+ self.memory_threshold = threshold;
72
+ self
73
+ }
74
+
75
+ /// Set the sample size for row size estimation
76
+ pub fn with_sample_size(mut self, size: usize) -> Self {
77
+ self.sample_size = size;
78
+ self
79
+ }
80
+
81
+ /// Build a Writer with the configured settings
82
+ pub fn build<W: std::io::Write + Send>(self, writer: W, schema: Schema) -> Result<Writer<W>> {
83
+ let arrow_schema = schema_to_arrow(&schema)?;
84
+
85
+ let props = WriterProperties::builder()
86
+ .set_compression(self.compression)
87
+ .build();
88
+
89
+ let arrow_writer = ArrowWriter::try_new(writer, arrow_schema.clone(), Some(props))?;
90
+
91
+ validate_column_count(arrow_schema.fields().len())?;
92
+ let current_batch_size = match self.batch_size {
93
+ Some(size) => validate_fixed_batch_size(size, arrow_schema.fields().len())?,
94
+ None => default_batch_size_for_column_count(arrow_schema.fields().len()),
95
+ };
96
+ let sample_size = validate_sample_size(self.sample_size)?;
97
+ let buffered_columns = new_buffered_columns(&arrow_schema, current_batch_size);
98
+
99
+ Ok(Writer {
100
+ arrow_writer: Some(arrow_writer),
101
+ arrow_schema,
102
+ buffered_columns,
103
+ buffered_row_count: 0,
104
+ current_batch_size,
105
+ memory_threshold: self.memory_threshold,
106
+ sample_size,
107
+ size_samples: Vec::with_capacity(sample_size),
108
+ total_rows_written: 0,
109
+ fixed_batch_size: self.batch_size,
110
+ raw_bytes_since_flush: 0,
111
+ })
112
+ }
113
+ }
114
+
115
+ /// Core Parquet writer that works with any type implementing Write
116
+ pub struct Writer<W: std::io::Write> {
117
+ arrow_writer: Option<ArrowWriter<W>>,
118
+ arrow_schema: StdArc<arrow_schema::Schema>,
119
+ buffered_columns: Vec<Vec<ParquetValue>>,
120
+ buffered_row_count: usize,
121
+ current_batch_size: usize,
122
+ memory_threshold: usize,
123
+ sample_size: usize,
124
+ size_samples: Vec<usize>,
125
+ total_rows_written: usize,
126
+ fixed_batch_size: Option<usize>,
127
+ /// Estimated raw bytes accepted since the last row-group flush. Tracked
128
+ /// separately from the arrow writer's encoded buffer sizes so
129
+ /// `memory_threshold` still bounds in-flight data (and streams row groups
130
+ /// to the destination) when encoding/compression shrinks it dramatically.
131
+ raw_bytes_since_flush: usize,
132
+ }
133
+
134
+ impl<W> Writer<W>
135
+ where
136
+ W: std::io::Write + Send,
137
+ {
138
+ /// Create a new writer with default settings
139
+ pub fn new(writer: W, schema: Schema) -> Result<Self> {
140
+ WriterBuilder::new().build(writer, schema)
141
+ }
142
+
143
+ /// Create a new writer with custom properties
144
+ pub fn new_with_properties(writer: W, schema: Schema, props: WriterProperties) -> Result<Self> {
145
+ let arrow_schema = schema_to_arrow(&schema)?;
146
+
147
+ let arrow_writer = ArrowWriter::try_new(writer, arrow_schema.clone(), Some(props))?;
148
+
149
+ validate_column_count(arrow_schema.fields().len())?;
150
+ let current_batch_size = default_batch_size_for_column_count(arrow_schema.fields().len());
151
+ let buffered_columns = new_buffered_columns(&arrow_schema, current_batch_size);
152
+
153
+ Ok(Self {
154
+ arrow_writer: Some(arrow_writer),
155
+ arrow_schema,
156
+ buffered_columns,
157
+ buffered_row_count: 0,
158
+ current_batch_size,
159
+ memory_threshold: DEFAULT_MEMORY_THRESHOLD,
160
+ sample_size: DEFAULT_SAMPLE_SIZE,
161
+ size_samples: Vec::with_capacity(DEFAULT_SAMPLE_SIZE),
162
+ total_rows_written: 0,
163
+ fixed_batch_size: None,
164
+ raw_bytes_since_flush: 0,
165
+ })
166
+ }
167
+
168
+ /// Write a batch of rows to the Parquet file
169
+ ///
170
+ /// Each row is a vector of values corresponding to the schema fields
171
+ pub fn write_rows(&mut self, rows: Vec<Vec<ParquetValue>>) -> Result<()> {
172
+ for row in rows {
173
+ self.write_row(row)?;
174
+ }
175
+ Ok(())
176
+ }
177
+
178
+ /// Write a single row to the Parquet file
179
+ ///
180
+ /// Rows are buffered internally and written in batches to optimize memory usage
181
+ pub fn write_row(&mut self, row: Vec<ParquetValue>) -> Result<()> {
182
+ // Validate row length
183
+ let num_cols = self.arrow_schema.fields().len();
184
+ if row.len() != num_cols {
185
+ return Err(ParquetError::Schema(format!(
186
+ "Row has {} values but schema has {} fields",
187
+ row.len(),
188
+ num_cols
189
+ )));
190
+ }
191
+
192
+ // Validate each value matches its schema
193
+ for (idx, (value, field)) in row.iter().zip(self.arrow_schema.fields()).enumerate() {
194
+ validate_value_against_field(value, field, &format!("row[{}]", idx))?;
195
+ }
196
+
197
+ let row_size = self.estimate_row_size(&row)?;
198
+
199
+ // Sample row size for dynamic batch sizing
200
+ if self.fixed_batch_size.is_none() {
201
+ self.sample_row_size(row_size);
202
+ }
203
+
204
+ // Count raw staged bytes toward the flush threshold.
205
+ self.raw_bytes_since_flush = self.raw_bytes_since_flush.saturating_add(row_size);
206
+
207
+ for (col_idx, value) in row.into_iter().enumerate() {
208
+ self.buffered_columns[col_idx].push(value);
209
+ }
210
+ self.buffered_row_count += 1;
211
+
212
+ // Check if we need to flush: batch full, or raw staged bytes already
213
+ // past the threshold (bounds in-flight memory when rows are large
214
+ // relative to the configured batch size).
215
+ if self.buffered_row_count >= self.current_batch_size
216
+ || self.raw_bytes_since_flush >= self.memory_threshold
217
+ {
218
+ self.flush_buffered_rows()?;
219
+ }
220
+
221
+ Ok(())
222
+ }
223
+
224
+ /// Sample row size for dynamic batch sizing using reservoir sampling
225
+ fn sample_row_size(&mut self, row_size: usize) {
226
+ if self.size_samples.len() < self.sample_size {
227
+ self.size_samples.push(row_size);
228
+ } else {
229
+ // Reservoir sampling
230
+ let mut rng = rand::rng();
231
+ let idx = rng.random_range(0..=self.total_rows_written);
232
+ if idx < self.sample_size {
233
+ self.size_samples[idx] = row_size;
234
+ }
235
+ }
236
+
237
+ // Update batch size once the requested sample has been collected. Small
238
+ // explicit sample sizes are valid because they bound how long large rows
239
+ // may keep using the default batch size.
240
+ let samples_required = self.sample_size.min(MIN_SAMPLES_FOR_ESTIMATE);
241
+ if self.size_samples.len() >= samples_required {
242
+ self.update_batch_size();
243
+ }
244
+ }
245
+
246
+ /// Estimate the memory size of a single row
247
+ fn estimate_row_size(&self, row: &[ParquetValue]) -> Result<usize> {
248
+ let mut size = 0;
249
+ for (idx, value) in row.iter().enumerate() {
250
+ let field = &self.arrow_schema.fields()[idx];
251
+ size += self.estimate_value_size(value, field.data_type())?;
252
+ }
253
+ Ok(size)
254
+ }
255
+
256
+ /// Estimate the memory footprint of a single value
257
+ #[allow(clippy::only_used_in_recursion)]
258
+ fn estimate_value_size(&self, value: &ParquetValue, data_type: &DataType) -> Result<usize> {
259
+ use ParquetValue::*;
260
+
261
+ Ok(match (value, data_type) {
262
+ (Null, _) => 0,
263
+
264
+ // Fixed size types
265
+ (Boolean(_), DataType::Boolean) => 1,
266
+ (Int8(_), DataType::Int8) => 1,
267
+ (UInt8(_), DataType::UInt8) => 1,
268
+ (Int16(_), DataType::Int16) => 2,
269
+ (UInt16(_), DataType::UInt16) => 2,
270
+ (Int32(_), DataType::Int32) => 4,
271
+ (UInt32(_), DataType::UInt32) => 4,
272
+ (Float32(_), DataType::Float32) => 4,
273
+ (Int64(_), DataType::Int64) => 8,
274
+ (UInt64(_), DataType::UInt64) => 8,
275
+ (Float64(_), DataType::Float64) => 8,
276
+ (Date32(_), DataType::Date32) => 4,
277
+ (Date64(_), DataType::Date64) => 8,
278
+ (TimeMillis(_), DataType::Time32(_)) => 4,
279
+ (TimeMicros(_), DataType::Time64(_)) => 8,
280
+ (TimeNanos(_), DataType::Time64(_)) => 8,
281
+ (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
282
+ (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
283
+ (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
284
+ (TimestampNanos(_, _), DataType::Timestamp(_, _)) => 8,
285
+ (Decimal128(_, _), DataType::Decimal128(_, _)) => 16,
286
+
287
+ // Variable size types
288
+ (String(s), DataType::Utf8) => s.len() + std::mem::size_of::<usize>() * 3,
289
+ (Bytes(b), DataType::Binary) => b.len() + std::mem::size_of::<usize>() * 3,
290
+ (Bytes(_), DataType::FixedSizeBinary(len)) => *len as usize,
291
+
292
+ (Decimal256(v, _), DataType::Decimal256(_, _)) => {
293
+ let bytes = v.to_signed_bytes_le();
294
+ 32 + bytes.len()
295
+ }
296
+
297
+ // Complex types
298
+ (List(items), DataType::List(field)) => {
299
+ let base_size = std::mem::size_of::<usize>() * 3;
300
+ if items.is_empty() {
301
+ base_size
302
+ } else {
303
+ // Sample up to 5 elements
304
+ let sample_count = items.len().min(5);
305
+ let sample_size: usize = items
306
+ .iter()
307
+ .take(sample_count)
308
+ .map(|item| {
309
+ self.estimate_value_size(item, field.data_type())
310
+ .unwrap_or(0)
311
+ })
312
+ .sum();
313
+ let avg_size = sample_size / sample_count;
314
+ base_size + (avg_size * items.len())
315
+ }
316
+ }
317
+
318
+ (Map(entries), DataType::Map(entries_field, _)) => {
319
+ if let DataType::Struct(fields) = entries_field.data_type() {
320
+ let base_size = std::mem::size_of::<usize>() * 4;
321
+ if entries.is_empty() || fields.len() < 2 {
322
+ base_size
323
+ } else {
324
+ // Sample up to 5 entries
325
+ let sample_count = entries.len().min(5);
326
+ let mut total_size = base_size;
327
+
328
+ for (key, val) in entries.iter().take(sample_count) {
329
+ total_size += self
330
+ .estimate_value_size(key, fields[0].data_type())
331
+ .unwrap_or(0);
332
+ total_size += self
333
+ .estimate_value_size(val, fields[1].data_type())
334
+ .unwrap_or(0);
335
+ }
336
+
337
+ let avg_entry_size = (total_size - base_size) / sample_count;
338
+ base_size + (avg_entry_size * entries.len())
339
+ }
340
+ } else {
341
+ 100 // Default estimate
342
+ }
343
+ }
344
+
345
+ (Record(fields), DataType::Struct(schema_fields)) => {
346
+ let base_size = std::mem::size_of::<usize>() * 3;
347
+ let field_sizes: usize = fields
348
+ .iter()
349
+ .zip(schema_fields.iter())
350
+ .map(|((_, val), field)| {
351
+ self.estimate_value_size(val, field.data_type())
352
+ .unwrap_or(0)
353
+ })
354
+ .sum();
355
+ base_size + field_sizes
356
+ }
357
+
358
+ _ => 100, // Default estimate for mismatched types
359
+ })
360
+ }
361
+
362
+ /// Update dynamic batch size based on current samples
363
+ fn update_batch_size(&mut self) {
364
+ if self.size_samples.is_empty() {
365
+ return;
366
+ }
367
+
368
+ let total_size: usize = self.size_samples.iter().sum();
369
+ let avg_row_size = (total_size as f64 / self.size_samples.len() as f64).max(1.0);
370
+ let suggested_batch_size = (self.memory_threshold as f64 / avg_row_size).floor() as usize;
371
+ self.current_batch_size = dynamic_batch_size_for_column_count(
372
+ suggested_batch_size,
373
+ self.arrow_schema.fields().len(),
374
+ );
375
+ }
376
+
377
+ /// Flush buffered rows to the Parquet file
378
+ fn flush_buffered_rows(&mut self) -> Result<()> {
379
+ if self.buffered_row_count == 0 {
380
+ return Ok(());
381
+ }
382
+
383
+ // Convert columns to Arrow arrays
384
+ let arrow_columns = self
385
+ .buffered_columns
386
+ .iter()
387
+ .zip(self.arrow_schema.fields())
388
+ .map(|(values, field)| parquet_values_to_arrow_array(values, field))
389
+ .collect::<Result<Vec<_>>>()?;
390
+
391
+ // Create RecordBatch
392
+ let batch = RecordBatch::try_new(self.arrow_schema.clone(), arrow_columns)?;
393
+
394
+ // Write the batch
395
+ if let Some(writer) = &mut self.arrow_writer {
396
+ writer.write(&batch)?;
397
+
398
+ let num_rows = self.buffered_row_count;
399
+ self.buffered_row_count = 0;
400
+ self.total_rows_written += num_rows;
401
+ let reserve_target = self.current_batch_size;
402
+ for column in &mut self.buffered_columns {
403
+ column.clear();
404
+ let additional_capacity = reserve_target.saturating_sub(column.capacity());
405
+ column.reserve(additional_capacity);
406
+ }
407
+
408
+ // Check if we need to flush a completed row group to the
409
+ // destination. Raw staged bytes trip the threshold too: highly
410
+ // compressible data can sit far below the threshold once encoded,
411
+ // which would otherwise keep the whole file buffered until close.
412
+ if self.raw_bytes_since_flush >= self.memory_threshold
413
+ || writer.in_progress_size() >= self.memory_threshold
414
+ || writer.memory_size() >= self.memory_threshold
415
+ {
416
+ writer.flush()?;
417
+ self.raw_bytes_since_flush = 0;
418
+ }
419
+ } else {
420
+ return Err(ParquetError::Io(std::io::Error::new(
421
+ std::io::ErrorKind::Other,
422
+ "Writer has been closed",
423
+ )));
424
+ }
425
+
426
+ Ok(())
427
+ }
428
+
429
+ /// Write columns to the Parquet file
430
+ ///
431
+ /// Each element is a tuple of (column_name, values)
432
+ pub fn write_columns(&mut self, columns: Vec<(String, Vec<ParquetValue>)>) -> Result<()> {
433
+ self.flush_buffered_rows()?;
434
+
435
+ if columns.is_empty() {
436
+ return Ok(());
437
+ }
438
+
439
+ // Verify column names match schema
440
+ let schema_fields = self.arrow_schema.fields();
441
+ if columns.len() != schema_fields.len() {
442
+ return Err(ParquetError::Schema(format!(
443
+ "Provided {} columns but schema has {} fields",
444
+ columns.len(),
445
+ schema_fields.len()
446
+ )));
447
+ }
448
+
449
+ let mut columns_by_name = HashMap::with_capacity(columns.len());
450
+ for (name, values) in columns {
451
+ match columns_by_name.entry(name) {
452
+ Entry::Vacant(entry) => {
453
+ entry.insert(values);
454
+ }
455
+ Entry::Occupied(entry) => {
456
+ return Err(ParquetError::Schema(format!(
457
+ "Duplicate column: {}",
458
+ entry.key()
459
+ )));
460
+ }
461
+ }
462
+ }
463
+
464
+ // Anchor the expected length to the first schema column and report
465
+ // mismatches in schema order, so the error is deterministic regardless
466
+ // of HashMap iteration order.
467
+ let expected_len = schema_fields
468
+ .first()
469
+ .and_then(|field| columns_by_name.get(field.name().as_str()))
470
+ .map_or(0, Vec::len);
471
+ for field in schema_fields {
472
+ if let Some(values) = columns_by_name.get(field.name().as_str()) {
473
+ if values.len() != expected_len {
474
+ return Err(ParquetError::Schema(format!(
475
+ "Column '{}' has {} values but expected {}",
476
+ field.name(),
477
+ values.len(),
478
+ expected_len
479
+ )));
480
+ }
481
+ }
482
+ }
483
+
484
+ // Sort columns to match schema order and convert to arrays
485
+ let mut arrow_columns = Vec::with_capacity(schema_fields.len());
486
+ let mut batch_raw_bytes: usize = 0;
487
+
488
+ for field in schema_fields {
489
+ let values = columns_by_name
490
+ .remove(field.name().as_str())
491
+ .ok_or_else(|| ParquetError::Schema(format!("Missing column: {}", field.name())))?;
492
+
493
+ for (idx, value) in values.iter().enumerate() {
494
+ validate_value_against_field(
495
+ value,
496
+ field,
497
+ &format!("column '{}'[{}]", field.name(), idx),
498
+ )?;
499
+ batch_raw_bytes = batch_raw_bytes
500
+ .saturating_add(self.estimate_value_size(value, field.data_type())?);
501
+ }
502
+
503
+ let array = parquet_values_to_arrow_array(&values, field)?;
504
+ arrow_columns.push(array);
505
+ }
506
+
507
+ // Create RecordBatch
508
+ let batch = RecordBatch::try_new(self.arrow_schema.clone(), arrow_columns)?;
509
+
510
+ // Write the batch
511
+ if let Some(writer) = &mut self.arrow_writer {
512
+ writer.write(&batch)?;
513
+ self.raw_bytes_since_flush = self.raw_bytes_since_flush.saturating_add(batch_raw_bytes);
514
+
515
+ // Check if we need to flush a completed row group, like the row
516
+ // path does; otherwise repeated write_columns calls accumulate
517
+ // every row group in memory until close.
518
+ if self.raw_bytes_since_flush >= self.memory_threshold
519
+ || writer.in_progress_size() >= self.memory_threshold
520
+ || writer.memory_size() >= self.memory_threshold
521
+ {
522
+ writer.flush()?;
523
+ self.raw_bytes_since_flush = 0;
524
+ }
525
+ } else {
526
+ return Err(ParquetError::Io(std::io::Error::new(
527
+ std::io::ErrorKind::Other,
528
+ "Writer has been closed",
529
+ )));
530
+ }
531
+
532
+ Ok(())
533
+ }
534
+
535
+ /// Flush any buffered data
536
+ pub fn flush(&mut self) -> Result<()> {
537
+ // First flush any buffered rows
538
+ self.flush_buffered_rows()?;
539
+
540
+ // Then flush the arrow writer
541
+ if let Some(writer) = &mut self.arrow_writer {
542
+ writer.flush()?;
543
+ }
544
+ self.raw_bytes_since_flush = 0;
545
+ Ok(())
546
+ }
547
+
548
+ /// Close the writer and write the file footer
549
+ ///
550
+ /// This must be called to finalize the Parquet file
551
+ pub fn close(mut self) -> Result<()> {
552
+ // Flush any remaining buffered rows
553
+ self.flush_buffered_rows()?;
554
+
555
+ // Close the arrow writer
556
+ if let Some(writer) = self.arrow_writer.take() {
557
+ writer.close()?;
558
+ }
559
+ Ok(())
560
+ }
561
+ }
562
+
563
+ /// Validate a value against its field schema
564
+ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str) -> Result<()> {
565
+ use ParquetValue::*;
566
+
567
+ // Null handling
568
+ if matches!(value, Null) {
569
+ if !field.is_nullable() {
570
+ return Err(ParquetError::Schema(format!(
571
+ "Found null value for non-nullable field at {}",
572
+ path
573
+ )));
574
+ }
575
+ return Ok(());
576
+ }
577
+
578
+ // Type validation
579
+ match (value, field.data_type()) {
580
+ // Boolean
581
+ (Boolean(_), DataType::Boolean) => Ok(()),
582
+
583
+ // Integer types
584
+ (Int8(_), DataType::Int8) => Ok(()),
585
+ (Int16(_), DataType::Int16) => Ok(()),
586
+ (Int32(_), DataType::Int32) => Ok(()),
587
+ (Int64(_), DataType::Int64) => Ok(()),
588
+ (UInt8(_), DataType::UInt8) => Ok(()),
589
+ (UInt16(_), DataType::UInt16) => Ok(()),
590
+ (UInt32(_), DataType::UInt32) => Ok(()),
591
+ (UInt64(_), DataType::UInt64) => Ok(()),
592
+
593
+ // Float types
594
+ (Float16(_), DataType::Float16) => Ok(()),
595
+ (Float32(_), DataType::Float32) => Ok(()),
596
+ (Float64(_), DataType::Float64) => Ok(()),
597
+
598
+ // String and binary
599
+ (String(_), DataType::Utf8) => Ok(()),
600
+ (Bytes(_), DataType::Binary) => Ok(()),
601
+ (Bytes(b), DataType::FixedSizeBinary(size)) => {
602
+ // Validate up front so a wrong-length value is rejected at write_row
603
+ // rather than poisoning the buffer at flush time.
604
+ if b.len() != *size as usize {
605
+ return Err(ParquetError::Schema(format!(
606
+ "Fixed size binary expected {} bytes, got {} at {}",
607
+ size,
608
+ b.len(),
609
+ path
610
+ )));
611
+ }
612
+ Ok(())
613
+ }
614
+
615
+ // Date/time types
616
+ (Date32(_), DataType::Date32) => Ok(()),
617
+ (Date64(_), DataType::Date64) => Ok(()),
618
+ (TimeMillis(_), DataType::Time32(_)) => Ok(()),
619
+ (TimeMicros(_), DataType::Time64(_)) => Ok(()),
620
+ (TimeNanos(_), DataType::Time64(_)) => Ok(()),
621
+ (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
622
+ (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
623
+ (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
624
+ (TimestampNanos(_, _), DataType::Timestamp(_, _)) => Ok(()),
625
+
626
+ // Decimal types
627
+ (Decimal128(decimal, value_scale), DataType::Decimal128(precision, scale)) => {
628
+ validate_decimal128_schema(*decimal, *value_scale, *precision, *scale, path)
629
+ }
630
+ (Decimal256(decimal, value_scale), DataType::Decimal256(precision, scale)) => {
631
+ validate_decimal256_schema(decimal, *value_scale, *precision, *scale, path)
632
+ }
633
+
634
+ // List type
635
+ (List(items), DataType::List(item_field)) => {
636
+ for (idx, item) in items.iter().enumerate() {
637
+ validate_value_against_field(item, item_field, &format!("{}[{}]", path, idx))?;
638
+ }
639
+ Ok(())
640
+ }
641
+
642
+ // Map type
643
+ (Map(entries), DataType::Map(entries_field, _)) => {
644
+ if let DataType::Struct(fields) = entries_field.data_type() {
645
+ if fields.len() >= 2 {
646
+ let key_field = &fields[0];
647
+ let value_field = &fields[1];
648
+
649
+ for (idx, (key, val)) in entries.iter().enumerate() {
650
+ validate_value_against_field(
651
+ key,
652
+ key_field,
653
+ &format!("{}.key[{}]", path, idx),
654
+ )?;
655
+ validate_value_against_field(
656
+ val,
657
+ value_field,
658
+ &format!("{}.value[{}]", path, idx),
659
+ )?;
660
+ }
661
+ }
662
+ }
663
+ Ok(())
664
+ }
665
+
666
+ // Struct type
667
+ (Record(record_fields), DataType::Struct(schema_fields)) => {
668
+ for field in schema_fields {
669
+ let field_name = field.name();
670
+ if let Some(value) = record_fields.get(field_name.as_str()) {
671
+ validate_value_against_field(
672
+ value,
673
+ field,
674
+ &format!("{}.{}", path, field_name),
675
+ )?;
676
+ } else if !field.is_nullable() {
677
+ return Err(ParquetError::Schema(format!(
678
+ "Required field '{}' is missing in struct at {}",
679
+ field_name, path
680
+ )));
681
+ }
682
+ }
683
+ Ok(())
684
+ }
685
+
686
+ // Type mismatch
687
+ (value, expected_type) => Err(ParquetError::Schema(format!(
688
+ "Type mismatch at {}: expected {:?}, got {:?}",
689
+ path,
690
+ expected_type,
691
+ value.type_name()
692
+ ))),
693
+ }
694
+ }
695
+
696
+ /// Convert our Schema to Arrow Schema
697
+ fn schema_to_arrow(schema: &Schema) -> Result<StdArc<arrow_schema::Schema>> {
698
+ schema.validate().map_err(ParquetError::Schema)?;
699
+ match &schema.root {
700
+ SchemaNode::Struct { fields, .. } => {
701
+ let arrow_fields = fields
702
+ .iter()
703
+ .map(schema_node_to_arrow_field)
704
+ .collect::<Result<Vec<_>>>()?;
705
+
706
+ Ok(StdArc::new(arrow_schema::Schema::new(arrow_fields)))
707
+ }
708
+ _ => Err(ParquetError::Schema(
709
+ "Root schema node must be a struct".to_string(),
710
+ )),
711
+ }
712
+ }
713
+
714
+ fn validate_column_count(column_count: usize) -> Result<()> {
715
+ if column_count > MAX_BUFFERED_VALUE_SLOTS {
716
+ return Err(ParquetError::Schema(format!(
717
+ "Schema has {} columns, exceeding the writer buffer slot limit of {}",
718
+ column_count, MAX_BUFFERED_VALUE_SLOTS
719
+ )));
720
+ }
721
+ Ok(())
722
+ }
723
+
724
+ fn max_batch_size_for_column_count(column_count: usize) -> usize {
725
+ let width = column_count.max(1);
726
+ (MAX_BUFFERED_VALUE_SLOTS / width)
727
+ .max(1)
728
+ .min(MAX_BATCH_SIZE)
729
+ }
730
+
731
+ fn default_batch_size_for_column_count(column_count: usize) -> usize {
732
+ DEFAULT_BATCH_SIZE.min(max_batch_size_for_column_count(column_count))
733
+ }
734
+
735
+ fn validate_fixed_batch_size(batch_size: usize, column_count: usize) -> Result<usize> {
736
+ if batch_size == 0 {
737
+ return Err(ParquetError::Schema(
738
+ "batch_size must be greater than 0".to_string(),
739
+ ));
740
+ }
741
+
742
+ let max_batch_size = max_batch_size_for_column_count(column_count);
743
+ if batch_size > max_batch_size {
744
+ return Err(ParquetError::Schema(format!(
745
+ "batch_size {} exceeds maximum {} for {} columns",
746
+ batch_size, max_batch_size, column_count
747
+ )));
748
+ }
749
+
750
+ Ok(batch_size)
751
+ }
752
+
753
+ fn validate_sample_size(sample_size: usize) -> Result<usize> {
754
+ if sample_size == 0 {
755
+ return Err(ParquetError::Schema(
756
+ "sample_size must be greater than 0".to_string(),
757
+ ));
758
+ }
759
+ if sample_size > MAX_SAMPLE_SIZE {
760
+ return Err(ParquetError::Schema(format!(
761
+ "sample_size {} exceeds maximum {}",
762
+ sample_size, MAX_SAMPLE_SIZE
763
+ )));
764
+ }
765
+ Ok(sample_size)
766
+ }
767
+
768
+ fn dynamic_batch_size_for_column_count(suggested_batch_size: usize, column_count: usize) -> usize {
769
+ let max_batch_size = max_batch_size_for_column_count(column_count);
770
+ let min_batch_size = MIN_BATCH_SIZE.min(max_batch_size);
771
+ suggested_batch_size.clamp(min_batch_size, max_batch_size)
772
+ }
773
+
774
+ /// Convert a SchemaNode to an Arrow Field
775
+ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
776
+ match node {
777
+ SchemaNode::Primitive {
778
+ name,
779
+ primitive_type,
780
+ nullable,
781
+ format,
782
+ } => {
783
+ let data_type = primitive_type_to_arrow(primitive_type)?;
784
+ let field = Field::new(name, data_type, *nullable);
785
+ let extended_field = if format.as_deref() == Some("uuid") {
786
+ field.with_extension_type(arrow_schema::extension::Uuid)
787
+ } else {
788
+ field
789
+ };
790
+ Ok(extended_field)
791
+ }
792
+ SchemaNode::List {
793
+ name,
794
+ item,
795
+ nullable,
796
+ } => {
797
+ let item_field = schema_node_to_arrow_field(item)?;
798
+ // Use the conventional Arrow list element name "item" rather than the
799
+ // schema node's internal name (e.g. "<field>_item"), so written files
800
+ // interoperate with external Parquet readers. The element's data type
801
+ // and nullability still come from the schema node.
802
+ let list_type = DataType::List(StdArc::new(Field::new(
803
+ "item",
804
+ item_field.data_type().clone(),
805
+ item_field.is_nullable(),
806
+ )));
807
+ Ok(Field::new(name, list_type, *nullable))
808
+ }
809
+ SchemaNode::Map {
810
+ name,
811
+ key,
812
+ value,
813
+ nullable,
814
+ } => {
815
+ let key_field = schema_node_to_arrow_field(key)?;
816
+ let value_field = schema_node_to_arrow_field(value)?;
817
+
818
+ let struct_fields = vec![
819
+ Field::new(
820
+ key_field.name().clone(),
821
+ key_field.data_type().clone(),
822
+ false,
823
+ ),
824
+ Field::new(
825
+ value_field.name().clone(),
826
+ value_field.data_type().clone(),
827
+ value_field.is_nullable(),
828
+ ),
829
+ ];
830
+
831
+ let map_type = DataType::Map(
832
+ StdArc::new(Field::new(
833
+ "entries",
834
+ DataType::Struct(struct_fields.into()),
835
+ false,
836
+ )),
837
+ false, // keys_sorted
838
+ );
839
+
840
+ Ok(Field::new(name, map_type, *nullable))
841
+ }
842
+ SchemaNode::Struct {
843
+ name,
844
+ fields,
845
+ nullable,
846
+ } => {
847
+ let struct_fields = fields
848
+ .iter()
849
+ .map(schema_node_to_arrow_field)
850
+ .collect::<Result<Vec<_>>>()?;
851
+
852
+ let struct_type = DataType::Struct(struct_fields.into());
853
+ Ok(Field::new(name, struct_type, *nullable))
854
+ }
855
+ }
856
+ }
857
+
858
+ fn new_buffered_columns(
859
+ arrow_schema: &arrow_schema::Schema,
860
+ capacity: usize,
861
+ ) -> Vec<Vec<ParquetValue>> {
862
+ let column_count = arrow_schema.fields().len();
863
+ debug_assert!(column_count <= MAX_BUFFERED_VALUE_SLOTS);
864
+ debug_assert!(capacity <= max_batch_size_for_column_count(column_count));
865
+
866
+ arrow_schema
867
+ .fields()
868
+ .iter()
869
+ .map(|_| Vec::with_capacity(capacity))
870
+ .collect()
871
+ }
872
+
873
+ fn validate_decimal128_schema(
874
+ value: i128,
875
+ value_scale: i8,
876
+ precision: u8,
877
+ scale: i8,
878
+ path: &str,
879
+ ) -> Result<()> {
880
+ if value_scale != scale {
881
+ return Err(ParquetError::Schema(format!(
882
+ "Decimal scale mismatch at {}: schema scale {}, value scale {}",
883
+ path, scale, value_scale
884
+ )));
885
+ }
886
+
887
+ validate_decimal_precision(decimal128_digit_count(value), precision, path)
888
+ }
889
+
890
+ fn validate_decimal256_schema(
891
+ value: &num::BigInt,
892
+ value_scale: i8,
893
+ precision: u8,
894
+ scale: i8,
895
+ path: &str,
896
+ ) -> Result<()> {
897
+ if value_scale != scale {
898
+ return Err(ParquetError::Schema(format!(
899
+ "Decimal scale mismatch at {}: schema scale {}, value scale {}",
900
+ path, scale, value_scale
901
+ )));
902
+ }
903
+
904
+ validate_decimal_precision(decimal256_digit_count(value), precision, path)
905
+ }
906
+
907
+ fn validate_decimal_precision(value_digits: usize, precision: u8, path: &str) -> Result<()> {
908
+ if value_digits > precision as usize {
909
+ return Err(ParquetError::Schema(format!(
910
+ "Decimal precision overflow at {}: schema precision {}, value has {} digits",
911
+ path, precision, value_digits
912
+ )));
913
+ }
914
+
915
+ Ok(())
916
+ }
917
+
918
/// Number of base-10 digits in the magnitude of `value`; the sign is ignored
/// and zero counts as one digit.
fn decimal128_digit_count(value: i128) -> usize {
    // `checked_ilog10` is `None` only for zero, which still prints as one digit.
    value
        .unsigned_abs()
        .checked_ilog10()
        .map_or(1, |exponent| exponent as usize + 1)
}
921
+
922
+ fn decimal256_digit_count(value: &num::BigInt) -> usize {
923
+ value.to_str_radix(10).trim_start_matches('-').len()
924
+ }
925
+
926
+ /// Convert PrimitiveType to Arrow DataType
927
+ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
928
+ use crate::PrimitiveType::*;
929
+
930
+ Ok(match ptype {
931
+ Boolean => DataType::Boolean,
932
+ Int8 => DataType::Int8,
933
+ Int16 => DataType::Int16,
934
+ Int32 => DataType::Int32,
935
+ Int64 => DataType::Int64,
936
+ UInt8 => DataType::UInt8,
937
+ UInt16 => DataType::UInt16,
938
+ UInt32 => DataType::UInt32,
939
+ UInt64 => DataType::UInt64,
940
+ Float32 => DataType::Float32,
941
+ Float64 => DataType::Float64,
942
+ String => DataType::Utf8,
943
+ Binary => DataType::Binary,
944
+ Date32 => DataType::Date32,
945
+ TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
946
+ TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
947
+ TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
948
+ TimestampMillis(tz) => DataType::Timestamp(
949
+ arrow_schema::TimeUnit::Millisecond,
950
+ // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
951
+ // UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
952
+ tz.as_ref().map(|_| StdArc::from("UTC")),
953
+ ),
954
+ TimestampMicros(tz) => DataType::Timestamp(
955
+ arrow_schema::TimeUnit::Microsecond,
956
+ // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
957
+ // UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
958
+ tz.as_ref().map(|_| StdArc::from("UTC")),
959
+ ),
960
+ Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
961
+ Decimal256(precision, scale) => DataType::Decimal256(*precision, *scale),
962
+ Date64 => DataType::Date64,
963
+ TimestampSecond(tz) => DataType::Timestamp(
964
+ arrow_schema::TimeUnit::Second,
965
+ // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
966
+ // UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
967
+ tz.as_ref().map(|_| StdArc::from("UTC")),
968
+ ),
969
+ TimestampNanos(tz) => DataType::Timestamp(
970
+ arrow_schema::TimeUnit::Nanosecond,
971
+ // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
972
+ // UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
973
+ tz.as_ref().map(|_| StdArc::from("UTC")),
974
+ ),
975
+ FixedLenByteArray(len) => DataType::FixedSizeBinary(*len),
976
+ })
977
+ }
978
+
979
#[cfg(test)]
mod tests {
    use super::*;
    use crate::SchemaBuilder;
    use triomphe::Arc;

    /// Builds a primitive column with the given nullability and no format hint.
    fn primitive(name: &str, primitive_type: crate::PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Wraps `fields` in a non-nullable struct named "root" and builds the schema.
    fn flat_schema(fields: Vec<SchemaNode>) -> Schema {
        let root = SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields,
        };
        SchemaBuilder::new().with_root(root).build().unwrap()
    }

    /// Schema with `column_count` required Int64 columns named `field_<index>`.
    fn int64_schema(column_count: usize) -> Schema {
        let fields: Vec<SchemaNode> = (0..column_count)
            .map(|index| {
                primitive(
                    &format!("field_{index}"),
                    crate::PrimitiveType::Int64,
                    false,
                )
            })
            .collect();
        flat_schema(fields)
    }

    fn single_int64_schema() -> Schema {
        int64_schema(1)
    }

    fn single_int64_writer(buffer: Vec<u8>) -> Writer<Vec<u8>> {
        Writer::new(buffer, single_int64_schema()).unwrap()
    }

    /// Schema with a single required Int64 column named "id".
    fn id_only_schema() -> Schema {
        flat_schema(vec![primitive("id", crate::PrimitiveType::Int64, false)])
    }

    #[test]
    fn dynamic_batch_size_is_clamped_to_max() {
        let mut writer = single_int64_writer(Vec::new());

        // One-byte samples would push the estimate toward memory_threshold rows;
        // the result has to stop at MAX_BATCH_SIZE.
        writer.size_samples = vec![1; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();
        assert_eq!(writer.current_batch_size, MAX_BATCH_SIZE);

        // With a more realistic average the batch size lands inside the bounds.
        writer.size_samples = vec![DEFAULT_MEMORY_THRESHOLD / 1000; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();
        assert!(writer.current_batch_size <= MAX_BATCH_SIZE);
        assert!(writer.current_batch_size >= MIN_BATCH_SIZE);
    }

    #[test]
    fn dynamic_batch_size_is_clamped_to_width_bound() {
        let builder = WriterBuilder::new();
        let mut writer = builder.build(Vec::new(), int64_schema(2)).unwrap();

        writer.size_samples = vec![1; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();

        assert_eq!(
            writer.current_batch_size,
            max_batch_size_for_column_count(2)
        );
        assert_eq!(
            writer.current_batch_size * writer.buffered_columns.len(),
            MAX_BUFFERED_VALUE_SLOTS
        );
    }

    #[test]
    fn fixed_batch_size_preserves_small_user_value() {
        let builder = WriterBuilder::new().with_batch_size(1);
        let writer = builder.build(Vec::new(), single_int64_schema()).unwrap();

        assert_eq!(writer.current_batch_size, 1);
        assert_eq!(writer.buffered_columns[0].capacity(), 1);
    }

    #[test]
    fn oversized_fixed_batch_size_is_rejected_before_initial_buffer_allocation() {
        let builder = WriterBuilder::new().with_batch_size(MAX_BATCH_SIZE + 1);
        let outcome = builder.build(Vec::new(), single_int64_schema());

        assert!(outcome.is_err());
    }

    #[test]
    fn wide_schema_fixed_batch_size_is_rejected_by_total_slot_bound() {
        let builder = WriterBuilder::new().with_batch_size(MAX_BATCH_SIZE);
        let outcome = builder.build(Vec::new(), int64_schema(2));

        assert!(outcome.is_err());
    }

    #[test]
    fn sample_size_preserves_small_user_value() {
        let builder = WriterBuilder::new().with_sample_size(1);
        let writer = builder.build(Vec::new(), single_int64_schema()).unwrap();

        assert_eq!(writer.sample_size, 1);
        assert_eq!(writer.size_samples.capacity(), 1);
    }

    #[test]
    fn small_sample_size_updates_after_requested_sample_count() {
        let builder = WriterBuilder::new()
            .with_memory_threshold(128)
            .with_sample_size(1);
        let mut writer = builder.build(Vec::new(), single_int64_schema()).unwrap();

        writer.write_row(vec![ParquetValue::Int64(1)]).unwrap();

        assert_eq!(writer.size_samples.len(), 1);
        assert_eq!(
            writer.current_batch_size,
            dynamic_batch_size_for_column_count(16, 1)
        );
    }

    #[test]
    fn oversized_sample_size_is_rejected_before_initial_buffer_allocation() {
        let builder = WriterBuilder::new().with_sample_size(usize::MAX);
        let outcome = builder.build(Vec::new(), single_int64_schema());

        assert!(outcome.is_err());
    }

    #[test]
    fn test_writer_creation() {
        let schema = id_only_schema();

        let sink = Vec::new();
        let _writer = Writer::new(sink, schema).unwrap();
    }

    #[test]
    fn test_writer_builder() {
        let schema = id_only_schema();

        let sink = Vec::new();
        let _writer = WriterBuilder::new()
            .with_compression(Compression::ZSTD(parquet::basic::ZstdLevel::default()))
            .with_batch_size(500)
            .with_memory_threshold(50 * 1024 * 1024)
            .with_sample_size(50)
            .build(sink, schema)
            .unwrap();
    }

    #[test]
    fn test_buffered_writing() {
        let schema = flat_schema(vec![
            primitive("id", crate::PrimitiveType::Int64, false),
            primitive("name", crate::PrimitiveType::String, true),
        ]);

        let sink = Vec::new();
        // A deliberately small batch so the loop below crosses batch boundaries.
        let mut writer = WriterBuilder::new()
            .with_batch_size(10)
            .build(sink, schema)
            .unwrap();

        // 25 rows against a batch size of 10: two full batches plus a remainder.
        for row_id in 0..25 {
            let row = vec![
                ParquetValue::Int64(row_id),
                ParquetValue::String(Arc::from(format!("row_{}", row_id))),
            ];
            writer.write_row(row).unwrap();
        }

        // Closing writes out whatever is still buffered.
        writer.close().unwrap();
    }

    #[test]
    fn test_row_size_estimation() {
        let schema = flat_schema(vec![
            primitive("id", crate::PrimitiveType::Int64, false),
            primitive("data", crate::PrimitiveType::String, false),
        ]);

        let sink = Vec::new();
        let writer = Writer::new(sink, schema).unwrap();

        // One fixed-width value and one variable-length value.
        let row = vec![
            ParquetValue::Int64(12345),
            ParquetValue::String(Arc::from("Hello, World!")),
        ];

        let size = writer.estimate_row_size(&row).unwrap();
        assert!(size > 0);

        // At minimum: 8 bytes for the Int64 plus the 13 bytes of string payload.
        assert!(size >= 8 + 13);
    }
}