parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,1241 @@
|
|
|
1
|
+
//! Core Parquet writing functionality
|
|
2
|
+
|
|
3
|
+
use crate::{
|
|
4
|
+
arrow_conversion::parquet_values_to_arrow_array, ParquetError, ParquetValue, Result, Schema,
|
|
5
|
+
SchemaNode,
|
|
6
|
+
};
|
|
7
|
+
use arrow::record_batch::RecordBatch;
|
|
8
|
+
use arrow_schema::{DataType, Field};
|
|
9
|
+
use parquet::arrow::ArrowWriter;
|
|
10
|
+
use parquet::basic::Compression;
|
|
11
|
+
use parquet::file::properties::WriterProperties;
|
|
12
|
+
use rand::Rng;
|
|
13
|
+
use std::collections::{hash_map::Entry, HashMap};
|
|
14
|
+
use std::sync::Arc as StdArc;
|
|
15
|
+
|
|
16
|
+
// Writer tuning defaults and hard limits.

// Rows buffered per batch when no fixed batch size has been requested.
const DEFAULT_BATCH_SIZE: usize = 1000;
// Default flush threshold in bytes (100 MiB).
const DEFAULT_MEMORY_THRESHOLD: usize = 100 * 1024 * 1024;
// Default number of row-size samples retained for batch-size estimation.
const DEFAULT_SAMPLE_SIZE: usize = 100;
const MIN_BATCH_SIZE: usize = 10;
/// Upper bound for a fixed or dynamically estimated batch size on a
/// single-column schema. Wider schemas are capped further by the buffered
/// slot limit.
pub const MAX_BATCH_SIZE: usize = 1_000_000;
/// Upper bound for a caller-supplied sample size. The sample size sizes an
/// eager `Vec` reservation at writer creation, so it must not become an
/// unbounded upfront allocation.
pub const MAX_SAMPLE_SIZE: usize = 10_000;
// Total value slots eagerly reserved across all per-column buffers, so that a
// wide schema cannot multiply a row-count cap into an unbounded allocation.
const MAX_BUFFERED_VALUE_SLOTS: usize = 1_000_000;
// Samples needed before the dynamic batch size is re-estimated (a smaller
// configured sample size lowers this requirement).
const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
|
|
31
|
+
|
|
32
|
+
/// Builder for creating a configured Writer
|
|
33
|
+
pub struct WriterBuilder {
|
|
34
|
+
compression: Compression,
|
|
35
|
+
batch_size: Option<usize>,
|
|
36
|
+
memory_threshold: usize,
|
|
37
|
+
sample_size: usize,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
impl Default for WriterBuilder {
|
|
41
|
+
fn default() -> Self {
|
|
42
|
+
Self {
|
|
43
|
+
compression: Compression::SNAPPY,
|
|
44
|
+
batch_size: None,
|
|
45
|
+
memory_threshold: DEFAULT_MEMORY_THRESHOLD,
|
|
46
|
+
sample_size: DEFAULT_SAMPLE_SIZE,
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
impl WriterBuilder {
|
|
52
|
+
/// Create a new WriterBuilder with default settings
|
|
53
|
+
pub fn new() -> Self {
|
|
54
|
+
Self::default()
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/// Set the compression algorithm
|
|
58
|
+
pub fn with_compression(mut self, compression: Compression) -> Self {
|
|
59
|
+
self.compression = compression;
|
|
60
|
+
self
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Set a fixed batch size (disables dynamic sizing)
|
|
64
|
+
pub fn with_batch_size(mut self, size: usize) -> Self {
|
|
65
|
+
self.batch_size = Some(size);
|
|
66
|
+
self
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/// Set the memory threshold for flushing
|
|
70
|
+
pub fn with_memory_threshold(mut self, threshold: usize) -> Self {
|
|
71
|
+
self.memory_threshold = threshold;
|
|
72
|
+
self
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/// Set the sample size for row size estimation
|
|
76
|
+
pub fn with_sample_size(mut self, size: usize) -> Self {
|
|
77
|
+
self.sample_size = size;
|
|
78
|
+
self
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Build a Writer with the configured settings
|
|
82
|
+
pub fn build<W: std::io::Write + Send>(self, writer: W, schema: Schema) -> Result<Writer<W>> {
|
|
83
|
+
let arrow_schema = schema_to_arrow(&schema)?;
|
|
84
|
+
|
|
85
|
+
let props = WriterProperties::builder()
|
|
86
|
+
.set_compression(self.compression)
|
|
87
|
+
.build();
|
|
88
|
+
|
|
89
|
+
let arrow_writer = ArrowWriter::try_new(writer, arrow_schema.clone(), Some(props))?;
|
|
90
|
+
|
|
91
|
+
validate_column_count(arrow_schema.fields().len())?;
|
|
92
|
+
let current_batch_size = match self.batch_size {
|
|
93
|
+
Some(size) => validate_fixed_batch_size(size, arrow_schema.fields().len())?,
|
|
94
|
+
None => default_batch_size_for_column_count(arrow_schema.fields().len()),
|
|
95
|
+
};
|
|
96
|
+
let sample_size = validate_sample_size(self.sample_size)?;
|
|
97
|
+
let buffered_columns = new_buffered_columns(&arrow_schema, current_batch_size);
|
|
98
|
+
|
|
99
|
+
Ok(Writer {
|
|
100
|
+
arrow_writer: Some(arrow_writer),
|
|
101
|
+
arrow_schema,
|
|
102
|
+
buffered_columns,
|
|
103
|
+
buffered_row_count: 0,
|
|
104
|
+
current_batch_size,
|
|
105
|
+
memory_threshold: self.memory_threshold,
|
|
106
|
+
sample_size,
|
|
107
|
+
size_samples: Vec::with_capacity(sample_size),
|
|
108
|
+
total_rows_written: 0,
|
|
109
|
+
fixed_batch_size: self.batch_size,
|
|
110
|
+
raw_bytes_since_flush: 0,
|
|
111
|
+
})
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/// Core Parquet writer that works with any type implementing Write
|
|
116
|
+
pub struct Writer<W: std::io::Write> {
|
|
117
|
+
arrow_writer: Option<ArrowWriter<W>>,
|
|
118
|
+
arrow_schema: StdArc<arrow_schema::Schema>,
|
|
119
|
+
buffered_columns: Vec<Vec<ParquetValue>>,
|
|
120
|
+
buffered_row_count: usize,
|
|
121
|
+
current_batch_size: usize,
|
|
122
|
+
memory_threshold: usize,
|
|
123
|
+
sample_size: usize,
|
|
124
|
+
size_samples: Vec<usize>,
|
|
125
|
+
total_rows_written: usize,
|
|
126
|
+
fixed_batch_size: Option<usize>,
|
|
127
|
+
/// Estimated raw bytes accepted since the last row-group flush. Tracked
|
|
128
|
+
/// separately from the arrow writer's encoded buffer sizes so
|
|
129
|
+
/// `memory_threshold` still bounds in-flight data (and streams row groups
|
|
130
|
+
/// to the destination) when encoding/compression shrinks it dramatically.
|
|
131
|
+
raw_bytes_since_flush: usize,
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
impl<W> Writer<W>
|
|
135
|
+
where
|
|
136
|
+
W: std::io::Write + Send,
|
|
137
|
+
{
|
|
138
|
+
/// Create a new writer with default settings
|
|
139
|
+
pub fn new(writer: W, schema: Schema) -> Result<Self> {
|
|
140
|
+
WriterBuilder::new().build(writer, schema)
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/// Create a new writer with custom properties
|
|
144
|
+
pub fn new_with_properties(writer: W, schema: Schema, props: WriterProperties) -> Result<Self> {
|
|
145
|
+
let arrow_schema = schema_to_arrow(&schema)?;
|
|
146
|
+
|
|
147
|
+
let arrow_writer = ArrowWriter::try_new(writer, arrow_schema.clone(), Some(props))?;
|
|
148
|
+
|
|
149
|
+
validate_column_count(arrow_schema.fields().len())?;
|
|
150
|
+
let current_batch_size = default_batch_size_for_column_count(arrow_schema.fields().len());
|
|
151
|
+
let buffered_columns = new_buffered_columns(&arrow_schema, current_batch_size);
|
|
152
|
+
|
|
153
|
+
Ok(Self {
|
|
154
|
+
arrow_writer: Some(arrow_writer),
|
|
155
|
+
arrow_schema,
|
|
156
|
+
buffered_columns,
|
|
157
|
+
buffered_row_count: 0,
|
|
158
|
+
current_batch_size,
|
|
159
|
+
memory_threshold: DEFAULT_MEMORY_THRESHOLD,
|
|
160
|
+
sample_size: DEFAULT_SAMPLE_SIZE,
|
|
161
|
+
size_samples: Vec::with_capacity(DEFAULT_SAMPLE_SIZE),
|
|
162
|
+
total_rows_written: 0,
|
|
163
|
+
fixed_batch_size: None,
|
|
164
|
+
raw_bytes_since_flush: 0,
|
|
165
|
+
})
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/// Write a batch of rows to the Parquet file
|
|
169
|
+
///
|
|
170
|
+
/// Each row is a vector of values corresponding to the schema fields
|
|
171
|
+
pub fn write_rows(&mut self, rows: Vec<Vec<ParquetValue>>) -> Result<()> {
|
|
172
|
+
for row in rows {
|
|
173
|
+
self.write_row(row)?;
|
|
174
|
+
}
|
|
175
|
+
Ok(())
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/// Write a single row to the Parquet file
|
|
179
|
+
///
|
|
180
|
+
/// Rows are buffered internally and written in batches to optimize memory usage
|
|
181
|
+
pub fn write_row(&mut self, row: Vec<ParquetValue>) -> Result<()> {
|
|
182
|
+
// Validate row length
|
|
183
|
+
let num_cols = self.arrow_schema.fields().len();
|
|
184
|
+
if row.len() != num_cols {
|
|
185
|
+
return Err(ParquetError::Schema(format!(
|
|
186
|
+
"Row has {} values but schema has {} fields",
|
|
187
|
+
row.len(),
|
|
188
|
+
num_cols
|
|
189
|
+
)));
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// Validate each value matches its schema
|
|
193
|
+
for (idx, (value, field)) in row.iter().zip(self.arrow_schema.fields()).enumerate() {
|
|
194
|
+
validate_value_against_field(value, field, &format!("row[{}]", idx))?;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
let row_size = self.estimate_row_size(&row)?;
|
|
198
|
+
|
|
199
|
+
// Sample row size for dynamic batch sizing
|
|
200
|
+
if self.fixed_batch_size.is_none() {
|
|
201
|
+
self.sample_row_size(row_size);
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// Count raw staged bytes toward the flush threshold.
|
|
205
|
+
self.raw_bytes_since_flush = self.raw_bytes_since_flush.saturating_add(row_size);
|
|
206
|
+
|
|
207
|
+
for (col_idx, value) in row.into_iter().enumerate() {
|
|
208
|
+
self.buffered_columns[col_idx].push(value);
|
|
209
|
+
}
|
|
210
|
+
self.buffered_row_count += 1;
|
|
211
|
+
|
|
212
|
+
// Check if we need to flush: batch full, or raw staged bytes already
|
|
213
|
+
// past the threshold (bounds in-flight memory when rows are large
|
|
214
|
+
// relative to the configured batch size).
|
|
215
|
+
if self.buffered_row_count >= self.current_batch_size
|
|
216
|
+
|| self.raw_bytes_since_flush >= self.memory_threshold
|
|
217
|
+
{
|
|
218
|
+
self.flush_buffered_rows()?;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
Ok(())
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/// Sample row size for dynamic batch sizing using reservoir sampling
|
|
225
|
+
fn sample_row_size(&mut self, row_size: usize) {
|
|
226
|
+
if self.size_samples.len() < self.sample_size {
|
|
227
|
+
self.size_samples.push(row_size);
|
|
228
|
+
} else {
|
|
229
|
+
// Reservoir sampling
|
|
230
|
+
let mut rng = rand::rng();
|
|
231
|
+
let idx = rng.random_range(0..=self.total_rows_written);
|
|
232
|
+
if idx < self.sample_size {
|
|
233
|
+
self.size_samples[idx] = row_size;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Update batch size once the requested sample has been collected. Small
|
|
238
|
+
// explicit sample sizes are valid because they bound how long large rows
|
|
239
|
+
// may keep using the default batch size.
|
|
240
|
+
let samples_required = self.sample_size.min(MIN_SAMPLES_FOR_ESTIMATE);
|
|
241
|
+
if self.size_samples.len() >= samples_required {
|
|
242
|
+
self.update_batch_size();
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
/// Estimate the memory size of a single row
|
|
247
|
+
fn estimate_row_size(&self, row: &[ParquetValue]) -> Result<usize> {
|
|
248
|
+
let mut size = 0;
|
|
249
|
+
for (idx, value) in row.iter().enumerate() {
|
|
250
|
+
let field = &self.arrow_schema.fields()[idx];
|
|
251
|
+
size += self.estimate_value_size(value, field.data_type())?;
|
|
252
|
+
}
|
|
253
|
+
Ok(size)
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/// Estimate the memory footprint of a single value
|
|
257
|
+
#[allow(clippy::only_used_in_recursion)]
|
|
258
|
+
fn estimate_value_size(&self, value: &ParquetValue, data_type: &DataType) -> Result<usize> {
|
|
259
|
+
use ParquetValue::*;
|
|
260
|
+
|
|
261
|
+
Ok(match (value, data_type) {
|
|
262
|
+
(Null, _) => 0,
|
|
263
|
+
|
|
264
|
+
// Fixed size types
|
|
265
|
+
(Boolean(_), DataType::Boolean) => 1,
|
|
266
|
+
(Int8(_), DataType::Int8) => 1,
|
|
267
|
+
(UInt8(_), DataType::UInt8) => 1,
|
|
268
|
+
(Int16(_), DataType::Int16) => 2,
|
|
269
|
+
(UInt16(_), DataType::UInt16) => 2,
|
|
270
|
+
(Int32(_), DataType::Int32) => 4,
|
|
271
|
+
(UInt32(_), DataType::UInt32) => 4,
|
|
272
|
+
(Float32(_), DataType::Float32) => 4,
|
|
273
|
+
(Int64(_), DataType::Int64) => 8,
|
|
274
|
+
(UInt64(_), DataType::UInt64) => 8,
|
|
275
|
+
(Float64(_), DataType::Float64) => 8,
|
|
276
|
+
(Date32(_), DataType::Date32) => 4,
|
|
277
|
+
(Date64(_), DataType::Date64) => 8,
|
|
278
|
+
(TimeMillis(_), DataType::Time32(_)) => 4,
|
|
279
|
+
(TimeMicros(_), DataType::Time64(_)) => 8,
|
|
280
|
+
(TimeNanos(_), DataType::Time64(_)) => 8,
|
|
281
|
+
(TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
|
|
282
|
+
(TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
|
|
283
|
+
(TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
|
|
284
|
+
(TimestampNanos(_, _), DataType::Timestamp(_, _)) => 8,
|
|
285
|
+
(Decimal128(_, _), DataType::Decimal128(_, _)) => 16,
|
|
286
|
+
|
|
287
|
+
// Variable size types
|
|
288
|
+
(String(s), DataType::Utf8) => s.len() + std::mem::size_of::<usize>() * 3,
|
|
289
|
+
(Bytes(b), DataType::Binary) => b.len() + std::mem::size_of::<usize>() * 3,
|
|
290
|
+
(Bytes(_), DataType::FixedSizeBinary(len)) => *len as usize,
|
|
291
|
+
|
|
292
|
+
(Decimal256(v, _), DataType::Decimal256(_, _)) => {
|
|
293
|
+
let bytes = v.to_signed_bytes_le();
|
|
294
|
+
32 + bytes.len()
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
// Complex types
|
|
298
|
+
(List(items), DataType::List(field)) => {
|
|
299
|
+
let base_size = std::mem::size_of::<usize>() * 3;
|
|
300
|
+
if items.is_empty() {
|
|
301
|
+
base_size
|
|
302
|
+
} else {
|
|
303
|
+
// Sample up to 5 elements
|
|
304
|
+
let sample_count = items.len().min(5);
|
|
305
|
+
let sample_size: usize = items
|
|
306
|
+
.iter()
|
|
307
|
+
.take(sample_count)
|
|
308
|
+
.map(|item| {
|
|
309
|
+
self.estimate_value_size(item, field.data_type())
|
|
310
|
+
.unwrap_or(0)
|
|
311
|
+
})
|
|
312
|
+
.sum();
|
|
313
|
+
let avg_size = sample_size / sample_count;
|
|
314
|
+
base_size + (avg_size * items.len())
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
(Map(entries), DataType::Map(entries_field, _)) => {
|
|
319
|
+
if let DataType::Struct(fields) = entries_field.data_type() {
|
|
320
|
+
let base_size = std::mem::size_of::<usize>() * 4;
|
|
321
|
+
if entries.is_empty() || fields.len() < 2 {
|
|
322
|
+
base_size
|
|
323
|
+
} else {
|
|
324
|
+
// Sample up to 5 entries
|
|
325
|
+
let sample_count = entries.len().min(5);
|
|
326
|
+
let mut total_size = base_size;
|
|
327
|
+
|
|
328
|
+
for (key, val) in entries.iter().take(sample_count) {
|
|
329
|
+
total_size += self
|
|
330
|
+
.estimate_value_size(key, fields[0].data_type())
|
|
331
|
+
.unwrap_or(0);
|
|
332
|
+
total_size += self
|
|
333
|
+
.estimate_value_size(val, fields[1].data_type())
|
|
334
|
+
.unwrap_or(0);
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
let avg_entry_size = (total_size - base_size) / sample_count;
|
|
338
|
+
base_size + (avg_entry_size * entries.len())
|
|
339
|
+
}
|
|
340
|
+
} else {
|
|
341
|
+
100 // Default estimate
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
(Record(fields), DataType::Struct(schema_fields)) => {
|
|
346
|
+
let base_size = std::mem::size_of::<usize>() * 3;
|
|
347
|
+
let field_sizes: usize = fields
|
|
348
|
+
.iter()
|
|
349
|
+
.zip(schema_fields.iter())
|
|
350
|
+
.map(|((_, val), field)| {
|
|
351
|
+
self.estimate_value_size(val, field.data_type())
|
|
352
|
+
.unwrap_or(0)
|
|
353
|
+
})
|
|
354
|
+
.sum();
|
|
355
|
+
base_size + field_sizes
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
_ => 100, // Default estimate for mismatched types
|
|
359
|
+
})
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
/// Update dynamic batch size based on current samples
|
|
363
|
+
fn update_batch_size(&mut self) {
|
|
364
|
+
if self.size_samples.is_empty() {
|
|
365
|
+
return;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
let total_size: usize = self.size_samples.iter().sum();
|
|
369
|
+
let avg_row_size = (total_size as f64 / self.size_samples.len() as f64).max(1.0);
|
|
370
|
+
let suggested_batch_size = (self.memory_threshold as f64 / avg_row_size).floor() as usize;
|
|
371
|
+
self.current_batch_size = dynamic_batch_size_for_column_count(
|
|
372
|
+
suggested_batch_size,
|
|
373
|
+
self.arrow_schema.fields().len(),
|
|
374
|
+
);
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/// Flush buffered rows to the Parquet file
|
|
378
|
+
fn flush_buffered_rows(&mut self) -> Result<()> {
|
|
379
|
+
if self.buffered_row_count == 0 {
|
|
380
|
+
return Ok(());
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
// Convert columns to Arrow arrays
|
|
384
|
+
let arrow_columns = self
|
|
385
|
+
.buffered_columns
|
|
386
|
+
.iter()
|
|
387
|
+
.zip(self.arrow_schema.fields())
|
|
388
|
+
.map(|(values, field)| parquet_values_to_arrow_array(values, field))
|
|
389
|
+
.collect::<Result<Vec<_>>>()?;
|
|
390
|
+
|
|
391
|
+
// Create RecordBatch
|
|
392
|
+
let batch = RecordBatch::try_new(self.arrow_schema.clone(), arrow_columns)?;
|
|
393
|
+
|
|
394
|
+
// Write the batch
|
|
395
|
+
if let Some(writer) = &mut self.arrow_writer {
|
|
396
|
+
writer.write(&batch)?;
|
|
397
|
+
|
|
398
|
+
let num_rows = self.buffered_row_count;
|
|
399
|
+
self.buffered_row_count = 0;
|
|
400
|
+
self.total_rows_written += num_rows;
|
|
401
|
+
let reserve_target = self.current_batch_size;
|
|
402
|
+
for column in &mut self.buffered_columns {
|
|
403
|
+
column.clear();
|
|
404
|
+
let additional_capacity = reserve_target.saturating_sub(column.capacity());
|
|
405
|
+
column.reserve(additional_capacity);
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// Check if we need to flush a completed row group to the
|
|
409
|
+
// destination. Raw staged bytes trip the threshold too: highly
|
|
410
|
+
// compressible data can sit far below the threshold once encoded,
|
|
411
|
+
// which would otherwise keep the whole file buffered until close.
|
|
412
|
+
if self.raw_bytes_since_flush >= self.memory_threshold
|
|
413
|
+
|| writer.in_progress_size() >= self.memory_threshold
|
|
414
|
+
|| writer.memory_size() >= self.memory_threshold
|
|
415
|
+
{
|
|
416
|
+
writer.flush()?;
|
|
417
|
+
self.raw_bytes_since_flush = 0;
|
|
418
|
+
}
|
|
419
|
+
} else {
|
|
420
|
+
return Err(ParquetError::Io(std::io::Error::new(
|
|
421
|
+
std::io::ErrorKind::Other,
|
|
422
|
+
"Writer has been closed",
|
|
423
|
+
)));
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
Ok(())
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
/// Write columns to the Parquet file
|
|
430
|
+
///
|
|
431
|
+
/// Each element is a tuple of (column_name, values)
|
|
432
|
+
pub fn write_columns(&mut self, columns: Vec<(String, Vec<ParquetValue>)>) -> Result<()> {
|
|
433
|
+
self.flush_buffered_rows()?;
|
|
434
|
+
|
|
435
|
+
if columns.is_empty() {
|
|
436
|
+
return Ok(());
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
// Verify column names match schema
|
|
440
|
+
let schema_fields = self.arrow_schema.fields();
|
|
441
|
+
if columns.len() != schema_fields.len() {
|
|
442
|
+
return Err(ParquetError::Schema(format!(
|
|
443
|
+
"Provided {} columns but schema has {} fields",
|
|
444
|
+
columns.len(),
|
|
445
|
+
schema_fields.len()
|
|
446
|
+
)));
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
let mut columns_by_name = HashMap::with_capacity(columns.len());
|
|
450
|
+
for (name, values) in columns {
|
|
451
|
+
match columns_by_name.entry(name) {
|
|
452
|
+
Entry::Vacant(entry) => {
|
|
453
|
+
entry.insert(values);
|
|
454
|
+
}
|
|
455
|
+
Entry::Occupied(entry) => {
|
|
456
|
+
return Err(ParquetError::Schema(format!(
|
|
457
|
+
"Duplicate column: {}",
|
|
458
|
+
entry.key()
|
|
459
|
+
)));
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
// Anchor the expected length to the first schema column and report
|
|
465
|
+
// mismatches in schema order, so the error is deterministic regardless
|
|
466
|
+
// of HashMap iteration order.
|
|
467
|
+
let expected_len = schema_fields
|
|
468
|
+
.first()
|
|
469
|
+
.and_then(|field| columns_by_name.get(field.name().as_str()))
|
|
470
|
+
.map_or(0, Vec::len);
|
|
471
|
+
for field in schema_fields {
|
|
472
|
+
if let Some(values) = columns_by_name.get(field.name().as_str()) {
|
|
473
|
+
if values.len() != expected_len {
|
|
474
|
+
return Err(ParquetError::Schema(format!(
|
|
475
|
+
"Column '{}' has {} values but expected {}",
|
|
476
|
+
field.name(),
|
|
477
|
+
values.len(),
|
|
478
|
+
expected_len
|
|
479
|
+
)));
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// Sort columns to match schema order and convert to arrays
|
|
485
|
+
let mut arrow_columns = Vec::with_capacity(schema_fields.len());
|
|
486
|
+
let mut batch_raw_bytes: usize = 0;
|
|
487
|
+
|
|
488
|
+
for field in schema_fields {
|
|
489
|
+
let values = columns_by_name
|
|
490
|
+
.remove(field.name().as_str())
|
|
491
|
+
.ok_or_else(|| ParquetError::Schema(format!("Missing column: {}", field.name())))?;
|
|
492
|
+
|
|
493
|
+
for (idx, value) in values.iter().enumerate() {
|
|
494
|
+
validate_value_against_field(
|
|
495
|
+
value,
|
|
496
|
+
field,
|
|
497
|
+
&format!("column '{}'[{}]", field.name(), idx),
|
|
498
|
+
)?;
|
|
499
|
+
batch_raw_bytes = batch_raw_bytes
|
|
500
|
+
.saturating_add(self.estimate_value_size(value, field.data_type())?);
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
let array = parquet_values_to_arrow_array(&values, field)?;
|
|
504
|
+
arrow_columns.push(array);
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
// Create RecordBatch
|
|
508
|
+
let batch = RecordBatch::try_new(self.arrow_schema.clone(), arrow_columns)?;
|
|
509
|
+
|
|
510
|
+
// Write the batch
|
|
511
|
+
if let Some(writer) = &mut self.arrow_writer {
|
|
512
|
+
writer.write(&batch)?;
|
|
513
|
+
self.raw_bytes_since_flush = self.raw_bytes_since_flush.saturating_add(batch_raw_bytes);
|
|
514
|
+
|
|
515
|
+
// Check if we need to flush a completed row group, like the row
|
|
516
|
+
// path does; otherwise repeated write_columns calls accumulate
|
|
517
|
+
// every row group in memory until close.
|
|
518
|
+
if self.raw_bytes_since_flush >= self.memory_threshold
|
|
519
|
+
|| writer.in_progress_size() >= self.memory_threshold
|
|
520
|
+
|| writer.memory_size() >= self.memory_threshold
|
|
521
|
+
{
|
|
522
|
+
writer.flush()?;
|
|
523
|
+
self.raw_bytes_since_flush = 0;
|
|
524
|
+
}
|
|
525
|
+
} else {
|
|
526
|
+
return Err(ParquetError::Io(std::io::Error::new(
|
|
527
|
+
std::io::ErrorKind::Other,
|
|
528
|
+
"Writer has been closed",
|
|
529
|
+
)));
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
Ok(())
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
/// Flush any buffered data
|
|
536
|
+
pub fn flush(&mut self) -> Result<()> {
|
|
537
|
+
// First flush any buffered rows
|
|
538
|
+
self.flush_buffered_rows()?;
|
|
539
|
+
|
|
540
|
+
// Then flush the arrow writer
|
|
541
|
+
if let Some(writer) = &mut self.arrow_writer {
|
|
542
|
+
writer.flush()?;
|
|
543
|
+
}
|
|
544
|
+
self.raw_bytes_since_flush = 0;
|
|
545
|
+
Ok(())
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
/// Close the writer and write the file footer
|
|
549
|
+
///
|
|
550
|
+
/// This must be called to finalize the Parquet file
|
|
551
|
+
pub fn close(mut self) -> Result<()> {
|
|
552
|
+
// Flush any remaining buffered rows
|
|
553
|
+
self.flush_buffered_rows()?;
|
|
554
|
+
|
|
555
|
+
// Close the arrow writer
|
|
556
|
+
if let Some(writer) = self.arrow_writer.take() {
|
|
557
|
+
writer.close()?;
|
|
558
|
+
}
|
|
559
|
+
Ok(())
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
/// Validate a value against its field schema
|
|
564
|
+
fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str) -> Result<()> {
|
|
565
|
+
use ParquetValue::*;
|
|
566
|
+
|
|
567
|
+
// Null handling
|
|
568
|
+
if matches!(value, Null) {
|
|
569
|
+
if !field.is_nullable() {
|
|
570
|
+
return Err(ParquetError::Schema(format!(
|
|
571
|
+
"Found null value for non-nullable field at {}",
|
|
572
|
+
path
|
|
573
|
+
)));
|
|
574
|
+
}
|
|
575
|
+
return Ok(());
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
// Type validation
|
|
579
|
+
match (value, field.data_type()) {
|
|
580
|
+
// Boolean
|
|
581
|
+
(Boolean(_), DataType::Boolean) => Ok(()),
|
|
582
|
+
|
|
583
|
+
// Integer types
|
|
584
|
+
(Int8(_), DataType::Int8) => Ok(()),
|
|
585
|
+
(Int16(_), DataType::Int16) => Ok(()),
|
|
586
|
+
(Int32(_), DataType::Int32) => Ok(()),
|
|
587
|
+
(Int64(_), DataType::Int64) => Ok(()),
|
|
588
|
+
(UInt8(_), DataType::UInt8) => Ok(()),
|
|
589
|
+
(UInt16(_), DataType::UInt16) => Ok(()),
|
|
590
|
+
(UInt32(_), DataType::UInt32) => Ok(()),
|
|
591
|
+
(UInt64(_), DataType::UInt64) => Ok(()),
|
|
592
|
+
|
|
593
|
+
// Float types
|
|
594
|
+
(Float16(_), DataType::Float16) => Ok(()),
|
|
595
|
+
(Float32(_), DataType::Float32) => Ok(()),
|
|
596
|
+
(Float64(_), DataType::Float64) => Ok(()),
|
|
597
|
+
|
|
598
|
+
// String and binary
|
|
599
|
+
(String(_), DataType::Utf8) => Ok(()),
|
|
600
|
+
(Bytes(_), DataType::Binary) => Ok(()),
|
|
601
|
+
(Bytes(b), DataType::FixedSizeBinary(size)) => {
|
|
602
|
+
// Validate up front so a wrong-length value is rejected at write_row
|
|
603
|
+
// rather than poisoning the buffer at flush time.
|
|
604
|
+
if b.len() != *size as usize {
|
|
605
|
+
return Err(ParquetError::Schema(format!(
|
|
606
|
+
"Fixed size binary expected {} bytes, got {} at {}",
|
|
607
|
+
size,
|
|
608
|
+
b.len(),
|
|
609
|
+
path
|
|
610
|
+
)));
|
|
611
|
+
}
|
|
612
|
+
Ok(())
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
// Date/time types
|
|
616
|
+
(Date32(_), DataType::Date32) => Ok(()),
|
|
617
|
+
(Date64(_), DataType::Date64) => Ok(()),
|
|
618
|
+
(TimeMillis(_), DataType::Time32(_)) => Ok(()),
|
|
619
|
+
(TimeMicros(_), DataType::Time64(_)) => Ok(()),
|
|
620
|
+
(TimeNanos(_), DataType::Time64(_)) => Ok(()),
|
|
621
|
+
(TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
|
|
622
|
+
(TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
|
|
623
|
+
(TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
|
|
624
|
+
(TimestampNanos(_, _), DataType::Timestamp(_, _)) => Ok(()),
|
|
625
|
+
|
|
626
|
+
// Decimal types
|
|
627
|
+
(Decimal128(decimal, value_scale), DataType::Decimal128(precision, scale)) => {
|
|
628
|
+
validate_decimal128_schema(*decimal, *value_scale, *precision, *scale, path)
|
|
629
|
+
}
|
|
630
|
+
(Decimal256(decimal, value_scale), DataType::Decimal256(precision, scale)) => {
|
|
631
|
+
validate_decimal256_schema(decimal, *value_scale, *precision, *scale, path)
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// List type
|
|
635
|
+
(List(items), DataType::List(item_field)) => {
|
|
636
|
+
for (idx, item) in items.iter().enumerate() {
|
|
637
|
+
validate_value_against_field(item, item_field, &format!("{}[{}]", path, idx))?;
|
|
638
|
+
}
|
|
639
|
+
Ok(())
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
// Map type
|
|
643
|
+
(Map(entries), DataType::Map(entries_field, _)) => {
|
|
644
|
+
if let DataType::Struct(fields) = entries_field.data_type() {
|
|
645
|
+
if fields.len() >= 2 {
|
|
646
|
+
let key_field = &fields[0];
|
|
647
|
+
let value_field = &fields[1];
|
|
648
|
+
|
|
649
|
+
for (idx, (key, val)) in entries.iter().enumerate() {
|
|
650
|
+
validate_value_against_field(
|
|
651
|
+
key,
|
|
652
|
+
key_field,
|
|
653
|
+
&format!("{}.key[{}]", path, idx),
|
|
654
|
+
)?;
|
|
655
|
+
validate_value_against_field(
|
|
656
|
+
val,
|
|
657
|
+
value_field,
|
|
658
|
+
&format!("{}.value[{}]", path, idx),
|
|
659
|
+
)?;
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
Ok(())
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
// Struct type
|
|
667
|
+
(Record(record_fields), DataType::Struct(schema_fields)) => {
|
|
668
|
+
for field in schema_fields {
|
|
669
|
+
let field_name = field.name();
|
|
670
|
+
if let Some(value) = record_fields.get(field_name.as_str()) {
|
|
671
|
+
validate_value_against_field(
|
|
672
|
+
value,
|
|
673
|
+
field,
|
|
674
|
+
&format!("{}.{}", path, field_name),
|
|
675
|
+
)?;
|
|
676
|
+
} else if !field.is_nullable() {
|
|
677
|
+
return Err(ParquetError::Schema(format!(
|
|
678
|
+
"Required field '{}' is missing in struct at {}",
|
|
679
|
+
field_name, path
|
|
680
|
+
)));
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
Ok(())
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
// Type mismatch
|
|
687
|
+
(value, expected_type) => Err(ParquetError::Schema(format!(
|
|
688
|
+
"Type mismatch at {}: expected {:?}, got {:?}",
|
|
689
|
+
path,
|
|
690
|
+
expected_type,
|
|
691
|
+
value.type_name()
|
|
692
|
+
))),
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
/// Convert our Schema to Arrow Schema
|
|
697
|
+
fn schema_to_arrow(schema: &Schema) -> Result<StdArc<arrow_schema::Schema>> {
|
|
698
|
+
schema.validate().map_err(ParquetError::Schema)?;
|
|
699
|
+
match &schema.root {
|
|
700
|
+
SchemaNode::Struct { fields, .. } => {
|
|
701
|
+
let arrow_fields = fields
|
|
702
|
+
.iter()
|
|
703
|
+
.map(schema_node_to_arrow_field)
|
|
704
|
+
.collect::<Result<Vec<_>>>()?;
|
|
705
|
+
|
|
706
|
+
Ok(StdArc::new(arrow_schema::Schema::new(arrow_fields)))
|
|
707
|
+
}
|
|
708
|
+
_ => Err(ParquetError::Schema(
|
|
709
|
+
"Root schema node must be a struct".to_string(),
|
|
710
|
+
)),
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
|
|
714
|
+
fn validate_column_count(column_count: usize) -> Result<()> {
|
|
715
|
+
if column_count > MAX_BUFFERED_VALUE_SLOTS {
|
|
716
|
+
return Err(ParquetError::Schema(format!(
|
|
717
|
+
"Schema has {} columns, exceeding the writer buffer slot limit of {}",
|
|
718
|
+
column_count, MAX_BUFFERED_VALUE_SLOTS
|
|
719
|
+
)));
|
|
720
|
+
}
|
|
721
|
+
Ok(())
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
fn max_batch_size_for_column_count(column_count: usize) -> usize {
|
|
725
|
+
let width = column_count.max(1);
|
|
726
|
+
(MAX_BUFFERED_VALUE_SLOTS / width)
|
|
727
|
+
.max(1)
|
|
728
|
+
.min(MAX_BATCH_SIZE)
|
|
729
|
+
}
|
|
730
|
+
|
|
731
|
+
fn default_batch_size_for_column_count(column_count: usize) -> usize {
|
|
732
|
+
DEFAULT_BATCH_SIZE.min(max_batch_size_for_column_count(column_count))
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
fn validate_fixed_batch_size(batch_size: usize, column_count: usize) -> Result<usize> {
|
|
736
|
+
if batch_size == 0 {
|
|
737
|
+
return Err(ParquetError::Schema(
|
|
738
|
+
"batch_size must be greater than 0".to_string(),
|
|
739
|
+
));
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
let max_batch_size = max_batch_size_for_column_count(column_count);
|
|
743
|
+
if batch_size > max_batch_size {
|
|
744
|
+
return Err(ParquetError::Schema(format!(
|
|
745
|
+
"batch_size {} exceeds maximum {} for {} columns",
|
|
746
|
+
batch_size, max_batch_size, column_count
|
|
747
|
+
)));
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
Ok(batch_size)
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
fn validate_sample_size(sample_size: usize) -> Result<usize> {
|
|
754
|
+
if sample_size == 0 {
|
|
755
|
+
return Err(ParquetError::Schema(
|
|
756
|
+
"sample_size must be greater than 0".to_string(),
|
|
757
|
+
));
|
|
758
|
+
}
|
|
759
|
+
if sample_size > MAX_SAMPLE_SIZE {
|
|
760
|
+
return Err(ParquetError::Schema(format!(
|
|
761
|
+
"sample_size {} exceeds maximum {}",
|
|
762
|
+
sample_size, MAX_SAMPLE_SIZE
|
|
763
|
+
)));
|
|
764
|
+
}
|
|
765
|
+
Ok(sample_size)
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
fn dynamic_batch_size_for_column_count(suggested_batch_size: usize, column_count: usize) -> usize {
|
|
769
|
+
let max_batch_size = max_batch_size_for_column_count(column_count);
|
|
770
|
+
let min_batch_size = MIN_BATCH_SIZE.min(max_batch_size);
|
|
771
|
+
suggested_batch_size.clamp(min_batch_size, max_batch_size)
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
/// Convert a SchemaNode to an Arrow Field
|
|
775
|
+
fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
|
|
776
|
+
match node {
|
|
777
|
+
SchemaNode::Primitive {
|
|
778
|
+
name,
|
|
779
|
+
primitive_type,
|
|
780
|
+
nullable,
|
|
781
|
+
format,
|
|
782
|
+
} => {
|
|
783
|
+
let data_type = primitive_type_to_arrow(primitive_type)?;
|
|
784
|
+
let field = Field::new(name, data_type, *nullable);
|
|
785
|
+
let extended_field = if format.as_deref() == Some("uuid") {
|
|
786
|
+
field.with_extension_type(arrow_schema::extension::Uuid)
|
|
787
|
+
} else {
|
|
788
|
+
field
|
|
789
|
+
};
|
|
790
|
+
Ok(extended_field)
|
|
791
|
+
}
|
|
792
|
+
SchemaNode::List {
|
|
793
|
+
name,
|
|
794
|
+
item,
|
|
795
|
+
nullable,
|
|
796
|
+
} => {
|
|
797
|
+
let item_field = schema_node_to_arrow_field(item)?;
|
|
798
|
+
// Use the conventional Arrow list element name "item" rather than the
|
|
799
|
+
// schema node's internal name (e.g. "<field>_item"), so written files
|
|
800
|
+
// interoperate with external Parquet readers. The element's data type
|
|
801
|
+
// and nullability still come from the schema node.
|
|
802
|
+
let list_type = DataType::List(StdArc::new(Field::new(
|
|
803
|
+
"item",
|
|
804
|
+
item_field.data_type().clone(),
|
|
805
|
+
item_field.is_nullable(),
|
|
806
|
+
)));
|
|
807
|
+
Ok(Field::new(name, list_type, *nullable))
|
|
808
|
+
}
|
|
809
|
+
SchemaNode::Map {
|
|
810
|
+
name,
|
|
811
|
+
key,
|
|
812
|
+
value,
|
|
813
|
+
nullable,
|
|
814
|
+
} => {
|
|
815
|
+
let key_field = schema_node_to_arrow_field(key)?;
|
|
816
|
+
let value_field = schema_node_to_arrow_field(value)?;
|
|
817
|
+
|
|
818
|
+
let struct_fields = vec![
|
|
819
|
+
Field::new(
|
|
820
|
+
key_field.name().clone(),
|
|
821
|
+
key_field.data_type().clone(),
|
|
822
|
+
false,
|
|
823
|
+
),
|
|
824
|
+
Field::new(
|
|
825
|
+
value_field.name().clone(),
|
|
826
|
+
value_field.data_type().clone(),
|
|
827
|
+
value_field.is_nullable(),
|
|
828
|
+
),
|
|
829
|
+
];
|
|
830
|
+
|
|
831
|
+
let map_type = DataType::Map(
|
|
832
|
+
StdArc::new(Field::new(
|
|
833
|
+
"entries",
|
|
834
|
+
DataType::Struct(struct_fields.into()),
|
|
835
|
+
false,
|
|
836
|
+
)),
|
|
837
|
+
false, // keys_sorted
|
|
838
|
+
);
|
|
839
|
+
|
|
840
|
+
Ok(Field::new(name, map_type, *nullable))
|
|
841
|
+
}
|
|
842
|
+
SchemaNode::Struct {
|
|
843
|
+
name,
|
|
844
|
+
fields,
|
|
845
|
+
nullable,
|
|
846
|
+
} => {
|
|
847
|
+
let struct_fields = fields
|
|
848
|
+
.iter()
|
|
849
|
+
.map(schema_node_to_arrow_field)
|
|
850
|
+
.collect::<Result<Vec<_>>>()?;
|
|
851
|
+
|
|
852
|
+
let struct_type = DataType::Struct(struct_fields.into());
|
|
853
|
+
Ok(Field::new(name, struct_type, *nullable))
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
fn new_buffered_columns(
|
|
859
|
+
arrow_schema: &arrow_schema::Schema,
|
|
860
|
+
capacity: usize,
|
|
861
|
+
) -> Vec<Vec<ParquetValue>> {
|
|
862
|
+
let column_count = arrow_schema.fields().len();
|
|
863
|
+
debug_assert!(column_count <= MAX_BUFFERED_VALUE_SLOTS);
|
|
864
|
+
debug_assert!(capacity <= max_batch_size_for_column_count(column_count));
|
|
865
|
+
|
|
866
|
+
arrow_schema
|
|
867
|
+
.fields()
|
|
868
|
+
.iter()
|
|
869
|
+
.map(|_| Vec::with_capacity(capacity))
|
|
870
|
+
.collect()
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
fn validate_decimal128_schema(
|
|
874
|
+
value: i128,
|
|
875
|
+
value_scale: i8,
|
|
876
|
+
precision: u8,
|
|
877
|
+
scale: i8,
|
|
878
|
+
path: &str,
|
|
879
|
+
) -> Result<()> {
|
|
880
|
+
if value_scale != scale {
|
|
881
|
+
return Err(ParquetError::Schema(format!(
|
|
882
|
+
"Decimal scale mismatch at {}: schema scale {}, value scale {}",
|
|
883
|
+
path, scale, value_scale
|
|
884
|
+
)));
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
validate_decimal_precision(decimal128_digit_count(value), precision, path)
|
|
888
|
+
}
|
|
889
|
+
|
|
890
|
+
fn validate_decimal256_schema(
|
|
891
|
+
value: &num::BigInt,
|
|
892
|
+
value_scale: i8,
|
|
893
|
+
precision: u8,
|
|
894
|
+
scale: i8,
|
|
895
|
+
path: &str,
|
|
896
|
+
) -> Result<()> {
|
|
897
|
+
if value_scale != scale {
|
|
898
|
+
return Err(ParquetError::Schema(format!(
|
|
899
|
+
"Decimal scale mismatch at {}: schema scale {}, value scale {}",
|
|
900
|
+
path, scale, value_scale
|
|
901
|
+
)));
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
validate_decimal_precision(decimal256_digit_count(value), precision, path)
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
fn validate_decimal_precision(value_digits: usize, precision: u8, path: &str) -> Result<()> {
|
|
908
|
+
if value_digits > precision as usize {
|
|
909
|
+
return Err(ParquetError::Schema(format!(
|
|
910
|
+
"Decimal precision overflow at {}: schema precision {}, value has {} digits",
|
|
911
|
+
path, precision, value_digits
|
|
912
|
+
)));
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
Ok(())
|
|
916
|
+
}
|
|
917
|
+
|
|
918
|
+
/// Number of decimal digits in the magnitude of `value`.
///
/// The sign is ignored and zero counts as one digit. The count is computed
/// arithmetically, so no string is allocated per validated value.
fn decimal128_digit_count(value: i128) -> usize {
    match value.unsigned_abs().checked_ilog10() {
        // floor(log10(m)) + 1 digits; the exponent is at most 38 for a
        // 128-bit magnitude, so the widening cast cannot truncate.
        Some(exponent) => exponent as usize + 1,
        // `checked_ilog10` is `None` only for zero, which renders as "0".
        None => 1,
    }
}
|
|
921
|
+
|
|
922
|
+
fn decimal256_digit_count(value: &num::BigInt) -> usize {
|
|
923
|
+
value.to_str_radix(10).trim_start_matches('-').len()
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
/// Convert PrimitiveType to Arrow DataType
|
|
927
|
+
fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
|
|
928
|
+
use crate::PrimitiveType::*;
|
|
929
|
+
|
|
930
|
+
Ok(match ptype {
|
|
931
|
+
Boolean => DataType::Boolean,
|
|
932
|
+
Int8 => DataType::Int8,
|
|
933
|
+
Int16 => DataType::Int16,
|
|
934
|
+
Int32 => DataType::Int32,
|
|
935
|
+
Int64 => DataType::Int64,
|
|
936
|
+
UInt8 => DataType::UInt8,
|
|
937
|
+
UInt16 => DataType::UInt16,
|
|
938
|
+
UInt32 => DataType::UInt32,
|
|
939
|
+
UInt64 => DataType::UInt64,
|
|
940
|
+
Float32 => DataType::Float32,
|
|
941
|
+
Float64 => DataType::Float64,
|
|
942
|
+
String => DataType::Utf8,
|
|
943
|
+
Binary => DataType::Binary,
|
|
944
|
+
Date32 => DataType::Date32,
|
|
945
|
+
TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
|
|
946
|
+
TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
|
|
947
|
+
TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
|
|
948
|
+
TimestampMillis(tz) => DataType::Timestamp(
|
|
949
|
+
arrow_schema::TimeUnit::Millisecond,
|
|
950
|
+
// PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
|
|
951
|
+
// UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
|
|
952
|
+
tz.as_ref().map(|_| StdArc::from("UTC")),
|
|
953
|
+
),
|
|
954
|
+
TimestampMicros(tz) => DataType::Timestamp(
|
|
955
|
+
arrow_schema::TimeUnit::Microsecond,
|
|
956
|
+
// PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
|
|
957
|
+
// UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
|
|
958
|
+
tz.as_ref().map(|_| StdArc::from("UTC")),
|
|
959
|
+
),
|
|
960
|
+
Decimal128(precision, scale) => DataType::Decimal128(*precision, *scale),
|
|
961
|
+
Decimal256(precision, scale) => DataType::Decimal256(*precision, *scale),
|
|
962
|
+
Date64 => DataType::Date64,
|
|
963
|
+
TimestampSecond(tz) => DataType::Timestamp(
|
|
964
|
+
arrow_schema::TimeUnit::Second,
|
|
965
|
+
// PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
|
|
966
|
+
// UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
|
|
967
|
+
tz.as_ref().map(|_| StdArc::from("UTC")),
|
|
968
|
+
),
|
|
969
|
+
TimestampNanos(tz) => DataType::Timestamp(
|
|
970
|
+
arrow_schema::TimeUnit::Nanosecond,
|
|
971
|
+
// PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
|
|
972
|
+
// UTC-normalized storage (isAdjustedToUTC = true). Original timezone is lost.
|
|
973
|
+
tz.as_ref().map(|_| StdArc::from("UTC")),
|
|
974
|
+
),
|
|
975
|
+
FixedLenByteArray(len) => DataType::FixedSizeBinary(*len),
|
|
976
|
+
})
|
|
977
|
+
}
|
|
978
|
+
|
|
979
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::SchemaBuilder;
    use triomphe::Arc;

    /// A primitive schema node with no format annotation.
    fn primitive_node(
        name: &str,
        primitive_type: crate::PrimitiveType,
        nullable: bool,
    ) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// A schema whose non-nullable root struct is named "root" and holds `fields`.
    fn root_schema(fields: Vec<SchemaNode>) -> Schema {
        let root = SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields,
        };
        SchemaBuilder::new().with_root(root).build().unwrap()
    }

    /// A schema of `column_count` required Int64 columns named `field_<index>`.
    fn int64_schema(column_count: usize) -> Schema {
        let fields = (0..column_count)
            .map(|index| {
                primitive_node(
                    &format!("field_{index}"),
                    crate::PrimitiveType::Int64,
                    false,
                )
            })
            .collect();
        root_schema(fields)
    }

    fn single_int64_schema() -> Schema {
        int64_schema(1)
    }

    fn single_int64_writer(buffer: Vec<u8>) -> Writer<Vec<u8>> {
        Writer::new(buffer, single_int64_schema()).unwrap()
    }

    #[test]
    fn dynamic_batch_size_is_clamped_to_max() {
        let mut writer = single_int64_writer(Vec::new());

        // With a one-byte average row the estimate would chase the memory
        // threshold; the result has to stop at MAX_BATCH_SIZE.
        writer.size_samples = vec![1; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();
        assert_eq!(writer.current_batch_size, MAX_BATCH_SIZE);

        // A more realistic average lands inside the allowed range.
        let realistic_row_size = DEFAULT_MEMORY_THRESHOLD / 1000;
        writer.size_samples = vec![realistic_row_size; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();
        assert!(writer.current_batch_size <= MAX_BATCH_SIZE);
        assert!(writer.current_batch_size >= MIN_BATCH_SIZE);
    }

    #[test]
    fn dynamic_batch_size_is_clamped_to_width_bound() {
        let two_columns = int64_schema(2);
        let mut writer = WriterBuilder::new()
            .build(Vec::new(), two_columns)
            .unwrap();

        writer.size_samples = vec![1; MIN_SAMPLES_FOR_ESTIMATE];
        writer.update_batch_size();

        assert_eq!(
            writer.current_batch_size,
            max_batch_size_for_column_count(2)
        );
        assert_eq!(
            writer.current_batch_size * writer.buffered_columns.len(),
            MAX_BUFFERED_VALUE_SLOTS
        );
    }

    #[test]
    fn fixed_batch_size_preserves_small_user_value() {
        let builder = WriterBuilder::new().with_batch_size(1);
        let writer = builder.build(Vec::new(), single_int64_schema()).unwrap();

        assert_eq!(writer.current_batch_size, 1);
        assert_eq!(writer.buffered_columns[0].capacity(), 1);
    }

    #[test]
    fn oversized_fixed_batch_size_is_rejected_before_initial_buffer_allocation() {
        let builder = WriterBuilder::new().with_batch_size(MAX_BATCH_SIZE + 1);
        let result = builder.build(Vec::new(), single_int64_schema());

        assert!(result.is_err());
    }

    #[test]
    fn wide_schema_fixed_batch_size_is_rejected_by_total_slot_bound() {
        let builder = WriterBuilder::new().with_batch_size(MAX_BATCH_SIZE);
        let result = builder.build(Vec::new(), int64_schema(2));

        assert!(result.is_err());
    }

    #[test]
    fn sample_size_preserves_small_user_value() {
        let builder = WriterBuilder::new().with_sample_size(1);
        let writer = builder.build(Vec::new(), single_int64_schema()).unwrap();

        assert_eq!(writer.sample_size, 1);
        assert_eq!(writer.size_samples.capacity(), 1);
    }

    #[test]
    fn small_sample_size_updates_after_requested_sample_count() {
        let mut writer = WriterBuilder::new()
            .with_memory_threshold(128)
            .with_sample_size(1)
            .build(Vec::new(), single_int64_schema())
            .unwrap();

        let row = vec![ParquetValue::Int64(1)];
        writer.write_row(row).unwrap();

        assert_eq!(writer.size_samples.len(), 1);
        assert_eq!(
            writer.current_batch_size,
            dynamic_batch_size_for_column_count(16, 1)
        );
    }

    #[test]
    fn oversized_sample_size_is_rejected_before_initial_buffer_allocation() {
        let builder = WriterBuilder::new().with_sample_size(usize::MAX);
        let result = builder.build(Vec::new(), single_int64_schema());

        assert!(result.is_err());
    }

    #[test]
    fn test_writer_creation() {
        let schema = root_schema(vec![primitive_node(
            "id",
            crate::PrimitiveType::Int64,
            false,
        )]);

        let sink = Vec::new();
        let _writer = Writer::new(sink, schema).unwrap();
    }

    #[test]
    fn test_writer_builder() {
        let schema = root_schema(vec![primitive_node(
            "id",
            crate::PrimitiveType::Int64,
            false,
        )]);

        let sink = Vec::new();
        let compression = Compression::ZSTD(parquet::basic::ZstdLevel::default());
        let _writer = WriterBuilder::new()
            .with_compression(compression)
            .with_batch_size(500)
            .with_memory_threshold(50 * 1024 * 1024)
            .with_sample_size(50)
            .build(sink, schema)
            .unwrap();
    }

    #[test]
    fn test_buffered_writing() {
        let schema = root_schema(vec![
            primitive_node("id", crate::PrimitiveType::Int64, false),
            primitive_node("name", crate::PrimitiveType::String, true),
        ]);

        // A deliberately small batch so that several flushes happen.
        let sink = Vec::new();
        let mut writer = WriterBuilder::new()
            .with_batch_size(10)
            .build(sink, schema)
            .unwrap();

        // 25 rows with a batch size of 10: two full batches plus a remainder.
        for i in 0..25 {
            let row = vec![
                ParquetValue::Int64(i),
                ParquetValue::String(Arc::from(format!("row_{}", i))),
            ];
            writer.write_row(row).unwrap();
        }

        // Closing writes out whatever is still buffered.
        writer.close().unwrap();
    }

    #[test]
    fn test_row_size_estimation() {
        let schema = root_schema(vec![
            primitive_node("id", crate::PrimitiveType::Int64, false),
            primitive_node("data", crate::PrimitiveType::String, false),
        ]);

        let sink = Vec::new();
        let writer = Writer::new(sink, schema).unwrap();

        // One fixed-width value and one variable-width value.
        let row = vec![
            ParquetValue::Int64(12345),
            ParquetValue::String(Arc::from("Hello, World!")),
        ];

        let size = writer.estimate_row_size(&row).unwrap();
        assert!(size > 0);

        // Int64 = 8 bytes, String = 13 chars + overhead
        assert!(size >= 8 + 13);
    }
}
|