parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
use magnus::value::ReprValue;
|
|
2
|
+
use magnus::{Enumerator, Error as MagnusError, RArray, Ruby, TryConvert, Value};
|
|
3
|
+
use parquet_core::writer::WriterBuilder;
|
|
4
|
+
use parquet_core::Schema;
|
|
5
|
+
use std::io::{BufReader, BufWriter, Write};
|
|
6
|
+
use tempfile::NamedTempFile;
|
|
7
|
+
|
|
8
|
+
use crate::io::RubyIOWriter;
|
|
9
|
+
use crate::types::WriterOutput;
|
|
10
|
+
use crate::utils::parse_compression;
|
|
11
|
+
|
|
12
|
+
/// Fallback slice length used when rows are streamed from an Enumerable via
/// `each_slice` and the caller did not supply a `batch_size`.
///
/// Intended to match the core writer's default batch size, so that the number
/// of Ruby rows pulled per slice and the core row buffer are of similar
/// magnitude.
const DEFAULT_ROW_SLICE_SIZE: usize = 1_000;
|
|
16
|
+
|
|
17
|
+
/// User-supplied knobs controlling how rows are batched before being flushed.
///
/// The adapter does not interpret these values itself: every `Some` field is
/// forwarded to the core `WriterBuilder`, and every `None` leaves the core
/// writer's own default in place.
#[derive(Clone, Copy, Debug, Default)]
pub struct BatchSizingOptions {
    /// Forwarded to `WriterBuilder::with_batch_size` when set.
    pub batch_size: Option<usize>,
    /// Forwarded to `WriterBuilder::with_memory_threshold` when set.
    pub flush_threshold: Option<usize>,
    /// Forwarded to `WriterBuilder::with_sample_size` when set.
    pub sample_size: Option<usize>,
}
|
|
25
|
+
|
|
26
|
+
/// Create a writer based on the output type (file path or IO object), forwarding
|
|
27
|
+
/// the batch-sizing options to the core writer (the single source of truth).
|
|
28
|
+
pub fn create_writer(
|
|
29
|
+
ruby: &Ruby,
|
|
30
|
+
write_to: Value,
|
|
31
|
+
schema: Schema,
|
|
32
|
+
compression: Option<String>,
|
|
33
|
+
options: BatchSizingOptions,
|
|
34
|
+
) -> Result<WriterOutput, MagnusError> {
|
|
35
|
+
let mut builder = WriterBuilder::new().with_compression(parse_compression(ruby, compression)?);
|
|
36
|
+
if let Some(size) = options.batch_size {
|
|
37
|
+
builder = builder.with_batch_size(size);
|
|
38
|
+
}
|
|
39
|
+
if let Some(threshold) = options.flush_threshold {
|
|
40
|
+
builder = builder.with_memory_threshold(threshold);
|
|
41
|
+
}
|
|
42
|
+
if let Some(size) = options.sample_size {
|
|
43
|
+
builder = builder.with_sample_size(size);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
if write_to.is_kind_of(ruby.class_string()) {
|
|
47
|
+
// Direct file path
|
|
48
|
+
let path_str: String = TryConvert::try_convert(write_to)?;
|
|
49
|
+
let file = std::fs::File::create(&path_str)
|
|
50
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
51
|
+
let writer = builder
|
|
52
|
+
.build(file, schema)
|
|
53
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
54
|
+
Ok(WriterOutput::File(writer))
|
|
55
|
+
} else {
|
|
56
|
+
// IO-like object - create temporary file
|
|
57
|
+
let temp_file = NamedTempFile::new().map_err(|e| {
|
|
58
|
+
MagnusError::new(
|
|
59
|
+
ruby.exception_runtime_error(),
|
|
60
|
+
format!("Failed to create temporary file: {}", e),
|
|
61
|
+
)
|
|
62
|
+
})?;
|
|
63
|
+
|
|
64
|
+
// Clone the file handle for the writer
|
|
65
|
+
let file = temp_file.reopen().map_err(|e| {
|
|
66
|
+
MagnusError::new(
|
|
67
|
+
ruby.exception_runtime_error(),
|
|
68
|
+
format!("Failed to reopen temporary file: {}", e),
|
|
69
|
+
)
|
|
70
|
+
})?;
|
|
71
|
+
|
|
72
|
+
let writer = builder
|
|
73
|
+
.build(file, schema)
|
|
74
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
75
|
+
|
|
76
|
+
Ok(WriterOutput::TempFile(writer, temp_file, write_to))
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Finalize the writer and copy temp file to IO if needed
|
|
81
|
+
pub fn finalize_writer(ruby: &Ruby, writer_output: WriterOutput) -> Result<(), MagnusError> {
|
|
82
|
+
match writer_output {
|
|
83
|
+
WriterOutput::File(writer) => writer
|
|
84
|
+
.close()
|
|
85
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string())),
|
|
86
|
+
WriterOutput::TempFile(writer, temp_file, io_object) => {
|
|
87
|
+
// Close the writer first
|
|
88
|
+
writer
|
|
89
|
+
.close()
|
|
90
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
91
|
+
|
|
92
|
+
// Copy temp file to IO object
|
|
93
|
+
copy_temp_file_to_io(ruby, temp_file, io_object)
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// Copy temporary file contents to Ruby IO object
|
|
99
|
+
fn copy_temp_file_to_io(
|
|
100
|
+
ruby: &Ruby,
|
|
101
|
+
temp_file: NamedTempFile,
|
|
102
|
+
io_object: Value,
|
|
103
|
+
) -> Result<(), MagnusError> {
|
|
104
|
+
let file = temp_file.reopen().map_err(|e| {
|
|
105
|
+
MagnusError::new(
|
|
106
|
+
ruby.exception_runtime_error(),
|
|
107
|
+
format!("Failed to reopen temporary file: {}", e),
|
|
108
|
+
)
|
|
109
|
+
})?;
|
|
110
|
+
|
|
111
|
+
let mut buf_reader = BufReader::new(file);
|
|
112
|
+
let ruby_io_writer = RubyIOWriter::new(io_object);
|
|
113
|
+
let mut buf_writer = BufWriter::new(ruby_io_writer);
|
|
114
|
+
|
|
115
|
+
std::io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
|
|
116
|
+
MagnusError::new(
|
|
117
|
+
ruby.exception_runtime_error(),
|
|
118
|
+
format!("Failed to copy temp file to IO object: {}", e),
|
|
119
|
+
)
|
|
120
|
+
})?;
|
|
121
|
+
|
|
122
|
+
buf_writer.flush().map_err(|e| {
|
|
123
|
+
MagnusError::new(
|
|
124
|
+
ruby.exception_runtime_error(),
|
|
125
|
+
format!("Failed to flush IO object: {}", e),
|
|
126
|
+
)
|
|
127
|
+
})?;
|
|
128
|
+
|
|
129
|
+
// The temporary file will be automatically deleted when temp_file is dropped
|
|
130
|
+
Ok(())
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// The rows to write, either already materialized or pulled lazily from an
|
|
134
|
+
/// Enumerable in bounded slices so the whole input is never resident at once.
|
|
135
|
+
enum RowSource {
|
|
136
|
+
Materialized(RArray),
|
|
137
|
+
Streamed {
|
|
138
|
+
first_slice: Option<RArray>,
|
|
139
|
+
remaining: Enumerator,
|
|
140
|
+
},
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/// Classify `read_from` without draining it. Arrays are used as-is; anything
|
|
144
|
+
/// else that supports `each_slice` (Enumerator, any Enumerable) is streamed in
|
|
145
|
+
/// bounded slices; objects that only support `to_a` keep the legacy
|
|
146
|
+
/// materialize-first behavior.
|
|
147
|
+
fn row_source(
|
|
148
|
+
ruby: &Ruby,
|
|
149
|
+
read_from: Value,
|
|
150
|
+
batch_size: Option<usize>,
|
|
151
|
+
) -> Result<RowSource, MagnusError> {
|
|
152
|
+
if read_from.is_kind_of(ruby.class_array()) {
|
|
153
|
+
return Ok(RowSource::Materialized(TryConvert::try_convert(read_from)?));
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
if read_from.respond_to("each_slice", false)? {
|
|
157
|
+
// One fiber switch per slice keeps Enumerator overhead negligible
|
|
158
|
+
// while bounding how many Ruby rows are in flight at once.
|
|
159
|
+
let slice_size = batch_size.unwrap_or(DEFAULT_ROW_SLICE_SIZE);
|
|
160
|
+
let mut remaining = read_from.enumeratorize("each_slice", (slice_size,));
|
|
161
|
+
let first_slice = match remaining.next() {
|
|
162
|
+
Some(slice) => Some(TryConvert::try_convert(slice?)?),
|
|
163
|
+
None => None,
|
|
164
|
+
};
|
|
165
|
+
return Ok(RowSource::Streamed {
|
|
166
|
+
first_slice,
|
|
167
|
+
remaining,
|
|
168
|
+
});
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
if read_from.respond_to("to_a", false)? {
|
|
172
|
+
let array_value: Value = read_from.funcall("to_a", ())?;
|
|
173
|
+
return Ok(RowSource::Materialized(TryConvert::try_convert(
|
|
174
|
+
array_value,
|
|
175
|
+
)?));
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
Err(MagnusError::new(
|
|
179
|
+
ruby.exception_type_error(),
|
|
180
|
+
"data must be an array or respond to 'to_a'",
|
|
181
|
+
))
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/// Map a value-conversion failure to the Ruby exception the write API raises:
|
|
185
|
+
/// encoding problems surface as EncodingError, everything else RuntimeError.
|
|
186
|
+
fn conversion_error(ruby: &Ruby, error_msg: String) -> MagnusError {
|
|
187
|
+
if error_msg.contains("EncodingError") || error_msg.contains("invalid utf-8") {
|
|
188
|
+
// Extract the actual encoding error message
|
|
189
|
+
if let Some(pos) = error_msg.find("EncodingError: ") {
|
|
190
|
+
let encoding_msg = error_msg[pos + 15..].to_string();
|
|
191
|
+
MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
|
|
192
|
+
} else {
|
|
193
|
+
MagnusError::new(ruby.exception_encoding_error(), error_msg)
|
|
194
|
+
}
|
|
195
|
+
} else {
|
|
196
|
+
MagnusError::new(ruby.exception_runtime_error(), error_msg)
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/// Convert one slice of Ruby rows and write them through the core writer.
|
|
201
|
+
/// Returns the number of rows written.
|
|
202
|
+
///
|
|
203
|
+
/// `release_consumed` should be true only when `rows` is an array this module
|
|
204
|
+
/// created itself (an `each_slice` slice): each element is dropped from the
|
|
205
|
+
/// slice once converted, so large rows become collectable while the rest of
|
|
206
|
+
/// the slice is still being written. User-provided arrays must not be mutated.
|
|
207
|
+
fn write_row_slice(
|
|
208
|
+
ruby: &Ruby,
|
|
209
|
+
writer_output: &mut WriterOutput,
|
|
210
|
+
converter: &mut crate::converter::RubyValueConverter,
|
|
211
|
+
field_schemas: &[parquet_core::SchemaNode],
|
|
212
|
+
rows: RArray,
|
|
213
|
+
release_consumed: bool,
|
|
214
|
+
) -> Result<u64, MagnusError> {
|
|
215
|
+
let mut rows_written = 0u64;
|
|
216
|
+
|
|
217
|
+
for (row_idx, row_value) in rows.into_iter().enumerate() {
|
|
218
|
+
// Convert Ruby row to ParquetValue vector
|
|
219
|
+
let row = if row_value.is_kind_of(ruby.class_array()) {
|
|
220
|
+
let array: RArray = TryConvert::try_convert(row_value)?;
|
|
221
|
+
let mut values = Vec::with_capacity(array.len());
|
|
222
|
+
|
|
223
|
+
for (idx, item) in array.into_iter().enumerate() {
|
|
224
|
+
let schema_hint = field_schemas.get(idx);
|
|
225
|
+
let pq_value = converter
|
|
226
|
+
.to_parquet_with_schema_hint(item, schema_hint)
|
|
227
|
+
.map_err(|e| conversion_error(ruby, e.to_string()))?;
|
|
228
|
+
values.push(pq_value);
|
|
229
|
+
}
|
|
230
|
+
values
|
|
231
|
+
} else {
|
|
232
|
+
return Err(MagnusError::new(
|
|
233
|
+
ruby.exception_type_error(),
|
|
234
|
+
"each row must be an array",
|
|
235
|
+
));
|
|
236
|
+
};
|
|
237
|
+
|
|
238
|
+
if release_consumed {
|
|
239
|
+
rows.store(row_idx as isize, ruby.qnil())?;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
match writer_output {
|
|
243
|
+
WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
|
|
244
|
+
writer
|
|
245
|
+
.write_row(row)
|
|
246
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
rows_written += 1;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
Ok(rows_written)
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
/// Write data in row format to a parquet file
|
|
256
|
+
pub fn write_rows(
|
|
257
|
+
ruby: &Ruby,
|
|
258
|
+
write_args: crate::types::ParquetWriteArgs,
|
|
259
|
+
) -> Result<Value, MagnusError> {
|
|
260
|
+
use crate::converter::RubyValueConverter;
|
|
261
|
+
use crate::logger::RubyLogger;
|
|
262
|
+
use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
|
|
263
|
+
use crate::string_cache::StringCache;
|
|
264
|
+
|
|
265
|
+
// Read rows lazily where possible: draining an Enumerator up front would
|
|
266
|
+
// hold the entire dataset in memory for the duration of the write.
|
|
267
|
+
let source = row_source(ruby, write_args.read_from, write_args.batch_size)?;
|
|
268
|
+
|
|
269
|
+
// Schema inference (used when `schema` is nil/empty) only inspects the
|
|
270
|
+
// first row, so the first slice is enough.
|
|
271
|
+
let empty_rows;
|
|
272
|
+
let inference_rows = match &source {
|
|
273
|
+
RowSource::Materialized(rows) => rows,
|
|
274
|
+
RowSource::Streamed {
|
|
275
|
+
first_slice: Some(rows),
|
|
276
|
+
..
|
|
277
|
+
} => rows,
|
|
278
|
+
RowSource::Streamed {
|
|
279
|
+
first_slice: None, ..
|
|
280
|
+
} => {
|
|
281
|
+
empty_rows = ruby.ary_new();
|
|
282
|
+
&empty_rows
|
|
283
|
+
}
|
|
284
|
+
};
|
|
285
|
+
|
|
286
|
+
// Process schema value
|
|
287
|
+
let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(inference_rows))
|
|
288
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
289
|
+
|
|
290
|
+
// Create schema
|
|
291
|
+
let schema = ruby_schema_to_parquet(schema_hash)
|
|
292
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
293
|
+
|
|
294
|
+
// Extract field schemas for conversion hints
|
|
295
|
+
let field_schemas = extract_field_schemas(&schema);
|
|
296
|
+
|
|
297
|
+
// Create writer. All batch sizing and flushing is owned by the core writer;
|
|
298
|
+
// the user's batch_size/flush_threshold/sample_size are forwarded to it.
|
|
299
|
+
let mut writer_output = create_writer(
|
|
300
|
+
ruby,
|
|
301
|
+
write_args.write_to,
|
|
302
|
+
schema.clone(),
|
|
303
|
+
write_args.compression,
|
|
304
|
+
BatchSizingOptions {
|
|
305
|
+
batch_size: write_args.batch_size,
|
|
306
|
+
flush_threshold: write_args.flush_threshold,
|
|
307
|
+
sample_size: write_args.sample_size,
|
|
308
|
+
},
|
|
309
|
+
)?;
|
|
310
|
+
|
|
311
|
+
// Create logger
|
|
312
|
+
let logger = RubyLogger::new(write_args.logger)?;
|
|
313
|
+
let _ = logger.info(|| "Starting to write parquet file".to_string());
|
|
314
|
+
|
|
315
|
+
// Create converter with string cache if enabled. `string_cache` is the
|
|
316
|
+
// requested capacity (None = disabled).
|
|
317
|
+
let mut converter = if let Some(capacity) = write_args.string_cache {
|
|
318
|
+
let _ = logger.debug(|| format!("String cache enabled (capacity {})", capacity));
|
|
319
|
+
RubyValueConverter::with_string_cache(StringCache::new(capacity))
|
|
320
|
+
} else {
|
|
321
|
+
RubyValueConverter::new()
|
|
322
|
+
};
|
|
323
|
+
|
|
324
|
+
// Stream each row to the core writer, which buffers and flushes internally
|
|
325
|
+
// according to its (now sole) batch-sizing policy. Completed row groups
|
|
326
|
+
// reach the destination while later rows are still being produced.
|
|
327
|
+
let mut total_rows = 0u64;
|
|
328
|
+
|
|
329
|
+
match source {
|
|
330
|
+
RowSource::Materialized(rows) => {
|
|
331
|
+
total_rows += write_row_slice(
|
|
332
|
+
ruby,
|
|
333
|
+
&mut writer_output,
|
|
334
|
+
&mut converter,
|
|
335
|
+
&field_schemas,
|
|
336
|
+
rows,
|
|
337
|
+
false,
|
|
338
|
+
)?;
|
|
339
|
+
}
|
|
340
|
+
RowSource::Streamed {
|
|
341
|
+
first_slice,
|
|
342
|
+
remaining,
|
|
343
|
+
} => {
|
|
344
|
+
if let Some(rows) = first_slice {
|
|
345
|
+
total_rows += write_row_slice(
|
|
346
|
+
ruby,
|
|
347
|
+
&mut writer_output,
|
|
348
|
+
&mut converter,
|
|
349
|
+
&field_schemas,
|
|
350
|
+
rows,
|
|
351
|
+
true,
|
|
352
|
+
)?;
|
|
353
|
+
}
|
|
354
|
+
for slice in remaining {
|
|
355
|
+
let rows: RArray = TryConvert::try_convert(slice?)?;
|
|
356
|
+
total_rows += write_row_slice(
|
|
357
|
+
ruby,
|
|
358
|
+
&mut writer_output,
|
|
359
|
+
&mut converter,
|
|
360
|
+
&field_schemas,
|
|
361
|
+
rows,
|
|
362
|
+
true,
|
|
363
|
+
)?;
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
// The core writer flushes any remaining buffered rows when closed by
|
|
369
|
+
// finalize_writer below.
|
|
370
|
+
let _ = logger.info(|| format!("Finished writing {} rows to parquet file", total_rows));
|
|
371
|
+
|
|
372
|
+
// Log string cache statistics if enabled. `misses` is exact even after the
|
|
373
|
+
// bounded cache fills; exact distinct cardinality would require an unbounded
|
|
374
|
+
// side table, so the log labels it as misses rather than unique strings.
|
|
375
|
+
if let Some(stats) = converter.string_cache_stats() {
|
|
376
|
+
let _ = logger.info(|| {
|
|
377
|
+
format!(
|
|
378
|
+
"String cache stats: {} cache misses, {} hits ({:.1}% hit rate)",
|
|
379
|
+
stats.misses,
|
|
380
|
+
stats.hits,
|
|
381
|
+
stats.hit_rate * 100.0
|
|
382
|
+
)
|
|
383
|
+
});
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
// Finalize the writer
|
|
387
|
+
finalize_writer(ruby, writer_output)?;
|
|
388
|
+
|
|
389
|
+
Ok(ruby.qnil().as_value())
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/// The column batches to write, either already materialized or pulled lazily
|
|
393
|
+
/// from an Enumerable one batch at a time.
|
|
394
|
+
enum BatchSource {
|
|
395
|
+
Materialized(RArray),
|
|
396
|
+
Streamed {
|
|
397
|
+
first_batch: Option<Value>,
|
|
398
|
+
remaining: Enumerator,
|
|
399
|
+
},
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
/// Classify `read_from` without draining it, mirroring `row_source`. Batches
|
|
403
|
+
/// are chunky (whole columns), so streamed sources are pulled one batch per
|
|
404
|
+
/// fiber switch via `each`.
|
|
405
|
+
fn batch_source(ruby: &Ruby, read_from: Value) -> Result<BatchSource, MagnusError> {
|
|
406
|
+
if read_from.is_kind_of(ruby.class_array()) {
|
|
407
|
+
return Ok(BatchSource::Materialized(TryConvert::try_convert(
|
|
408
|
+
read_from,
|
|
409
|
+
)?));
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
if read_from.respond_to("each", false)? {
|
|
413
|
+
let mut remaining = read_from.enumeratorize("each", ());
|
|
414
|
+
let first_batch = remaining.next().transpose()?;
|
|
415
|
+
return Ok(BatchSource::Streamed {
|
|
416
|
+
first_batch,
|
|
417
|
+
remaining,
|
|
418
|
+
});
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
if read_from.respond_to("to_a", false)? {
|
|
422
|
+
let array_value: Value = read_from.funcall("to_a", ())?;
|
|
423
|
+
return Ok(BatchSource::Materialized(TryConvert::try_convert(
|
|
424
|
+
array_value,
|
|
425
|
+
)?));
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
Err(MagnusError::new(
|
|
429
|
+
ruby.exception_type_error(),
|
|
430
|
+
"data must be an array or respond to 'to_a'",
|
|
431
|
+
))
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
/// Convert one Ruby batch (an array of per-column value arrays) and write it
|
|
435
|
+
/// through the core writer as a record batch. Returns the batch's row count.
|
|
436
|
+
fn write_column_batch(
|
|
437
|
+
ruby: &Ruby,
|
|
438
|
+
writer_output: &mut WriterOutput,
|
|
439
|
+
field_schemas: &[parquet_core::SchemaNode],
|
|
440
|
+
column_names: &[String],
|
|
441
|
+
batch: Value,
|
|
442
|
+
) -> Result<usize, MagnusError> {
|
|
443
|
+
use crate::converter::RubyValueConverter;
|
|
444
|
+
|
|
445
|
+
if !batch.is_kind_of(ruby.class_array()) {
|
|
446
|
+
return Err(MagnusError::new(
|
|
447
|
+
ruby.exception_type_error(),
|
|
448
|
+
"each batch must be an array of column values",
|
|
449
|
+
));
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
let batch_array: RArray = TryConvert::try_convert(batch)?;
|
|
453
|
+
|
|
454
|
+
// Verify batch has the right number of columns
|
|
455
|
+
if batch_array.len() != column_names.len() {
|
|
456
|
+
return Err(MagnusError::new(
|
|
457
|
+
ruby.exception_runtime_error(),
|
|
458
|
+
format!(
|
|
459
|
+
"Batch has {} columns but schema has {}",
|
|
460
|
+
batch_array.len(),
|
|
461
|
+
column_names.len()
|
|
462
|
+
),
|
|
463
|
+
));
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
let mut batch_columns: Vec<(String, Vec<parquet_core::ParquetValue>)> =
|
|
467
|
+
Vec::with_capacity(column_names.len());
|
|
468
|
+
|
|
469
|
+
// Process each column in the batch
|
|
470
|
+
for (col_idx, column_values) in batch_array.into_iter().enumerate() {
|
|
471
|
+
if !column_values.is_kind_of(ruby.class_array()) {
|
|
472
|
+
return Err(MagnusError::new(
|
|
473
|
+
ruby.exception_type_error(),
|
|
474
|
+
format!("Column {} values must be an array", col_idx),
|
|
475
|
+
));
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
let values_array: RArray = TryConvert::try_convert(column_values)?;
|
|
479
|
+
|
|
480
|
+
// Convert and append values
|
|
481
|
+
let mut converter = RubyValueConverter::new();
|
|
482
|
+
let schema_hint = field_schemas.get(col_idx);
|
|
483
|
+
|
|
484
|
+
let mut values = Vec::with_capacity(values_array.len());
|
|
485
|
+
for value in values_array.into_iter() {
|
|
486
|
+
let pq_value = converter
|
|
487
|
+
.to_parquet_with_schema_hint(value, schema_hint)
|
|
488
|
+
.map_err(|e| conversion_error(ruby, e.to_string()))?;
|
|
489
|
+
values.push(pq_value);
|
|
490
|
+
}
|
|
491
|
+
batch_columns.push((column_names[col_idx].clone(), values));
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
let batch_rows = batch_columns
|
|
495
|
+
.first()
|
|
496
|
+
.map(|(_name, values)| values.len())
|
|
497
|
+
.unwrap_or(0);
|
|
498
|
+
|
|
499
|
+
// Write this batch immediately; the core writer flushes completed row
|
|
500
|
+
// groups to the destination once its in-progress buffer exceeds the
|
|
501
|
+
// flush threshold.
|
|
502
|
+
match writer_output {
|
|
503
|
+
WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
|
|
504
|
+
writer
|
|
505
|
+
.write_columns(batch_columns)
|
|
506
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
Ok(batch_rows)
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
/// Write data in column format to a parquet file
|
|
514
|
+
pub fn write_columns(
|
|
515
|
+
ruby: &Ruby,
|
|
516
|
+
write_args: crate::types::ParquetWriteArgs,
|
|
517
|
+
) -> Result<Value, MagnusError> {
|
|
518
|
+
use crate::logger::RubyLogger;
|
|
519
|
+
use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
|
|
520
|
+
|
|
521
|
+
let logger = RubyLogger::new(write_args.logger)?;
|
|
522
|
+
|
|
523
|
+
// Read batches lazily where possible: draining an Enumerator up front
|
|
524
|
+
// would hold every batch in memory for the duration of the write.
|
|
525
|
+
let source = batch_source(ruby, write_args.read_from)?;
|
|
526
|
+
|
|
527
|
+
// Schema inference (used when `schema` is nil/empty) only inspects the
|
|
528
|
+
// first batch, so one batch is enough.
|
|
529
|
+
let first_batch_holder;
|
|
530
|
+
let inference_batches = match &source {
|
|
531
|
+
BatchSource::Materialized(batches) => batches,
|
|
532
|
+
BatchSource::Streamed { first_batch, .. } => {
|
|
533
|
+
first_batch_holder = ruby.ary_new();
|
|
534
|
+
if let Some(batch) = first_batch {
|
|
535
|
+
first_batch_holder.push(*batch)?;
|
|
536
|
+
}
|
|
537
|
+
&first_batch_holder
|
|
538
|
+
}
|
|
539
|
+
};
|
|
540
|
+
|
|
541
|
+
// Process schema value
|
|
542
|
+
let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(inference_batches))
|
|
543
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
544
|
+
|
|
545
|
+
// Create schema
|
|
546
|
+
let schema = ruby_schema_to_parquet(schema_hash)
|
|
547
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
548
|
+
|
|
549
|
+
// Extract field schemas for conversion hints
|
|
550
|
+
let field_schemas = extract_field_schemas(&schema);
|
|
551
|
+
|
|
552
|
+
// Create writer. The columnar path writes one record batch per write_columns
|
|
553
|
+
// call, so row batch-sizing options are rejected before this point.
|
|
554
|
+
let mut writer_output = create_writer(
|
|
555
|
+
ruby,
|
|
556
|
+
write_args.write_to,
|
|
557
|
+
schema.clone(),
|
|
558
|
+
write_args.compression,
|
|
559
|
+
BatchSizingOptions {
|
|
560
|
+
batch_size: None,
|
|
561
|
+
flush_threshold: write_args.flush_threshold,
|
|
562
|
+
sample_size: None,
|
|
563
|
+
},
|
|
564
|
+
)?;
|
|
565
|
+
let _ = logger.info(|| "Starting to write parquet file columns".to_string());
|
|
566
|
+
|
|
567
|
+
// Get column names from schema
|
|
568
|
+
let column_names: Vec<String> =
|
|
569
|
+
if let parquet_core::SchemaNode::Struct { fields, .. } = &schema.root {
|
|
570
|
+
fields.iter().map(|f| f.name().to_string()).collect()
|
|
571
|
+
} else {
|
|
572
|
+
return Err(MagnusError::new(
|
|
573
|
+
ruby.exception_runtime_error(),
|
|
574
|
+
"Schema root must be a struct",
|
|
575
|
+
));
|
|
576
|
+
};
|
|
577
|
+
|
|
578
|
+
// Convert and write each batch as it arrives; completed row groups are
|
|
579
|
+
// flushed to the destination instead of accumulating every batch first.
|
|
580
|
+
let mut total_rows: usize = 0;
|
|
581
|
+
|
|
582
|
+
match source {
|
|
583
|
+
BatchSource::Materialized(batches) => {
|
|
584
|
+
for batch in batches.into_iter() {
|
|
585
|
+
total_rows += write_column_batch(
|
|
586
|
+
ruby,
|
|
587
|
+
&mut writer_output,
|
|
588
|
+
&field_schemas,
|
|
589
|
+
&column_names,
|
|
590
|
+
batch,
|
|
591
|
+
)?;
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
BatchSource::Streamed {
|
|
595
|
+
first_batch,
|
|
596
|
+
remaining,
|
|
597
|
+
} => {
|
|
598
|
+
if let Some(batch) = first_batch {
|
|
599
|
+
total_rows += write_column_batch(
|
|
600
|
+
ruby,
|
|
601
|
+
&mut writer_output,
|
|
602
|
+
&field_schemas,
|
|
603
|
+
&column_names,
|
|
604
|
+
batch,
|
|
605
|
+
)?;
|
|
606
|
+
}
|
|
607
|
+
for batch in remaining {
|
|
608
|
+
total_rows += write_column_batch(
|
|
609
|
+
ruby,
|
|
610
|
+
&mut writer_output,
|
|
611
|
+
&field_schemas,
|
|
612
|
+
&column_names,
|
|
613
|
+
batch?,
|
|
614
|
+
)?;
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
let _ = logger.info(|| format!("Finished writing {total_rows} rows to parquet file columns"));
|
|
620
|
+
|
|
621
|
+
// Finalize the writer
|
|
622
|
+
finalize_writer(ruby, writer_output)?;
|
|
623
|
+
|
|
624
|
+
Ok(ruby.qnil().as_value())
|
|
625
|
+
}
|