parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-ruby-adapter/src/writer.rs
ADDED
@@ -0,0 +1,435 @@
+use magnus::value::ReprValue;
+use magnus::{Error as MagnusError, Ruby, TryConvert, Value};
+use parquet::file::properties::WriterProperties;
+use parquet_core::Schema;
+use std::io::{BufReader, BufWriter, Write};
+use tempfile::NamedTempFile;
+
+use crate::io::RubyIOWriter;
+use crate::types::WriterOutput;
+use crate::utils::parse_compression;
+
+/// Create a writer based on the output type (file path or IO object)
+pub fn create_writer(
+    ruby: &Ruby,
+    write_to: Value,
+    schema: Schema,
+    compression: Option<String>,
+) -> Result<WriterOutput, MagnusError> {
+    let compression_setting = parse_compression(compression)?;
+    let props = WriterProperties::builder()
+        .set_compression(compression_setting)
+        .build();
+
+    if write_to.is_kind_of(ruby.class_string()) {
+        // Direct file path
+        let path_str: String = TryConvert::try_convert(write_to)?;
+        let file = std::fs::File::create(&path_str)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        let writer = parquet_core::writer::Writer::new_with_properties(file, schema, props)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        Ok(WriterOutput::File(writer))
+    } else {
+        // IO-like object - create temporary file
+        let temp_file = NamedTempFile::new().map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create temporary file: {}", e),
+            )
+        })?;
+
+        // Clone the file handle for the writer
+        let file = temp_file.reopen().map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to reopen temporary file: {}", e),
+            )
+        })?;
+
+        let writer = parquet_core::writer::Writer::new_with_properties(file, schema, props)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+        Ok(WriterOutput::TempFile(writer, temp_file, write_to))
+    }
+}
+
+/// Finalize the writer and copy temp file to IO if needed
+pub fn finalize_writer(writer_output: WriterOutput) -> Result<(), MagnusError> {
+    match writer_output {
+        WriterOutput::File(writer) => writer
+            .close()
+            .map_err(|e| MagnusError::new(magnus::exception::runtime_error(), e.to_string())),
+        WriterOutput::TempFile(writer, temp_file, io_object) => {
+            // Close the writer first
+            writer
+                .close()
+                .map_err(|e| MagnusError::new(magnus::exception::runtime_error(), e.to_string()))?;
+
+            // Copy temp file to IO object
+            copy_temp_file_to_io(temp_file, io_object)
+        }
+    }
+}
+
+/// Copy temporary file contents to Ruby IO object
+fn copy_temp_file_to_io(temp_file: NamedTempFile, io_object: Value) -> Result<(), MagnusError> {
+    let file = temp_file.reopen().map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to reopen temporary file: {}", e),
+        )
+    })?;
+
+    let mut buf_reader = BufReader::new(file);
+    let ruby_io_writer = RubyIOWriter::new(io_object);
+    let mut buf_writer = BufWriter::new(ruby_io_writer);
+
+    std::io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to copy temp file to IO object: {}", e),
+        )
+    })?;
+
+    buf_writer.flush().map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to flush IO object: {}", e),
+        )
+    })?;
+
+    // The temporary file will be automatically deleted when temp_file is dropped
+    Ok(())
+}
+
+/// Write data in row format to a parquet file
+pub fn write_rows(
+    ruby: &Ruby,
+    write_args: crate::types::ParquetWriteArgs,
+) -> Result<Value, MagnusError> {
+    use crate::batch_manager::BatchSizeManager;
+    use crate::converter::RubyValueConverter;
+    use crate::logger::RubyLogger;
+    use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
+    use crate::string_cache::StringCache;
+    use crate::utils::estimate_row_size;
+    use magnus::{RArray, TryConvert};
+
+    // Convert data to array if it isn't already
+    let data_array = if write_args.read_from.is_kind_of(ruby.class_array()) {
+        TryConvert::try_convert(write_args.read_from)?
+    } else if write_args.read_from.respond_to("to_a", false)? {
+        let array_value: Value = write_args.read_from.funcall("to_a", ())?;
+        TryConvert::try_convert(array_value)?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_type_error(),
+            "data must be an array or respond to 'to_a'",
+        ));
+    };
+
+    let data_array: RArray = data_array;
+
+    // Process schema value
+    let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(&data_array))
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+    // Create schema
+    let schema = ruby_schema_to_parquet(schema_hash)
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+    // Extract field schemas for conversion hints
+    let field_schemas = extract_field_schemas(&schema);
+
+    // Create writer
+    let mut writer_output = create_writer(
+        ruby,
+        write_args.write_to,
+        schema.clone(),
+        write_args.compression,
+    )?;
+
+    // Create logger
+    let logger = RubyLogger::new(write_args.logger)?;
+    let _ = logger.info(|| "Starting to write parquet file".to_string());
+
+    // Create batch size manager
+    let mut batch_manager = BatchSizeManager::new(
+        write_args.batch_size,
+        write_args.flush_threshold,
+        write_args.sample_size,
+    );
+
+    let _ = logger.debug(|| {
+        format!(
+            "Batch sizing: fixed_size={:?}, memory_threshold={}, sample_size={}",
+            batch_manager.fixed_batch_size,
+            batch_manager.memory_threshold,
+            batch_manager.sample_size
+        )
+    });
+
+    // Create converter with string cache if enabled
+    let mut converter = if write_args.string_cache.unwrap_or(false) {
+        let _ = logger.debug(|| "String cache enabled".to_string());
+        RubyValueConverter::with_string_cache(StringCache::new(true))
+    } else {
+        RubyValueConverter::new()
+    };
+
+    // Collect rows in batches
+    let mut batch = Vec::new();
+    let mut batch_memory_size = 0usize;
+    let mut total_rows = 0u64;
+
+    for row_value in data_array.into_iter() {
+        // Convert Ruby row to ParquetValue vector
+        let row = if row_value.is_kind_of(ruby.class_array()) {
+            let array: RArray = TryConvert::try_convert(row_value)?;
+            let mut values = Vec::with_capacity(array.len());
+
+            for (idx, item) in array.into_iter().enumerate() {
+                let schema_hint = field_schemas.get(idx);
+                let pq_value = converter
+                    .to_parquet_with_schema_hint(item, schema_hint)
+                    .map_err(|e| {
+                        let error_msg = e.to_string();
+                        // Check if this is an encoding error
+                        if error_msg.contains("EncodingError")
+                            || error_msg.contains("invalid utf-8")
+                        {
+                            // Extract the actual encoding error message
+                            if let Some(pos) = error_msg.find("EncodingError: ") {
+                                let encoding_msg = error_msg[pos + 15..].to_string();
+                                MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
+                            } else {
+                                MagnusError::new(ruby.exception_encoding_error(), error_msg)
+                            }
+                        } else {
+                            MagnusError::new(ruby.exception_runtime_error(), error_msg)
+                        }
+                    })?;
+                values.push(pq_value);
+            }
+            values
+        } else {
+            return Err(MagnusError::new(
+                ruby.exception_type_error(),
+                "each row must be an array",
+            ));
+        };
+
+        // Record row size for dynamic batch sizing
+        let row_size = estimate_row_size(&row);
+        batch_manager.record_row_size(row_size);
+        batch_memory_size += row_size;
+
+        batch.push(row);
+        total_rows += 1;
+
+        // Log sampling progress
+        if batch_manager.row_size_samples.len() <= batch_manager.sample_size
+            && batch_manager.row_size_samples.len() % 10 == 0
+        {
+            let _ = logger.debug(|| {
+                format!(
+                    "Sampled {} rows, avg size: {} bytes, current batch size: {}",
+                    batch_manager.row_size_samples.len(),
+                    batch_manager.average_row_size(),
+                    batch_manager.current_batch_size
+                )
+            });
+        }
+
+        // Write batch if it reaches threshold
+        if batch_manager.should_flush(batch.len(), batch_memory_size) {
+            let _ = logger.info(|| format!("Writing batch of {} rows", batch.len()));
+            let _ = logger.debug(|| format!(
+                "Batch details: recent avg row size: {} bytes, current batch size: {}, actual memory: {} bytes",
+                batch_manager.recent_average_size(),
+                batch_manager.current_batch_size,
+                batch_memory_size
+            ));
+            match &mut writer_output {
+                WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+                    writer.write_rows(std::mem::take(&mut batch)).map_err(|e| {
+                        MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                    })?;
+                }
+            }
+            batch_memory_size = 0;
+        }
+    }
+
+    // Write remaining rows
+    if !batch.is_empty() {
+        let _ = logger.info(|| format!("Writing batch of {} rows", batch.len()));
+        let _ = logger.debug(|| format!("Final batch: {} rows", batch.len()));
+        match &mut writer_output {
+            WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+                writer
+                    .write_rows(batch)
+                    .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+            }
+        }
+    }
+
+    let _ = logger.info(|| format!("Finished writing {} rows to parquet file", total_rows));
+
+    // Log string cache statistics if enabled
+    if let Some(stats) = converter.string_cache_stats() {
+        let _ = logger.info(|| {
+            format!(
+                "String cache stats: {} unique strings, {} hits ({:.1}% hit rate)",
+                stats.size,
+                stats.hits,
+                stats.hit_rate * 100.0
+            )
+        });
+    }
+
+    // Finalize the writer
+    finalize_writer(writer_output)?;
+
+    Ok(ruby.qnil().as_value())
+}
+
+/// Write data in column format to a parquet file
+pub fn write_columns(
+    ruby: &Ruby,
+    write_args: crate::types::ParquetWriteArgs,
+) -> Result<Value, MagnusError> {
+    use crate::converter::RubyValueConverter;
+    use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
+    use magnus::{RArray, TryConvert};
+
+    // Convert data to array for processing
+    let data_array = if write_args.read_from.is_kind_of(ruby.class_array()) {
+        TryConvert::try_convert(write_args.read_from)?
+    } else if write_args.read_from.respond_to("to_a", false)? {
+        let array_value: Value = write_args.read_from.funcall("to_a", ())?;
+        TryConvert::try_convert(array_value)?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_type_error(),
+            "data must be an array or respond to 'to_a'",
+        ));
+    };
+
+    let data_array: RArray = data_array;
+
+    // Process schema value
+    let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(&data_array))
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+    // Create schema
+    let schema = ruby_schema_to_parquet(schema_hash)
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+    // Extract field schemas for conversion hints
+    let field_schemas = extract_field_schemas(&schema);
+
+    // Create writer
+    let mut writer_output = create_writer(
+        ruby,
+        write_args.write_to,
+        schema.clone(),
+        write_args.compression,
+    )?;
+
+    // Get column names from schema
+    let column_names: Vec<String> =
+        if let parquet_core::SchemaNode::Struct { fields, .. } = &schema.root {
+            fields.iter().map(|f| f.name().to_string()).collect()
+        } else {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Schema root must be a struct",
+            ));
+        };
+
+    // Convert data to columns format
+    let mut all_columns: Vec<(String, Vec<parquet_core::ParquetValue>)> = Vec::new();
+
+    // Process batches
+    for (batch_idx, batch) in data_array.into_iter().enumerate() {
+        if !batch.is_kind_of(ruby.class_array()) {
+            return Err(MagnusError::new(
+                ruby.exception_type_error(),
+                "each batch must be an array of column values",
+            ));
+        }
+
+        let batch_array: RArray = TryConvert::try_convert(batch)?;
+
+        // Verify batch has the right number of columns
+        if batch_array.len() != column_names.len() {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!(
+                    "Batch has {} columns but schema has {}",
+                    batch_array.len(),
+                    column_names.len()
+                ),
+            ));
+        }
+
+        // Process each column in the batch
+        for (col_idx, column_values) in batch_array.into_iter().enumerate() {
+            if !column_values.is_kind_of(ruby.class_array()) {
+                return Err(MagnusError::new(
+                    ruby.exception_type_error(),
+                    format!("Column {} values must be an array", col_idx),
+                ));
+            }
+
+            let values_array: RArray = TryConvert::try_convert(column_values)?;
+
+            // Initialize column vector on first batch
+            if batch_idx == 0 {
+                all_columns.push((column_names[col_idx].clone(), Vec::new()));
+            }
+
+            // Convert and append values
+            let mut converter = RubyValueConverter::new();
+            let schema_hint = field_schemas.get(col_idx);
+
+            for value in values_array.into_iter() {
+                let pq_value = converter
+                    .to_parquet_with_schema_hint(value, schema_hint)
+                    .map_err(|e| {
+                        let error_msg = e.to_string();
+                        // Check if this is an encoding error
+                        if error_msg.contains("EncodingError")
+                            || error_msg.contains("invalid utf-8")
+                        {
+                            // Extract the actual encoding error message
+                            if let Some(pos) = error_msg.find("EncodingError: ") {
+                                let encoding_msg = error_msg[pos + 15..].to_string();
+                                MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
+                            } else {
+                                MagnusError::new(ruby.exception_encoding_error(), error_msg)
+                            }
+                        } else {
+                            MagnusError::new(ruby.exception_runtime_error(), error_msg)
+                        }
+                    })?;
+                all_columns[col_idx].1.push(pq_value);
+            }
+        }
+    }
+
+    // Write the columns
+    match &mut writer_output {
+        WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+            writer
+                .write_columns(all_columns)
+                .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        }
+    }
+
+    // Finalize the writer
+    finalize_writer(writer_output)?;
+
+    Ok(ruby.qnil().as_value())
+}
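For orientation, this is how the two entry points surface on the Ruby side. A minimal sketch, assuming the gem's public Parquet.write_rows method, the Parquet::Schema.define DSL from lib/parquet/schema.rb below, and a compression: keyword that feeds the compression option above (consult the gem README for exact signatures). create_writer writes directly when write_to is a String path; otherwise it stages through a NamedTempFile that finalize_writer copies into the IO:

    require "parquet"
    require "stringio"

    # Hypothetical two-column schema; write_rows above requires each row to be an array.
    schema = Parquet::Schema.define do
      field :id, :int64, nullable: false
      field :name, :string
    end

    rows = [[1, "alice"], [2, "bob"]]

    # String path: the file is created and written directly.
    Parquet.write_rows(rows, schema: schema, write_to: "users.parquet")

    # IO-like object: data is staged in a temp file, then copied in and flushed.
    io = StringIO.new
    Parquet.write_rows(rows.each, schema: schema, write_to: io, compression: "zstd")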
data/lib/parquet/schema.rb
CHANGED
@@ -59,12 +59,31 @@ module Parquet
     # - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
     # - `format:` if you want to store some format string
     # - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
+    # - `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
+    # - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
     # - `nullable:` default to true if not specified
     def field(name, type, nullable: true, **kwargs, &block)
       field_hash = { name: name.to_s, type: type, nullable: !!nullable }

       # Possibly store a format if provided
       field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
+
+      # Handle timezone for timestamp types
+      if [:timestamp_second, :timestamp_millis, :timestamp_micros, :timestamp_nanos].include?(type)
+        # Support new has_timezone parameter (preferred)
+        if kwargs.key?(:has_timezone)
+          # If has_timezone is true, store "UTC" to indicate timezone presence
+          # If explicitly false, don't store timezone (indicates local/unzoned)
+          field_hash[:timezone] = "UTC" if kwargs[:has_timezone]
+        elsif kwargs.key?(:timezone)
+          # Legacy support: any timezone value means UTC storage
+          # Store "UTC" regardless of the actual value to make it clear
+          field_hash[:timezone] = "UTC"
+        else
+          # Default behavior when neither parameter is specified: UTC storage
+          field_hash[:timezone] = "UTC"
+        end
+      end

       case type
       when :struct
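The net effect for timestamp fields: field_hash[:timezone] is set to "UTC" in every case except an explicit has_timezone: false. A short sketch of the branches, assuming the Parquet::Schema.define DSL this file implements:

    schema = Parquet::Schema.define do
      field :created_at, :timestamp_millis                       # default        => timezone: "UTC"
      field :updated_at, :timestamp_micros, has_timezone: true   # explicit true  => timezone: "UTC"
      field :local_at,   :timestamp_millis, has_timezone: false  # explicit false => no :timezone key
      field :legacy_at,  :timestamp_millis, timezone: "+09:00"   # deprecated     => normalized to "UTC"
    end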
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.12
+  version: 0.6.0
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-07-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -54,33 +54,59 @@ files:
 - LICENSE
 - README.md
 - Rakefile
+- ext/parquet-core/Cargo.toml
+- ext/parquet-core/src/arrow_conversion.rs
+- ext/parquet-core/src/error.rs
+- ext/parquet-core/src/lib.rs
+- ext/parquet-core/src/reader.rs
+- ext/parquet-core/src/schema.rs
+- ext/parquet-core/src/test_utils.rs
+- ext/parquet-core/src/traits/mod.rs
+- ext/parquet-core/src/traits/schema.rs
+- ext/parquet-core/src/value.rs
+- ext/parquet-core/src/writer.rs
+- ext/parquet-core/tests/arrow_conversion_tests.rs
+- ext/parquet-core/tests/binary_data.rs
+- ext/parquet-core/tests/column_projection.rs
+- ext/parquet-core/tests/complex_types.rs
+- ext/parquet-core/tests/compression_tests.rs
+- ext/parquet-core/tests/concurrent_access.rs
+- ext/parquet-core/tests/decimal_tests.rs
+- ext/parquet-core/tests/edge_cases_corner_cases.rs
+- ext/parquet-core/tests/error_handling_comprehensive_tests.rs
+- ext/parquet-core/tests/null_handling_tests.rs
+- ext/parquet-core/tests/performance_memory.rs
+- ext/parquet-core/tests/primitive_types.rs
+- ext/parquet-core/tests/real_world_patterns.rs
+- ext/parquet-core/tests/roundtrip_correctness.rs
+- ext/parquet-core/tests/schema_comprehensive_tests.rs
+- ext/parquet-core/tests/temporal_tests.rs
+- ext/parquet-core/tests/test_helpers.rs
+- ext/parquet-core/tests/writer_tests.rs
+- ext/parquet-ruby-adapter/Cargo.toml
+- ext/parquet-ruby-adapter/build.rs
+- ext/parquet-ruby-adapter/examples/try_into_value_demo.rs
+- ext/parquet-ruby-adapter/src/batch_manager.rs
+- ext/parquet-ruby-adapter/src/chunk_reader.rs
+- ext/parquet-ruby-adapter/src/converter.rs
+- ext/parquet-ruby-adapter/src/error.rs
+- ext/parquet-ruby-adapter/src/io.rs
+- ext/parquet-ruby-adapter/src/lib.rs
+- ext/parquet-ruby-adapter/src/logger.rs
+- ext/parquet-ruby-adapter/src/metadata.rs
+- ext/parquet-ruby-adapter/src/reader.rs
+- ext/parquet-ruby-adapter/src/schema.rs
+- ext/parquet-ruby-adapter/src/string_cache.rs
+- ext/parquet-ruby-adapter/src/try_into_value.rs
+- ext/parquet-ruby-adapter/src/types.rs
+- ext/parquet-ruby-adapter/src/utils.rs
+- ext/parquet-ruby-adapter/src/writer.rs
 - ext/parquet/Cargo.toml
 - ext/parquet/build.rs
 - ext/parquet/extconf.rb
+- ext/parquet/src/adapter_ffi.rs
 - ext/parquet/src/allocator.rs
-- ext/parquet/src/enumerator.rs
-- ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
-- ext/parquet/src/logger.rs
-- ext/parquet/src/reader/common.rs
-- ext/parquet/src/reader/mod.rs
-- ext/parquet/src/reader/parquet_column_reader.rs
-- ext/parquet/src/reader/parquet_row_reader.rs
-- ext/parquet/src/reader/unified/mod.rs
-- ext/parquet/src/ruby_reader.rs
-- ext/parquet/src/types/core_types.rs
-- ext/parquet/src/types/mod.rs
-- ext/parquet/src/types/parquet_value.rs
-- ext/parquet/src/types/record_types.rs
-- ext/parquet/src/types/schema_converter.rs
-- ext/parquet/src/types/schema_node.rs
-- ext/parquet/src/types/timestamp.rs
-- ext/parquet/src/types/type_conversion.rs
-- ext/parquet/src/types/writer_types.rs
-- ext/parquet/src/utils.rs
-- ext/parquet/src/writer/mod.rs
-- ext/parquet/src/writer/write_columns.rs
-- ext/parquet/src/writer/write_rows.rs
 - lib/parquet.rb
 - lib/parquet.rbi
 - lib/parquet/schema.rb
data/ext/parquet/src/enumerator.rs
REMOVED
@@ -1,68 +0,0 @@
-use crate::ParserResultType;
-use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
-
-pub struct RowEnumeratorArgs {
-    pub rb_self: Value,
-    pub to_read: Value,
-    pub result_type: ParserResultType,
-    pub columns: Option<Vec<String>>,
-    pub strict: bool,
-    pub logger: Option<Value>,
-}
-
-/// Creates an enumerator for lazy Parquet row parsing
-pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
-    let kwargs = RHash::new();
-    kwargs.aset(
-        Symbol::new("result_type"),
-        Symbol::new(args.result_type.to_string()),
-    )?;
-    if let Some(columns) = args.columns {
-        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-    }
-    if args.strict {
-        kwargs.aset(Symbol::new("strict"), true)?;
-    }
-    if let Some(logger) = args.logger {
-        kwargs.aset(Symbol::new("logger"), logger)?;
-    }
-    Ok(args
-        .rb_self
-        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
-}
-
-pub struct ColumnEnumeratorArgs {
-    pub rb_self: Value,
-    pub to_read: Value,
-    pub result_type: ParserResultType,
-    pub columns: Option<Vec<String>>,
-    pub batch_size: Option<usize>,
-    pub strict: bool,
-    pub logger: Option<Value>,
-}
-
-#[inline]
-pub fn create_column_enumerator(
-    args: ColumnEnumeratorArgs,
-) -> Result<magnus::Enumerator, MagnusError> {
-    let kwargs = RHash::new();
-    kwargs.aset(
-        Symbol::new("result_type"),
-        Symbol::new(args.result_type.to_string()),
-    )?;
-    if let Some(columns) = args.columns {
-        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-    }
-    if let Some(batch_size) = args.batch_size {
-        kwargs.aset(Symbol::new("batch_size"), batch_size)?;
-    }
-    if args.strict {
-        kwargs.aset(Symbol::new("strict"), true)?;
-    }
-    if let Some(logger) = args.logger {
-        kwargs.aset(Symbol::new("logger"), logger)?;
-    }
-    Ok(args
-        .rb_self
-        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
-}
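This removed module backed the block-less form of the reader API: enumeratorize re-dispatches into each_row/each_column with result_type, columns, batch_size, strict, and logger forwarded as keyword arguments, so Ruby callers get a lazy Enumerator. A sketch of the calling pattern it enabled, assuming the gem's documented Parquet.each_row/each_column methods (presumably superseded in 0.6.0 by the new parquet-ruby-adapter reader plumbing):

    # No block given: returns an Enumerator rather than iterating eagerly.
    rows = Parquet.each_row("users.parquet", result_type: :hash)
    rows.take(10)

    # Lazy column-batch access with the kwargs the enumerator forwarded.
    cols = Parquet.each_column("users.parquet", columns: ["id"], batch_size: 1024)
    cols.first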