parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-ruby-adapter/src/writer.rs ADDED
@@ -0,0 +1,435 @@
+ use magnus::value::ReprValue;
+ use magnus::{Error as MagnusError, Ruby, TryConvert, Value};
+ use parquet::file::properties::WriterProperties;
+ use parquet_core::Schema;
+ use std::io::{BufReader, BufWriter, Write};
+ use tempfile::NamedTempFile;
+
+ use crate::io::RubyIOWriter;
+ use crate::types::WriterOutput;
+ use crate::utils::parse_compression;
+
+ /// Create a writer based on the output type (file path or IO object)
+ pub fn create_writer(
+     ruby: &Ruby,
+     write_to: Value,
+     schema: Schema,
+     compression: Option<String>,
+ ) -> Result<WriterOutput, MagnusError> {
+     let compression_setting = parse_compression(compression)?;
+     let props = WriterProperties::builder()
+         .set_compression(compression_setting)
+         .build();
+
+     if write_to.is_kind_of(ruby.class_string()) {
+         // Direct file path
+         let path_str: String = TryConvert::try_convert(write_to)?;
+         let file = std::fs::File::create(&path_str)
+             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+         let writer = parquet_core::writer::Writer::new_with_properties(file, schema, props)
+             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+         Ok(WriterOutput::File(writer))
+     } else {
+         // IO-like object - create temporary file
+         let temp_file = NamedTempFile::new().map_err(|e| {
+             MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!("Failed to create temporary file: {}", e),
+             )
+         })?;
+
+         // Clone the file handle for the writer
+         let file = temp_file.reopen().map_err(|e| {
+             MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!("Failed to reopen temporary file: {}", e),
+             )
+         })?;
+
+         let writer = parquet_core::writer::Writer::new_with_properties(file, schema, props)
+             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+         Ok(WriterOutput::TempFile(writer, temp_file, write_to))
+     }
+ }
+
+ /// Finalize the writer and copy temp file to IO if needed
+ pub fn finalize_writer(writer_output: WriterOutput) -> Result<(), MagnusError> {
+     match writer_output {
+         WriterOutput::File(writer) => writer
+             .close()
+             .map_err(|e| MagnusError::new(magnus::exception::runtime_error(), e.to_string())),
+         WriterOutput::TempFile(writer, temp_file, io_object) => {
+             // Close the writer first
+             writer
+                 .close()
+                 .map_err(|e| MagnusError::new(magnus::exception::runtime_error(), e.to_string()))?;
+
+             // Copy temp file to IO object
+             copy_temp_file_to_io(temp_file, io_object)
+         }
+     }
+ }
+
+ /// Copy temporary file contents to Ruby IO object
+ fn copy_temp_file_to_io(temp_file: NamedTempFile, io_object: Value) -> Result<(), MagnusError> {
+     let file = temp_file.reopen().map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to reopen temporary file: {}", e),
+         )
+     })?;
+
+     let mut buf_reader = BufReader::new(file);
+     let ruby_io_writer = RubyIOWriter::new(io_object);
+     let mut buf_writer = BufWriter::new(ruby_io_writer);
+
+     std::io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to copy temp file to IO object: {}", e),
+         )
+     })?;
+
+     buf_writer.flush().map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to flush IO object: {}", e),
+         )
+     })?;
+
+     // The temporary file will be automatically deleted when temp_file is dropped
+     Ok(())
+ }
+
+ /// Write data in row format to a parquet file
+ pub fn write_rows(
+     ruby: &Ruby,
+     write_args: crate::types::ParquetWriteArgs,
+ ) -> Result<Value, MagnusError> {
+     use crate::batch_manager::BatchSizeManager;
+     use crate::converter::RubyValueConverter;
+     use crate::logger::RubyLogger;
+     use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
+     use crate::string_cache::StringCache;
+     use crate::utils::estimate_row_size;
+     use magnus::{RArray, TryConvert};
+
+     // Convert data to array if it isn't already
+     let data_array = if write_args.read_from.is_kind_of(ruby.class_array()) {
+         TryConvert::try_convert(write_args.read_from)?
+     } else if write_args.read_from.respond_to("to_a", false)? {
+         let array_value: Value = write_args.read_from.funcall("to_a", ())?;
+         TryConvert::try_convert(array_value)?
+     } else {
+         return Err(MagnusError::new(
+             ruby.exception_type_error(),
+             "data must be an array or respond to 'to_a'",
+         ));
+     };
+
+     let data_array: RArray = data_array;
+
+     // Process schema value
+     let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(&data_array))
+         .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+     // Create schema
+     let schema = ruby_schema_to_parquet(schema_hash)
+         .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+     // Extract field schemas for conversion hints
+     let field_schemas = extract_field_schemas(&schema);
+
+     // Create writer
+     let mut writer_output = create_writer(
+         ruby,
+         write_args.write_to,
+         schema.clone(),
+         write_args.compression,
+     )?;
+
+     // Create logger
+     let logger = RubyLogger::new(write_args.logger)?;
+     let _ = logger.info(|| "Starting to write parquet file".to_string());
+
+     // Create batch size manager
+     let mut batch_manager = BatchSizeManager::new(
+         write_args.batch_size,
+         write_args.flush_threshold,
+         write_args.sample_size,
+     );
+
+     let _ = logger.debug(|| {
+         format!(
+             "Batch sizing: fixed_size={:?}, memory_threshold={}, sample_size={}",
+             batch_manager.fixed_batch_size,
+             batch_manager.memory_threshold,
+             batch_manager.sample_size
+         )
+     });
+
+     // Create converter with string cache if enabled
+     let mut converter = if write_args.string_cache.unwrap_or(false) {
+         let _ = logger.debug(|| "String cache enabled".to_string());
+         RubyValueConverter::with_string_cache(StringCache::new(true))
+     } else {
+         RubyValueConverter::new()
+     };
+
+     // Collect rows in batches
+     let mut batch = Vec::new();
+     let mut batch_memory_size = 0usize;
+     let mut total_rows = 0u64;
+
+     for row_value in data_array.into_iter() {
+         // Convert Ruby row to ParquetValue vector
+         let row = if row_value.is_kind_of(ruby.class_array()) {
+             let array: RArray = TryConvert::try_convert(row_value)?;
+             let mut values = Vec::with_capacity(array.len());
+
+             for (idx, item) in array.into_iter().enumerate() {
+                 let schema_hint = field_schemas.get(idx);
+                 let pq_value = converter
+                     .to_parquet_with_schema_hint(item, schema_hint)
+                     .map_err(|e| {
+                         let error_msg = e.to_string();
+                         // Check if this is an encoding error
+                         if error_msg.contains("EncodingError")
+                             || error_msg.contains("invalid utf-8")
+                         {
+                             // Extract the actual encoding error message
+                             if let Some(pos) = error_msg.find("EncodingError: ") {
+                                 let encoding_msg = error_msg[pos + 15..].to_string();
+                                 MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
+                             } else {
+                                 MagnusError::new(ruby.exception_encoding_error(), error_msg)
+                             }
+                         } else {
+                             MagnusError::new(ruby.exception_runtime_error(), error_msg)
+                         }
+                     })?;
+                 values.push(pq_value);
+             }
+             values
+         } else {
+             return Err(MagnusError::new(
+                 ruby.exception_type_error(),
+                 "each row must be an array",
+             ));
+         };
+
+         // Record row size for dynamic batch sizing
+         let row_size = estimate_row_size(&row);
+         batch_manager.record_row_size(row_size);
+         batch_memory_size += row_size;
+
+         batch.push(row);
+         total_rows += 1;
+
+         // Log sampling progress
+         if batch_manager.row_size_samples.len() <= batch_manager.sample_size
+             && batch_manager.row_size_samples.len() % 10 == 0
+         {
+             let _ = logger.debug(|| {
+                 format!(
+                     "Sampled {} rows, avg size: {} bytes, current batch size: {}",
+                     batch_manager.row_size_samples.len(),
+                     batch_manager.average_row_size(),
+                     batch_manager.current_batch_size
+                 )
+             });
+         }
+
+         // Write batch if it reaches threshold
+         if batch_manager.should_flush(batch.len(), batch_memory_size) {
+             let _ = logger.info(|| format!("Writing batch of {} rows", batch.len()));
+             let _ = logger.debug(|| format!(
+                 "Batch details: recent avg row size: {} bytes, current batch size: {}, actual memory: {} bytes",
+                 batch_manager.recent_average_size(),
+                 batch_manager.current_batch_size,
+                 batch_memory_size
+             ));
+             match &mut writer_output {
+                 WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+                     writer.write_rows(std::mem::take(&mut batch)).map_err(|e| {
+                         MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                     })?;
+                 }
+             }
+             batch_memory_size = 0;
+         }
+     }
+
+     // Write remaining rows
+     if !batch.is_empty() {
+         let _ = logger.info(|| format!("Writing batch of {} rows", batch.len()));
+         let _ = logger.debug(|| format!("Final batch: {} rows", batch.len()));
+         match &mut writer_output {
+             WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+                 writer
+                     .write_rows(batch)
+                     .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+             }
+         }
+     }
+
+     let _ = logger.info(|| format!("Finished writing {} rows to parquet file", total_rows));
+
+     // Log string cache statistics if enabled
+     if let Some(stats) = converter.string_cache_stats() {
+         let _ = logger.info(|| {
+             format!(
+                 "String cache stats: {} unique strings, {} hits ({:.1}% hit rate)",
+                 stats.size,
+                 stats.hits,
+                 stats.hit_rate * 100.0
+             )
+         });
+     }
+
+     // Finalize the writer
+     finalize_writer(writer_output)?;
+
+     Ok(ruby.qnil().as_value())
+ }
+
+ /// Write data in column format to a parquet file
+ pub fn write_columns(
+     ruby: &Ruby,
+     write_args: crate::types::ParquetWriteArgs,
+ ) -> Result<Value, MagnusError> {
+     use crate::converter::RubyValueConverter;
+     use crate::schema::{extract_field_schemas, process_schema_value, ruby_schema_to_parquet};
+     use magnus::{RArray, TryConvert};
+
+     // Convert data to array for processing
+     let data_array = if write_args.read_from.is_kind_of(ruby.class_array()) {
+         TryConvert::try_convert(write_args.read_from)?
+     } else if write_args.read_from.respond_to("to_a", false)? {
+         let array_value: Value = write_args.read_from.funcall("to_a", ())?;
+         TryConvert::try_convert(array_value)?
+     } else {
+         return Err(MagnusError::new(
+             ruby.exception_type_error(),
+             "data must be an array or respond to 'to_a'",
+         ));
+     };
+
+     let data_array: RArray = data_array;
+
+     // Process schema value
+     let schema_hash = process_schema_value(ruby, write_args.schema_value, Some(&data_array))
+         .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+     // Create schema
+     let schema = ruby_schema_to_parquet(schema_hash)
+         .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+     // Extract field schemas for conversion hints
+     let field_schemas = extract_field_schemas(&schema);
+
+     // Create writer
+     let mut writer_output = create_writer(
+         ruby,
+         write_args.write_to,
+         schema.clone(),
+         write_args.compression,
+     )?;
+
+     // Get column names from schema
+     let column_names: Vec<String> =
+         if let parquet_core::SchemaNode::Struct { fields, .. } = &schema.root {
+             fields.iter().map(|f| f.name().to_string()).collect()
+         } else {
+             return Err(MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 "Schema root must be a struct",
+             ));
+         };
+
+     // Convert data to columns format
+     let mut all_columns: Vec<(String, Vec<parquet_core::ParquetValue>)> = Vec::new();
+
+     // Process batches
+     for (batch_idx, batch) in data_array.into_iter().enumerate() {
+         if !batch.is_kind_of(ruby.class_array()) {
+             return Err(MagnusError::new(
+                 ruby.exception_type_error(),
+                 "each batch must be an array of column values",
+             ));
+         }
+
+         let batch_array: RArray = TryConvert::try_convert(batch)?;
+
+         // Verify batch has the right number of columns
+         if batch_array.len() != column_names.len() {
+             return Err(MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!(
+                     "Batch has {} columns but schema has {}",
+                     batch_array.len(),
+                     column_names.len()
+                 ),
+             ));
+         }
+
+         // Process each column in the batch
+         for (col_idx, column_values) in batch_array.into_iter().enumerate() {
+             if !column_values.is_kind_of(ruby.class_array()) {
+                 return Err(MagnusError::new(
+                     ruby.exception_type_error(),
+                     format!("Column {} values must be an array", col_idx),
+                 ));
+             }
+
+             let values_array: RArray = TryConvert::try_convert(column_values)?;
+
+             // Initialize column vector on first batch
+             if batch_idx == 0 {
+                 all_columns.push((column_names[col_idx].clone(), Vec::new()));
+             }
+
+             // Convert and append values
+             let mut converter = RubyValueConverter::new();
+             let schema_hint = field_schemas.get(col_idx);
+
+             for value in values_array.into_iter() {
+                 let pq_value = converter
+                     .to_parquet_with_schema_hint(value, schema_hint)
+                     .map_err(|e| {
+                         let error_msg = e.to_string();
+                         // Check if this is an encoding error
+                         if error_msg.contains("EncodingError")
+                             || error_msg.contains("invalid utf-8")
+                         {
+                             // Extract the actual encoding error message
+                             if let Some(pos) = error_msg.find("EncodingError: ") {
+                                 let encoding_msg = error_msg[pos + 15..].to_string();
+                                 MagnusError::new(ruby.exception_encoding_error(), encoding_msg)
+                             } else {
+                                 MagnusError::new(ruby.exception_encoding_error(), error_msg)
+                             }
+                         } else {
+                             MagnusError::new(ruby.exception_runtime_error(), error_msg)
+                         }
+                     })?;
+                 all_columns[col_idx].1.push(pq_value);
+             }
+         }
+     }
+
+     // Write the columns
+     match &mut writer_output {
+         WriterOutput::File(writer) | WriterOutput::TempFile(writer, _, _) => {
+             writer
+                 .write_columns(all_columns)
+                 .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+         }
+     }
+
+     // Finalize the writer
+     finalize_writer(writer_output)?;
+
+     Ok(ruby.qnil().as_value())
+ }
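
For context, the new `write_rows` path above is driven from the Ruby side of the gem. A minimal sketch of a caller follows; the `Parquet.write_rows` entry point and its keyword names follow the gem's README conventions, but treat the exact signature (and the "zstd" compression value) as assumptions rather than a spec:

require "parquet"
require "stringio"

# Schema built with the DSL from lib/parquet/schema.rb
schema = Parquet::Schema.define do
  field :id, :int64, nullable: false
  field :name, :string
end

# Each row must itself be an array; the Rust side raises TypeError otherwise
rows = [[1, "alice"], [2, "bob"]]

# A String destination takes the WriterOutput::File branch in create_writer
Parquet.write_rows(rows, schema: schema, write_to: "users.parquet", compression: "zstd")

# An IO-like destination takes the WriterOutput::TempFile branch: rows are
# written to a NamedTempFile, then copied into the IO when the writer is finalized
io = StringIO.new
Parquet.write_rows(rows, schema: schema, write_to: io)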
data/lib/parquet/schema.rb CHANGED
@@ -59,12 +59,31 @@ module Parquet
    # - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
    # - `format:` if you want to store some format string
    # - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
+   # - `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
+   # - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
    # - `nullable:` default to true if not specified
    def field(name, type, nullable: true, **kwargs, &block)
      field_hash = { name: name.to_s, type: type, nullable: !!nullable }

      # Possibly store a format if provided
      field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
+
+     # Handle timezone for timestamp types
+     if [:timestamp_second, :timestamp_millis, :timestamp_micros, :timestamp_nanos].include?(type)
+       # Support new has_timezone parameter (preferred)
+       if kwargs.key?(:has_timezone)
+         # If has_timezone is true, store "UTC" to indicate timezone presence
+         # If explicitly false, don't store timezone (indicates local/unzoned)
+         field_hash[:timezone] = "UTC" if kwargs[:has_timezone]
+       elsif kwargs.key?(:timezone)
+         # Legacy support: any timezone value means UTC storage
+         # Store "UTC" regardless of the actual value to make it clear
+         field_hash[:timezone] = "UTC"
+       else
+         # Default behavior when neither parameter is specified: UTC storage
+         field_hash[:timezone] = "UTC"
+       end
+     end

      case type
      when :struct
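
The practical effect of the new `has_timezone:` flag, sketched with the schema DSL (the option names and behavior are taken directly from the diff above; the `Parquet::Schema.define` entry point is assumed from the gem's README):

Parquet::Schema.define do
  # Neither option given: defaults to UTC storage (:timezone stored as "UTC")
  field :created_at, :timestamp_millis

  # has_timezone: false => no :timezone key, i.e. local/unzoned timestamps
  field :local_event, :timestamp_micros, has_timezone: false

  # Deprecated spelling: the value is ignored and normalized to "UTC"
  field :updated_at, :timestamp_nanos, timezone: "America/New_York"
end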
data/lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.5.13"
+   VERSION = "0.6.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.5.13
+   version: 0.6.0
  platform: ruby
  authors:
  - Nathan Jaremko
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-06-24 00:00:00.000000000 Z
+ date: 2025-07-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rb_sys
@@ -54,33 +54,59 @@ files:
  - LICENSE
  - README.md
  - Rakefile
+ - ext/parquet-core/Cargo.toml
+ - ext/parquet-core/src/arrow_conversion.rs
+ - ext/parquet-core/src/error.rs
+ - ext/parquet-core/src/lib.rs
+ - ext/parquet-core/src/reader.rs
+ - ext/parquet-core/src/schema.rs
+ - ext/parquet-core/src/test_utils.rs
+ - ext/parquet-core/src/traits/mod.rs
+ - ext/parquet-core/src/traits/schema.rs
+ - ext/parquet-core/src/value.rs
+ - ext/parquet-core/src/writer.rs
+ - ext/parquet-core/tests/arrow_conversion_tests.rs
+ - ext/parquet-core/tests/binary_data.rs
+ - ext/parquet-core/tests/column_projection.rs
+ - ext/parquet-core/tests/complex_types.rs
+ - ext/parquet-core/tests/compression_tests.rs
+ - ext/parquet-core/tests/concurrent_access.rs
+ - ext/parquet-core/tests/decimal_tests.rs
+ - ext/parquet-core/tests/edge_cases_corner_cases.rs
+ - ext/parquet-core/tests/error_handling_comprehensive_tests.rs
+ - ext/parquet-core/tests/null_handling_tests.rs
+ - ext/parquet-core/tests/performance_memory.rs
+ - ext/parquet-core/tests/primitive_types.rs
+ - ext/parquet-core/tests/real_world_patterns.rs
+ - ext/parquet-core/tests/roundtrip_correctness.rs
+ - ext/parquet-core/tests/schema_comprehensive_tests.rs
+ - ext/parquet-core/tests/temporal_tests.rs
+ - ext/parquet-core/tests/test_helpers.rs
+ - ext/parquet-core/tests/writer_tests.rs
+ - ext/parquet-ruby-adapter/Cargo.toml
+ - ext/parquet-ruby-adapter/build.rs
+ - ext/parquet-ruby-adapter/examples/try_into_value_demo.rs
+ - ext/parquet-ruby-adapter/src/batch_manager.rs
+ - ext/parquet-ruby-adapter/src/chunk_reader.rs
+ - ext/parquet-ruby-adapter/src/converter.rs
+ - ext/parquet-ruby-adapter/src/error.rs
+ - ext/parquet-ruby-adapter/src/io.rs
+ - ext/parquet-ruby-adapter/src/lib.rs
+ - ext/parquet-ruby-adapter/src/logger.rs
+ - ext/parquet-ruby-adapter/src/metadata.rs
+ - ext/parquet-ruby-adapter/src/reader.rs
+ - ext/parquet-ruby-adapter/src/schema.rs
+ - ext/parquet-ruby-adapter/src/string_cache.rs
+ - ext/parquet-ruby-adapter/src/try_into_value.rs
+ - ext/parquet-ruby-adapter/src/types.rs
+ - ext/parquet-ruby-adapter/src/utils.rs
+ - ext/parquet-ruby-adapter/src/writer.rs
  - ext/parquet/Cargo.toml
  - ext/parquet/build.rs
  - ext/parquet/extconf.rb
+ - ext/parquet/src/adapter_ffi.rs
  - ext/parquet/src/allocator.rs
- - ext/parquet/src/enumerator.rs
- - ext/parquet/src/header_cache.rs
  - ext/parquet/src/lib.rs
- - ext/parquet/src/logger.rs
- - ext/parquet/src/reader/common.rs
- - ext/parquet/src/reader/mod.rs
- - ext/parquet/src/reader/parquet_column_reader.rs
- - ext/parquet/src/reader/parquet_row_reader.rs
- - ext/parquet/src/reader/unified/mod.rs
- - ext/parquet/src/ruby_reader.rs
- - ext/parquet/src/types/core_types.rs
- - ext/parquet/src/types/mod.rs
- - ext/parquet/src/types/parquet_value.rs
- - ext/parquet/src/types/record_types.rs
- - ext/parquet/src/types/schema_converter.rs
- - ext/parquet/src/types/schema_node.rs
- - ext/parquet/src/types/timestamp.rs
- - ext/parquet/src/types/type_conversion.rs
- - ext/parquet/src/types/writer_types.rs
- - ext/parquet/src/utils.rs
- - ext/parquet/src/writer/mod.rs
- - ext/parquet/src/writer/write_columns.rs
- - ext/parquet/src/writer/write_rows.rs
  - lib/parquet.rb
  - lib/parquet.rbi
  - lib/parquet/schema.rb
data/ext/parquet/src/enumerator.rs DELETED
@@ -1,68 +0,0 @@
- use crate::ParserResultType;
- use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
-
- pub struct RowEnumeratorArgs {
-     pub rb_self: Value,
-     pub to_read: Value,
-     pub result_type: ParserResultType,
-     pub columns: Option<Vec<String>>,
-     pub strict: bool,
-     pub logger: Option<Value>,
- }
-
- /// Creates an enumerator for lazy Parquet row parsing
- pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
-     let kwargs = RHash::new();
-     kwargs.aset(
-         Symbol::new("result_type"),
-         Symbol::new(args.result_type.to_string()),
-     )?;
-     if let Some(columns) = args.columns {
-         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-     }
-     if args.strict {
-         kwargs.aset(Symbol::new("strict"), true)?;
-     }
-     if let Some(logger) = args.logger {
-         kwargs.aset(Symbol::new("logger"), logger)?;
-     }
-     Ok(args
-         .rb_self
-         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
- }
-
- pub struct ColumnEnumeratorArgs {
-     pub rb_self: Value,
-     pub to_read: Value,
-     pub result_type: ParserResultType,
-     pub columns: Option<Vec<String>>,
-     pub batch_size: Option<usize>,
-     pub strict: bool,
-     pub logger: Option<Value>,
- }
-
- #[inline]
- pub fn create_column_enumerator(
-     args: ColumnEnumeratorArgs,
- ) -> Result<magnus::Enumerator, MagnusError> {
-     let kwargs = RHash::new();
-     kwargs.aset(
-         Symbol::new("result_type"),
-         Symbol::new(args.result_type.to_string()),
-     )?;
-     if let Some(columns) = args.columns {
-         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-     }
-     if let Some(batch_size) = args.batch_size {
-         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
-     }
-     if args.strict {
-         kwargs.aset(Symbol::new("strict"), true)?;
-     }
-     if let Some(logger) = args.logger {
-         kwargs.aset(Symbol::new("logger"), logger)?;
-     }
-     Ok(args
-         .rb_self
-         .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
- }
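
At the Ruby level, these deleted helpers were what made the blockless reader calls return lazy enumerators via `enumeratorize`; the reader path presumably lives on in the new parquet-ruby-adapter crate (see `src/reader.rs` in the file list above). A sketch of the user-visible contract they implemented, with entry-point names taken from the gem's README and the exact kwargs treated as assumptions:

# Without a block, each_row/each_column return a lazy Enumerator, forwarding
# result_type/columns/strict/logger (plus batch_size for the column variant)
rows = Parquet.each_row("users.parquet", result_type: :hash)
rows.take(10).each { |row| puts row.inspect }

cols = Parquet.each_column("users.parquet", result_type: :array, batch_size: 1024)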