parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,427 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{Error as MagnusError, IntoValue, Ruby, Value};
3
+ use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
4
+ use std::fs::File;
5
+
6
+ use crate::error::{IntoMagnusError, Result, RubyAdapterError};
7
+ use crate::io::{RubyIOReader, ThreadSafeRubyIOReader};
8
+ use crate::TryIntoValue;
9
+
10
+ /// Newtype around [`ParquetMetaData`] so that this crate can implement `TryIntoValue` and `IntoValue` for it (see the impls below).
11
+ pub struct RubyParquetMetaData(pub ParquetMetaData);
12
+
13
+ impl TryIntoValue for RubyParquetMetaData {
14
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
15
+ let metadata = &self.0;
16
+ let file_metadata = metadata.file_metadata();
17
+ let row_groups = metadata.row_groups();
18
+
19
+ // Construct a hash with the metadata
20
+ let hash = handle.hash_new();
21
+ hash.aset("num_rows", file_metadata.num_rows())
22
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set num_rows: {}", e)))?;
23
+ hash.aset("created_by", file_metadata.created_by())
24
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set created_by: {}", e)))?;
25
+
26
+ // Convert key_value_metadata to a Ruby array if it exists
27
+ if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
28
+ let kv_array = handle.ary_new();
29
+ for kv in key_value_metadata {
30
+ let kv_hash = handle.hash_new();
31
+ kv_hash
32
+ .aset("key", kv.key.clone())
33
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set key: {}", e)))?;
34
+ kv_hash.aset("value", kv.value.clone()).map_err(|e| {
35
+ RubyAdapterError::metadata(format!("Failed to set value: {}", e))
36
+ })?;
37
+ kv_array.push(kv_hash).map_err(|e| {
38
+ RubyAdapterError::metadata(format!("Failed to push kv_hash: {}", e))
39
+ })?;
40
+ }
41
+ hash.aset("key_value_metadata", kv_array).map_err(|e| {
42
+ RubyAdapterError::metadata(format!("Failed to set key_value_metadata: {}", e))
43
+ })?;
44
+ } else {
45
+ hash.aset("key_value_metadata", None::<Value>)
46
+ .map_err(|e| {
47
+ RubyAdapterError::metadata(format!("Failed to set key_value_metadata: {}", e))
48
+ })?;
49
+ }
50
+
51
+ // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
52
+ let schema_hash = handle.hash_new();
53
+ let schema = file_metadata.schema();
54
+ schema_hash
55
+ .aset("name", schema.name())
56
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set schema name: {}", e)))?;
57
+
58
+ // Add schema fields information
59
+ let fields_array = handle.ary_new();
60
+ for field in schema.get_fields() {
61
+ let field_hash = handle.hash_new();
62
+ field_hash.aset("name", field.name()).map_err(|e| {
63
+ RubyAdapterError::metadata(format!("Failed to set field name: {}", e))
64
+ })?;
65
+
66
+ // Handle different field types
67
+ match field.as_ref() {
68
+ parquet::schema::types::Type::PrimitiveType {
69
+ physical_type,
70
+ type_length,
71
+ scale,
72
+ precision,
73
+ ..
74
+ } => {
75
+ field_hash.aset("type", "primitive").map_err(|e| {
76
+ RubyAdapterError::metadata(format!("Failed to set type: {}", e))
77
+ })?;
78
+ field_hash
79
+ .aset("physical_type", format!("{:?}", physical_type))
80
+ .map_err(|e| {
81
+ RubyAdapterError::metadata(format!(
82
+ "Failed to set physical_type: {}",
83
+ e
84
+ ))
85
+ })?;
86
+ field_hash.aset("type_length", *type_length).map_err(|e| {
87
+ RubyAdapterError::metadata(format!("Failed to set type_length: {}", e))
88
+ })?;
89
+ field_hash.aset("scale", *scale).map_err(|e| {
90
+ RubyAdapterError::metadata(format!("Failed to set scale: {}", e))
91
+ })?;
92
+ field_hash.aset("precision", *precision).map_err(|e| {
93
+ RubyAdapterError::metadata(format!("Failed to set precision: {}", e))
94
+ })?;
95
+ }
96
+ parquet::schema::types::Type::GroupType { .. } => {
97
+ field_hash.aset("type", "group").map_err(|e| {
98
+ RubyAdapterError::metadata(format!("Failed to set type: {}", e))
99
+ })?;
100
+ }
101
+ }
102
+
103
+ // Add basic info
104
+ let basic_info = field.get_basic_info();
105
+ field_hash
106
+ .aset("repetition", format!("{:?}", basic_info.repetition()))
107
+ .map_err(|e| {
108
+ RubyAdapterError::metadata(format!("Failed to set repetition: {}", e))
109
+ })?;
110
+ field_hash
111
+ .aset(
112
+ "converted_type",
113
+ format!("{:?}", basic_info.converted_type()),
114
+ )
115
+ .map_err(|e| {
116
+ RubyAdapterError::metadata(format!("Failed to set converted_type: {}", e))
117
+ })?;
118
+ if let Some(logical_type) = basic_info.logical_type() {
119
+ field_hash
120
+ .aset("logical_type", format!("{:?}", logical_type))
121
+ .map_err(|e| {
122
+ RubyAdapterError::metadata(format!("Failed to set logical_type: {}", e))
123
+ })?;
124
+ }
125
+
126
+ fields_array.push(field_hash).map_err(|e| {
127
+ RubyAdapterError::metadata(format!("Failed to push field_hash: {}", e))
128
+ })?;
129
+ }
130
+ schema_hash
131
+ .aset("fields", fields_array)
132
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set fields: {}", e)))?;
133
+
134
+ hash.aset("schema", schema_hash)
135
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set schema: {}", e)))?;
136
+
137
+ // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
138
+ let row_groups_array = handle.ary_new();
139
+ for row_group in row_groups.iter() {
140
+ let rg_hash = handle.hash_new();
141
+ rg_hash
142
+ .aset("num_columns", row_group.num_columns())
143
+ .map_err(|e| {
144
+ RubyAdapterError::metadata(format!("Failed to set num_columns: {}", e))
145
+ })?;
146
+ rg_hash
147
+ .aset("num_rows", row_group.num_rows())
148
+ .map_err(|e| {
149
+ RubyAdapterError::metadata(format!("Failed to set num_rows: {}", e))
150
+ })?;
151
+ rg_hash
152
+ .aset("total_byte_size", row_group.total_byte_size())
153
+ .map_err(|e| {
154
+ RubyAdapterError::metadata(format!("Failed to set total_byte_size: {}", e))
155
+ })?;
156
+ rg_hash
157
+ .aset("file_offset", row_group.file_offset())
158
+ .map_err(|e| {
159
+ RubyAdapterError::metadata(format!("Failed to set file_offset: {}", e))
160
+ })?;
161
+ rg_hash
162
+ .aset("ordinal", row_group.ordinal())
163
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set ordinal: {}", e)))?;
164
+ rg_hash
165
+ .aset("compressed_size", row_group.compressed_size())
166
+ .map_err(|e| {
167
+ RubyAdapterError::metadata(format!("Failed to set compressed_size: {}", e))
168
+ })?;
169
+
170
+ // Add column chunks metadata
171
+ let columns_array = handle.ary_new();
172
+ for col_idx in 0..row_group.num_columns() {
173
+ let column = row_group.column(col_idx);
174
+ let col_hash = handle.hash_new();
175
+
176
+ col_hash
177
+ .aset("column_path", column.column_path().string())
178
+ .map_err(|e| {
179
+ RubyAdapterError::metadata(format!("Failed to set column_path: {}", e))
180
+ })?;
181
+ col_hash
182
+ .aset("file_path", column.file_path())
183
+ .map_err(|e| {
184
+ RubyAdapterError::metadata(format!("Failed to set file_path: {}", e))
185
+ })?;
186
+ col_hash
187
+ .aset("file_offset", column.file_offset())
188
+ .map_err(|e| {
189
+ RubyAdapterError::metadata(format!("Failed to set file_offset: {}", e))
190
+ })?;
191
+ col_hash
192
+ .aset("num_values", column.num_values())
193
+ .map_err(|e| {
194
+ RubyAdapterError::metadata(format!("Failed to set num_values: {}", e))
195
+ })?;
196
+ col_hash
197
+ .aset("compression", format!("{:?}", column.compression()))
198
+ .map_err(|e| {
199
+ RubyAdapterError::metadata(format!("Failed to set compression: {}", e))
200
+ })?;
201
+ col_hash
202
+ .aset("total_compressed_size", column.compressed_size())
203
+ .map_err(|e| {
204
+ RubyAdapterError::metadata(format!(
205
+ "Failed to set total_compressed_size: {}",
206
+ e
207
+ ))
208
+ })?;
209
+ col_hash
210
+ .aset("total_uncompressed_size", column.uncompressed_size())
211
+ .map_err(|e| {
212
+ RubyAdapterError::metadata(format!(
213
+ "Failed to set total_uncompressed_size: {}",
214
+ e
215
+ ))
216
+ })?;
217
+ col_hash
218
+ .aset("data_page_offset", column.data_page_offset())
219
+ .map_err(|e| {
220
+ RubyAdapterError::metadata(format!("Failed to set data_page_offset: {}", e))
221
+ })?;
222
+
223
+ if let Some(offset) = column.dictionary_page_offset() {
224
+ col_hash
225
+ .aset("dictionary_page_offset", offset)
226
+ .map_err(|e| {
227
+ RubyAdapterError::metadata(format!(
228
+ "Failed to set dictionary_page_offset: {}",
229
+ e
230
+ ))
231
+ })?;
232
+ }
233
+
234
+ if let Some(offset) = column.bloom_filter_offset() {
235
+ col_hash.aset("bloom_filter_offset", offset).map_err(|e| {
236
+ RubyAdapterError::metadata(format!(
237
+ "Failed to set bloom_filter_offset: {}",
238
+ e
239
+ ))
240
+ })?;
241
+ }
242
+
243
+ if let Some(length) = column.bloom_filter_length() {
244
+ col_hash.aset("bloom_filter_length", length).map_err(|e| {
245
+ RubyAdapterError::metadata(format!(
246
+ "Failed to set bloom_filter_length: {}",
247
+ e
248
+ ))
249
+ })?;
250
+ }
251
+
252
+ if let Some(offset) = column.offset_index_offset() {
253
+ col_hash.aset("offset_index_offset", offset).map_err(|e| {
254
+ RubyAdapterError::metadata(format!(
255
+ "Failed to set offset_index_offset: {}",
256
+ e
257
+ ))
258
+ })?;
259
+ }
260
+
261
+ if let Some(length) = column.offset_index_length() {
262
+ col_hash.aset("offset_index_length", length).map_err(|e| {
263
+ RubyAdapterError::metadata(format!(
264
+ "Failed to set offset_index_length: {}",
265
+ e
266
+ ))
267
+ })?;
268
+ }
269
+
270
+ if let Some(offset) = column.column_index_offset() {
271
+ col_hash.aset("column_index_offset", offset).map_err(|e| {
272
+ RubyAdapterError::metadata(format!(
273
+ "Failed to set column_index_offset: {}",
274
+ e
275
+ ))
276
+ })?;
277
+ }
278
+
279
+ if let Some(length) = column.column_index_length() {
280
+ col_hash.aset("column_index_length", length).map_err(|e| {
281
+ RubyAdapterError::metadata(format!(
282
+ "Failed to set column_index_length: {}",
283
+ e
284
+ ))
285
+ })?;
286
+ }
287
+
288
+ // Add encodings
289
+ let encodings_array = handle.ary_new();
290
+ for encoding in column.encodings() {
291
+ encodings_array
292
+ .push(format!("{:?}", encoding))
293
+ .map_err(|e| {
294
+ RubyAdapterError::metadata(format!("Failed to push encoding: {}", e))
295
+ })?;
296
+ }
297
+ col_hash.aset("encodings", encodings_array).map_err(|e| {
298
+ RubyAdapterError::metadata(format!("Failed to set encodings: {}", e))
299
+ })?;
300
+
301
+ // Add statistics if available
302
+ if let Some(stats) = column.statistics() {
303
+ let stats_hash = handle.hash_new();
304
+ stats_hash
305
+ .aset("min_is_exact", stats.min_is_exact())
306
+ .map_err(|e| {
307
+ RubyAdapterError::metadata(format!("Failed to set min_is_exact: {}", e))
308
+ })?;
309
+ stats_hash
310
+ .aset("max_is_exact", stats.max_is_exact())
311
+ .map_err(|e| {
312
+ RubyAdapterError::metadata(format!("Failed to set max_is_exact: {}", e))
313
+ })?;
314
+
315
+ col_hash.aset("statistics", stats_hash).map_err(|e| {
316
+ RubyAdapterError::metadata(format!("Failed to set statistics: {}", e))
317
+ })?;
318
+ }
319
+
320
+ // Add page encoding stats if available
321
+ if let Some(page_encoding_stats) = column.page_encoding_stats() {
322
+ let page_stats_array = handle.ary_new();
323
+ for stat in page_encoding_stats {
324
+ let stat_hash = handle.hash_new();
325
+ stat_hash
326
+ .aset("page_type", format!("{:?}", stat.page_type))
327
+ .map_err(|e| {
328
+ RubyAdapterError::metadata(format!(
329
+ "Failed to set page_type: {}",
330
+ e
331
+ ))
332
+ })?;
333
+ stat_hash
334
+ .aset("encoding", format!("{:?}", stat.encoding))
335
+ .map_err(|e| {
336
+ RubyAdapterError::metadata(format!("Failed to set encoding: {}", e))
337
+ })?;
338
+ stat_hash.aset("count", stat.count).map_err(|e| {
339
+ RubyAdapterError::metadata(format!("Failed to set count: {}", e))
340
+ })?;
341
+ page_stats_array.push(stat_hash).map_err(|e| {
342
+ RubyAdapterError::metadata(format!("Failed to push stat_hash: {}", e))
343
+ })?;
344
+ }
345
+ col_hash
346
+ .aset("page_encoding_stats", page_stats_array)
347
+ .map_err(|e| {
348
+ RubyAdapterError::metadata(format!(
349
+ "Failed to set page_encoding_stats: {}",
350
+ e
351
+ ))
352
+ })?;
353
+ }
354
+
355
+ columns_array.push(col_hash).map_err(|e| {
356
+ RubyAdapterError::metadata(format!("Failed to push col_hash: {}", e))
357
+ })?;
358
+ }
359
+ rg_hash
360
+ .aset("columns", columns_array)
361
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set columns: {}", e)))?;
362
+
363
+ row_groups_array.push(rg_hash).map_err(|e| {
364
+ RubyAdapterError::metadata(format!("Failed to push rg_hash: {}", e))
365
+ })?;
366
+ }
367
+ hash.aset("row_groups", row_groups_array)
368
+ .map_err(|e| RubyAdapterError::metadata(format!("Failed to set row_groups: {}", e)))?;
369
+
370
+ Ok(handle.into_value(hash))
371
+ }
372
+ }
373
+
374
+ // Also implement IntoValue for backwards compatibility
375
+ impl IntoValue for RubyParquetMetaData {
376
+ fn into_value_with(self, handle: &Ruby) -> Value {
377
+ // Use TryIntoValue and handle errors by returning an error hash
378
+ match self.try_into_value(handle) {
379
+ Ok(value) => value,
380
+ Err(e) => {
381
+ // Create an error hash instead of panicking
382
+ let error_hash = handle.hash_new();
383
+ let _ = error_hash.aset("error", true);
384
+ let _ = error_hash.aset("message", e.to_string());
385
+ handle.into_value(error_hash)
386
+ }
387
+ }
388
+ }
389
+ }
390
+
391
+ /// Parse metadata from a file path or Ruby IO object
392
+ pub fn parse_metadata(arg: Value) -> std::result::Result<Value, MagnusError> {
393
+ parse_metadata_impl(arg).into_magnus_error()
394
+ }
395
+
396
+ fn parse_metadata_impl(arg: Value) -> Result<Value> {
397
+ let ruby = Ruby::get().map_err(|_| RubyAdapterError::runtime("Failed to get Ruby runtime"))?;
398
+
399
+ let mut reader = ParquetMetaDataReader::new();
400
+ if arg.is_kind_of(ruby.class_string()) {
401
+ let path = arg
402
+ .to_r_string()
403
+ .map_err(|e| {
404
+ RubyAdapterError::invalid_input(format!("Failed to convert to string: {}", e))
405
+ })?
406
+ .to_string()
407
+ .map_err(|e| {
408
+ RubyAdapterError::invalid_input(format!("Failed to convert to Rust string: {}", e))
409
+ })?;
410
+ let file = File::open(path).map_err(RubyAdapterError::Io)?;
411
+ reader
412
+ .try_parse(&file)
413
+ .map_err(|e| RubyAdapterError::Parquet(parquet_core::ParquetError::Parquet(e)))?;
414
+ } else {
415
+ let file = RubyIOReader::new(arg).map_err(RubyAdapterError::Io)?;
416
+ reader
417
+ .try_parse(&ThreadSafeRubyIOReader::new(file))
418
+ .map_err(|e| RubyAdapterError::Parquet(parquet_core::ParquetError::Parquet(e)))?;
419
+ }
420
+
421
+ let metadata = reader
422
+ .finish()
423
+ .map_err(|e| RubyAdapterError::Parquet(parquet_core::ParquetError::Parquet(e)))?;
424
+
425
+ // Use TryIntoValue instead of IntoValue
426
+ RubyParquetMetaData(metadata).try_into_value(&ruby)
427
+ }