parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,423 @@
1
+ use arrow_array::*;
2
+ use arrow_schema::{DataType, Field, TimeUnit};
3
+ use bytes::Bytes;
4
+ use num::BigInt;
5
+ use ordered_float::OrderedFloat;
6
+ use parquet_core::arrow_conversion::{arrow_to_parquet_value, parquet_values_to_arrow_array};
7
+ use parquet_core::*;
8
+ use std::sync::Arc;
9
+
10
#[test]
fn test_float16_conversion() {
    // Float16 values carry an f32 payload here; they must upcast losslessly
    // to both Arrow Float32 and Float64 columns, preserving nulls.
    let values = vec![
        ParquetValue::Float16(OrderedFloat(1.0f32)),
        ParquetValue::Float16(OrderedFloat(-2.5f32)),
        ParquetValue::Float16(OrderedFloat(0.0f32)),
        ParquetValue::Null,
    ];

    // Test upcast to Float32
    let field = Field::new("test", DataType::Float32, true);
    let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
    assert_eq!(array.len(), 4);

    let float_array = array.as_any().downcast_ref::<Float32Array>().unwrap();
    assert_eq!(float_array.value(0), 1.0);
    assert_eq!(float_array.value(1), -2.5);
    assert_eq!(float_array.value(2), 0.0);
    assert!(float_array.is_null(3));

    // Test upcast to Float64
    let field = Field::new("test", DataType::Float64, true);
    let array = parquet_values_to_arrow_array(values, &field).unwrap();
    // Fix: the Float64 path previously skipped the length and null checks
    // that the Float32 path performed, leaving the null slot unverified.
    assert_eq!(array.len(), 4);
    let float_array = array.as_any().downcast_ref::<Float64Array>().unwrap();
    assert_eq!(float_array.value(0), 1.0);
    assert_eq!(float_array.value(1), -2.5);
    assert_eq!(float_array.value(2), 0.0);
    assert!(float_array.is_null(3));
}
38
+
39
#[test]
fn test_fixed_size_binary_conversion() {
    // A 16-byte payload, shaped like a UUID, for a FixedSizeBinary(16) column.
    let uuid_bytes = Bytes::from(vec![
        0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
        0x77, 0x88,
    ]);

    let input = vec![
        ParquetValue::Bytes(uuid_bytes.clone()),
        ParquetValue::Bytes(Bytes::from(vec![0u8; 16])),
        ParquetValue::Null,
    ];

    let field = Field::new("uuid", DataType::FixedSizeBinary(16), true);
    let built = parquet_values_to_arrow_array(input, &field).unwrap();

    // Both byte patterns and the null slot must survive conversion.
    let fixed = built
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();
    assert_eq!(fixed.value(0), uuid_bytes.as_ref());
    assert_eq!(fixed.value(1), vec![0u8; 16]);
    assert!(fixed.is_null(2));
}
63
+
64
#[test]
fn test_fixed_size_binary_wrong_size_error() {
    // Three bytes into a 16-byte fixed-width column must be rejected with a
    // message naming both the expected and actual lengths.
    let input = vec![
        ParquetValue::Bytes(Bytes::from(vec![1, 2, 3])), // Wrong size
    ];
    let field = Field::new("test", DataType::FixedSizeBinary(16), true);

    let result = parquet_values_to_arrow_array(input, &field);
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert!(message.contains("Fixed size binary expected 16 bytes, got 3"));
}
79
+
80
#[test]
fn test_decimal256_large_values() {
    // Seventy-four nines — near the top of what Decimal256(76, 0) can hold.
    let big_positive = BigInt::parse_bytes(
        b"99999999999999999999999999999999999999999999999999999999999999999999999999",
        10,
    )
    .unwrap();
    let big_negative = -big_positive.clone();

    let input = vec![
        ParquetValue::Decimal256(big_positive.clone(), 0),
        ParquetValue::Decimal256(big_negative.clone(), 0),
        ParquetValue::Decimal256(BigInt::from(0), 0),
        ParquetValue::Null,
    ];

    let field = Field::new("test", DataType::Decimal256(76, 0), true);
    let array = parquet_values_to_arrow_array(input, &field).unwrap();

    // Each slot must roundtrip back through arrow_to_parquet_value intact.
    for idx in 0..4 {
        let round_tripped = arrow_to_parquet_value(array.as_ref(), idx).unwrap();
        match (idx, round_tripped) {
            (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, big_positive.clone()),
            (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, big_negative.clone()),
            (2, ParquetValue::Decimal256(v, _)) => assert_eq!(v, BigInt::from(0)),
            (3, ParquetValue::Null) => {}
            _ => panic!("Unexpected value"),
        }
    }
}
112
+
113
#[test]
fn test_decimal256_too_large_error() {
    // 2^256 does not fit in a signed 256-bit integer, so conversion must fail.
    let overflow = BigInt::from(2).pow(256);
    let field = Field::new("test", DataType::Decimal256(76, 0), true);

    let result =
        parquet_values_to_arrow_array(vec![ParquetValue::Decimal256(overflow, 0)], &field);
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert!(message.contains("Decimal256 value too large"));
}
129
+
130
#[test]
fn test_time_type_conversions() {
    // Time32(Millisecond): milliseconds since midnight.
    let values_millis = vec![
        ParquetValue::TimeMillis(12345),
        ParquetValue::TimeMillis(0),
        ParquetValue::TimeMillis(86399999), // Last millisecond of day
        ParquetValue::Null,
    ];

    let field = Field::new("time", DataType::Time32(TimeUnit::Millisecond), true);
    let array = parquet_values_to_arrow_array(values_millis, &field).unwrap();
    assert_eq!(array.len(), 4);
    // Fix: length alone proves little — verify the converted values and the
    // null slot actually land in the Arrow array.
    let millis = array
        .as_any()
        .downcast_ref::<Time32MillisecondArray>()
        .unwrap();
    assert_eq!(millis.value(0), 12345);
    assert_eq!(millis.value(1), 0);
    assert_eq!(millis.value(2), 86399999);
    assert!(millis.is_null(3));

    // Time64(Microsecond): microseconds since midnight.
    let values_micros = vec![
        ParquetValue::TimeMicros(12345678),
        ParquetValue::TimeMicros(0),
        ParquetValue::TimeMicros(86399999999), // Last microsecond of day
        ParquetValue::Null,
    ];

    let field = Field::new("time", DataType::Time64(TimeUnit::Microsecond), true);
    let array = parquet_values_to_arrow_array(values_micros, &field).unwrap();
    assert_eq!(array.len(), 4);
    let micros = array
        .as_any()
        .downcast_ref::<Time64MicrosecondArray>()
        .unwrap();
    assert_eq!(micros.value(0), 12345678);
    assert_eq!(micros.value(1), 0);
    assert_eq!(micros.value(2), 86399999999);
    assert!(micros.is_null(3));
}
156
+
157
#[test]
fn test_timestamp_with_timezone() {
    let tz = Some(Arc::from("America/New_York"));

    let input = vec![
        ParquetValue::TimestampMillis(1234567890123, tz.clone()),
        ParquetValue::TimestampMillis(0, tz.clone()),
        ParquetValue::Null,
    ];

    let field = Field::new(
        "ts",
        DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into())),
        true,
    );
    let array = parquet_values_to_arrow_array(input, &field).unwrap();

    // The timezone must survive the Arrow roundtrip on every non-null slot;
    // only index 2 may come back as Null.
    for idx in 0..3 {
        match arrow_to_parquet_value(array.as_ref(), idx).unwrap() {
            ParquetValue::TimestampMillis(_, Some(zone)) => {
                assert_eq!(zone.as_ref(), "America/New_York");
            }
            ParquetValue::Null => assert_eq!(idx, 2),
            _ => panic!("Unexpected value"),
        }
    }
}
186
+
187
#[test]
fn test_nested_list_of_lists() {
    // One outer row holding: [[1, 2], [3], [], null, [4, 5, 6]]
    let inner = vec![
        ParquetValue::List(vec![ParquetValue::Int32(1), ParquetValue::Int32(2)]),
        ParquetValue::List(vec![ParquetValue::Int32(3)]),
        ParquetValue::List(vec![]),
        ParquetValue::Null,
        ParquetValue::List(vec![
            ParquetValue::Int32(4),
            ParquetValue::Int32(5),
            ParquetValue::Int32(6),
        ]),
    ];

    let item = Field::new("item", DataType::Int32, false);
    let inner_list = Field::new("inner_list", DataType::List(Arc::new(item)), true);
    let outer_list = Field::new("outer_list", DataType::List(Arc::new(inner_list)), false);

    let array =
        parquet_values_to_arrow_array(vec![ParquetValue::List(inner)], &outer_list).unwrap();
    assert_eq!(array.len(), 1);

    // Roundtrip: the single outer row should still contain five inner entries.
    match arrow_to_parquet_value(array.as_ref(), 0).unwrap() {
        ParquetValue::List(items) => assert_eq!(items.len(), 5),
        _ => panic!("Expected list"),
    }
}
218
+
219
#[test]
fn test_map_with_null_values() {
    // First row: a map whose "key2" entry has a null value.
    // Second row: a null map altogether.
    let entries = vec![
        (
            ParquetValue::String(Arc::from("key1")),
            ParquetValue::Int32(100),
        ),
        (ParquetValue::String(Arc::from("key2")), ParquetValue::Null),
        (
            ParquetValue::String(Arc::from("key3")),
            ParquetValue::Int32(300),
        ),
    ];

    let key_field = Field::new("key", DataType::Utf8, false);
    let value_field = Field::new("value", DataType::Int32, true);
    let entries_field = Field::new(
        "entries",
        DataType::Struct(vec![key_field, value_field].into()),
        false,
    );
    let map_field = Field::new("map", DataType::Map(Arc::new(entries_field), false), true);

    let array = parquet_values_to_arrow_array(
        vec![ParquetValue::Map(entries), ParquetValue::Null],
        &map_field,
    )
    .unwrap();
    assert_eq!(array.len(), 2);

    // Row-level null tracking must distinguish the populated map from the null one.
    let maps = array.as_any().downcast_ref::<MapArray>().unwrap();
    assert!(!maps.is_null(0));
    assert!(maps.is_null(1));
}
252
+
253
#[test]
fn test_struct_with_missing_fields() {
    use indexmap::IndexMap;

    // First record omits field2 entirely; it should surface as a null slot.
    let mut first = IndexMap::new();
    first.insert(
        Arc::from("field1"),
        ParquetValue::String(Arc::from("value1")),
    );
    first.insert(Arc::from("field3"), ParquetValue::Int32(42));

    // Second record supplies all three fields.
    let mut second = IndexMap::new();
    second.insert(
        Arc::from("field1"),
        ParquetValue::String(Arc::from("value2")),
    );
    second.insert(Arc::from("field2"), ParquetValue::Boolean(true));
    second.insert(Arc::from("field3"), ParquetValue::Int32(99));

    let input = vec![
        ParquetValue::Record(first),
        ParquetValue::Record(second),
        ParquetValue::Null,
    ];

    let fields = vec![
        Field::new("field1", DataType::Utf8, false),
        Field::new("field2", DataType::Boolean, true), // nullable to handle missing
        Field::new("field3", DataType::Int32, false),
    ];
    let struct_field = Field::new("struct", DataType::Struct(fields.into()), true);

    let array = parquet_values_to_arrow_array(input, &struct_field).unwrap();
    let structs = array.as_any().downcast_ref::<StructArray>().unwrap();
    assert_eq!(structs.len(), 3);

    // field2 column: null where the record omitted it, present where provided.
    let field2 = structs
        .column(1)
        .as_any()
        .downcast_ref::<BooleanArray>()
        .unwrap();
    assert!(field2.is_null(0));
    assert!(!field2.is_null(1));
}
301
+
302
#[test]
fn test_type_mismatch_errors() {
    // Each case feeds a value of the wrong variant into a field and checks
    // that the error names both the expected Arrow type and the offender.
    let expect_mismatch = |values: Vec<ParquetValue>, field: &Field, want: &str, got: &str| {
        let result = parquet_values_to_arrow_array(values, field);
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(
            error_msg.contains(want) && error_msg.contains(got),
            "Error message was: {}",
            error_msg
        );
    };

    // String value into a Boolean column.
    expect_mismatch(
        vec![ParquetValue::String(Arc::from("not a boolean"))],
        &Field::new("test", DataType::Boolean, false),
        "Expected Boolean",
        "String",
    );

    // Float value into an Int32 column.
    expect_mismatch(
        vec![ParquetValue::Float32(OrderedFloat(3.14))],
        &Field::new("test", DataType::Int32, false),
        "Expected Int32",
        "Float32",
    );

    // Scalar value into a List column.
    let item_field = Field::new("item", DataType::Int32, false);
    expect_mismatch(
        vec![ParquetValue::Int32(42)],
        &Field::new("list", DataType::List(Arc::new(item_field)), false),
        "Expected List",
        "Int32",
    );
}
343
+
344
#[test]
fn test_unsupported_arrow_types() {
    // Union arrays have no ParquetValue mapping, so arrow_to_parquet_value
    // must refuse them. Build the smallest possible single-variant union.
    let type_ids = arrow_buffer::ScalarBuffer::from(vec![0i8, 0, 0]);
    let union_fields = arrow_schema::UnionFields::new(
        vec![0],
        vec![Arc::new(Field::new("int", DataType::Int32, false))],
    );

    let union = arrow_array::UnionArray::try_new(
        union_fields,
        type_ids,
        None,
        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
    )
    .unwrap();

    let result = arrow_to_parquet_value(&union, 0);
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert!(message.contains("Unsupported data type for conversion"));
}
367
+
368
#[test]
fn test_integer_overflow_prevention() {
    // Boundary check: i64 extremes must pass through an Int64 column
    // unchanged (no wrapping or truncation on the direct-width path).
    let extremes = vec![ParquetValue::Int64(i64::MAX), ParquetValue::Int64(i64::MIN)];

    let field = Field::new("test", DataType::Int64, false);
    let array = parquet_values_to_arrow_array(extremes, &field).unwrap();

    let ints = array.as_any().downcast_ref::<Int64Array>().unwrap();
    assert_eq!(ints.value(0), i64::MAX);
    assert_eq!(ints.value(1), i64::MIN);
}
380
+
381
+ #[test]
382
+ fn test_empty_collections() {
383
+ // Test empty list
384
+ let values = vec![ParquetValue::List(vec![])];
385
+ let field = Field::new(
386
+ "list",
387
+ DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
388
+ false,
389
+ );
390
+ let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
391
+ let list_array = array.as_any().downcast_ref::<ListArray>().unwrap();
392
+ assert_eq!(list_array.value(0).len(), 0);
393
+
394
+ // Test empty map
395
+ let values = vec![ParquetValue::Map(vec![])];
396
+ let key_field = Field::new("key", DataType::Utf8, false);
397
+ let value_field = Field::new("value", DataType::Int32, true);
398
+ let entries_field = Field::new(
399
+ "entries",
400
+ DataType::Struct(vec![key_field, value_field].into()),
401
+ false,
402
+ );
403
+ let map_field = Field::new("map", DataType::Map(Arc::new(entries_field), false), false);
404
+ let array = parquet_values_to_arrow_array(values, &map_field).unwrap();
405
+ let map_array = array.as_any().downcast_ref::<MapArray>().unwrap();
406
+ assert_eq!(map_array.value(0).len(), 0);
407
+
408
+ // Test empty struct (all fields null)
409
+ use indexmap::IndexMap;
410
+ let empty_record = IndexMap::new();
411
+ let values = vec![ParquetValue::Record(empty_record)];
412
+ let fields = vec![
413
+ Field::new("field1", DataType::Utf8, true),
414
+ Field::new("field2", DataType::Int32, true),
415
+ ];
416
+ let struct_field = Field::new("struct", DataType::Struct(fields.into()), false);
417
+ let array = parquet_values_to_arrow_array(values, &struct_field).unwrap();
418
+ let struct_array = array.as_any().downcast_ref::<StructArray>().unwrap();
419
+
420
+ // All fields should be null
421
+ assert!(struct_array.column(0).is_null(0));
422
+ assert!(struct_array.column(1).is_null(0));
423
+ }