parquet 0.5.13 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +162 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +817 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +201 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,437 @@
1
+ use bytes::Bytes;
2
+ use indexmap::IndexMap;
3
+ use parquet_core::*;
4
+ use std::sync::Arc;
5
+
6
/// Round-trips `(Int32 id, Binary data)` rows through an in-memory buffer and
/// checks that every row read back equals the row that was written.
///
/// Payloads exercised: empty, four ascending bytes, every value `0..=255`,
/// four zero bytes, and an eight-byte arbitrary pattern.
#[test]
fn test_binary_data_basic() {
    let id_field = SchemaNode::Primitive {
        name: "id".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let data_field = SchemaNode::Primitive {
        name: "data".to_string(),
        primitive_type: PrimitiveType::Binary,
        nullable: false,
        format: None,
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![id_field, data_field],
        })
        .build()
        .unwrap();

    // One payload per row; ids are assigned 1, 2, 3, ... in list order.
    let payloads: Vec<Vec<u8>> = vec![
        // Empty binary data
        Vec::new(),
        // Small binary data
        vec![0x00, 0x01, 0x02, 0x03],
        // Every possible byte value
        (0u8..=255u8).collect(),
        // Only null bytes
        vec![0x00; 4],
        // Arbitrary byte pattern
        vec![0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE],
    ];
    let expected_rows: Vec<Vec<ParquetValue>> = payloads
        .into_iter()
        .zip(1..)
        .map(|(payload, id)| {
            vec![
                ParquetValue::Int32(id),
                ParquetValue::Bytes(Bytes::from(payload)),
            ]
        })
        .collect();

    // Serialize into a Vec; the inner scope ends the writer's borrow of it.
    let mut encoded = Vec::new();
    {
        let mut sink = Writer::new(&mut encoded, schema).unwrap();
        sink.write_rows(expected_rows.clone()).unwrap();
        sink.close().unwrap();
    }

    // Decode everything that was just written.
    let reader = Reader::new(Bytes::from(encoded));
    let row_iter = reader.read_rows().unwrap();
    let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(decoded_rows.len(), expected_rows.len());

    // Row-by-row comparison: binary payloads must survive byte-for-byte.
    expected_rows
        .iter()
        .zip(decoded_rows.iter())
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
85
+
86
/// Writes a single-row file holding one large binary blob and reads it back,
/// for blob sizes of 1 KB, 10 KB, 100 KB and 1 MB.
///
/// Byte `i` of each blob is `i % 256`, so the expected content can be
/// regenerated from the size alone.
#[test]
fn test_large_binary_data() {
    let blob_field = SchemaNode::Primitive {
        name: "blob".to_string(),
        primitive_type: PrimitiveType::Binary,
        nullable: false,
        format: None,
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![blob_field],
        })
        .build()
        .unwrap();

    let blob_sizes: [usize; 4] = [
        1024,        // 1 KB
        10 * 1024,   // 10 KB
        100 * 1024,  // 100 KB
        1024 * 1024, // 1 MB
    ];

    for size in blob_sizes {
        // Repeating 0..=255 ramp of the requested length.
        let pattern: Vec<u8> = (0..size).map(|offset| (offset % 256) as u8).collect();
        let blob = Bytes::from(pattern);

        let single_row = vec![vec![ParquetValue::Bytes(blob.clone())]];

        // The schema is cloned because each iteration builds a fresh writer.
        let mut encoded = Vec::new();
        {
            let mut sink = Writer::new(&mut encoded, schema.clone()).unwrap();
            sink.write_rows(single_row).unwrap();
            sink.close().unwrap();
        }

        // Decode and check the one row that was written.
        let reader = Reader::new(Bytes::from(encoded));
        let row_iter = reader.read_rows().unwrap();
        let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

        assert_eq!(decoded_rows.len(), 1);

        if let ParquetValue::Bytes(data) = &decoded_rows[0][0] {
            assert_eq!(data.len(), size);
            assert_eq!(data, &blob);
        } else {
            panic!("Expected binary value");
        }
    }
}
143
+
144
/// Round-trips a nullable binary column whose rows alternate between
/// populated values, `Null`, and an empty (zero-length) value.
///
/// The final comparison requires `Null` and empty bytes to come back as
/// written, i.e. they must not be conflated.
#[test]
fn test_nullable_binary() {
    let optional_field = SchemaNode::Primitive {
        name: "optional_data".to_string(),
        primitive_type: PrimitiveType::Binary,
        nullable: true,
        format: None,
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![optional_field],
        })
        .build()
        .unwrap();

    // Wraps raw bytes in a binary cell.
    let binary = |raw: Vec<u8>| ParquetValue::Bytes(Bytes::from(raw));

    let expected_rows = vec![
        vec![binary(vec![1, 2, 3])],
        vec![ParquetValue::Null],
        vec![binary(vec![])],
        vec![ParquetValue::Null],
        vec![binary(vec![255, 254, 253])],
    ];

    // Serialize into a Vec; the inner scope ends the writer's borrow of it.
    let mut encoded = Vec::new();
    {
        let mut sink = Writer::new(&mut encoded, schema).unwrap();
        sink.write_rows(expected_rows.clone()).unwrap();
        sink.close().unwrap();
    }

    // Decode everything that was just written.
    let reader = Reader::new(Bytes::from(encoded));
    let row_iter = reader.read_rows().unwrap();
    let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(decoded_rows.len(), expected_rows.len());

    // Nulls and empty binaries must each come back exactly as written.
    expected_rows
        .iter()
        .zip(decoded_rows.iter())
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
193
+
194
/// Round-trips rows made of a 16-byte "uuid" value and a 32-byte "hash" value.
///
/// Despite the test name, both columns are declared as variable-length
/// `PrimitiveType::Binary`; only the payload lengths are fixed here. A true
/// fixed-length column type would be the ideal choice if the schema API
/// offers one (not visible from this file).
#[test]
fn test_fixed_size_binary() {
    let uuid_field = SchemaNode::Primitive {
        name: "uuid".to_string(),
        primitive_type: PrimitiveType::Binary, // Ideally would be FixedBytes(16)
        nullable: false,
        format: None,
    };
    let hash_field = SchemaNode::Primitive {
        name: "hash".to_string(),
        primitive_type: PrimitiveType::Binary, // Ideally would be FixedBytes(32)
        nullable: false,
        format: None,
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![uuid_field, hash_field],
        })
        .build()
        .unwrap();

    // Array lengths in the signature make the compiler check the payload sizes.
    let make_row = |uuid: [u8; 16], hash: [u8; 32]| -> Vec<ParquetValue> {
        vec![
            ParquetValue::Bytes(Bytes::from(uuid.to_vec())),
            ParquetValue::Bytes(Bytes::from(hash.to_vec())),
        ]
    };

    let expected_rows = vec![
        make_row(
            // 16-byte UUID-like value
            [
                0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x12, 0x34, 0x56, 0x78, 0x9a,
                0xbc, 0xde, 0xf0,
            ],
            // 32-byte hash-like value
            [
                0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc,
                0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99,
                0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
            ],
        ),
        make_row(
            // Another UUID
            [
                0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c,
                0x2d, 0x1e, 0x0f,
            ],
            // Another hash
            [
                0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33,
                0x22, 0x11, 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, 0x77, 0x66,
                0x55, 0x44, 0x33, 0x22, 0x11, 0x00,
            ],
        ),
    ];

    // Serialize into a Vec; the inner scope ends the writer's borrow of it.
    let mut encoded = Vec::new();
    {
        let mut sink = Writer::new(&mut encoded, schema).unwrap();
        sink.write_rows(expected_rows.clone()).unwrap();
        sink.close().unwrap();
    }

    // Decode everything that was just written.
    let reader = Reader::new(Bytes::from(encoded));
    let row_iter = reader.read_rows().unwrap();
    let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(decoded_rows.len(), expected_rows.len());

    expected_rows
        .iter()
        .zip(decoded_rows.iter())
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
271
+
272
/// Round-trips rows that pair a `String` column with a `Binary` column and
/// checks that each cell comes back with the variant it was written with.
///
/// Cases: identical UTF-8 bytes stored in both columns, a binary cell that
/// is not valid UTF-8, and an empty string next to an empty binary.
#[test]
fn test_binary_string_interoperability() {
    let text_field = SchemaNode::Primitive {
        name: "text".to_string(),
        primitive_type: PrimitiveType::String,
        nullable: false,
        format: None,
    };
    let binary_field = SchemaNode::Primitive {
        name: "binary".to_string(),
        primitive_type: PrimitiveType::Binary,
        nullable: false,
        format: None,
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![text_field, binary_field],
        })
        .build()
        .unwrap();

    // Small constructors for the two cell kinds under test.
    let text = |s: &str| ParquetValue::String(Arc::from(s));
    let binary = |raw: Vec<u8>| ParquetValue::Bytes(Bytes::from(raw));

    let sample = "Hello, 世界! 🦀";

    let expected_rows = vec![
        // Same bytes in both columns: multi-byte UTF-8 as text and as raw bytes.
        vec![text(sample), binary(sample.as_bytes().to_vec())],
        // Binary cell holds bytes that are invalid UTF-8.
        vec![text("Regular ASCII text"), binary(vec![0xff, 0xfe, 0xfd])],
        // Empty string alongside empty binary.
        vec![text(""), binary(vec![])],
    ];

    // Serialize into a Vec; the inner scope ends the writer's borrow of it.
    let mut encoded = Vec::new();
    {
        let mut sink = Writer::new(&mut encoded, schema).unwrap();
        sink.write_rows(expected_rows.clone()).unwrap();
        sink.close().unwrap();
    }

    // Decode everything that was just written.
    let reader = Reader::new(Bytes::from(encoded));
    let row_iter = reader.read_rows().unwrap();
    let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(decoded_rows.len(), expected_rows.len());

    // Equality is variant-sensitive, so strings and binaries must stay separate.
    expected_rows
        .iter()
        .zip(decoded_rows.iter())
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
339
+
340
/// Round-trips binary values nested inside a list column and a struct column.
///
/// The struct has a required `data1` and a nullable `data2`; the second row
/// stores `Null` in `data2` and a list containing one empty binary.
#[test]
fn test_binary_in_complex_types() {
    // Builds a binary leaf node; only the name and nullability vary.
    let binary_leaf = |name: &str, nullable: bool| SchemaNode::Primitive {
        name: name.to_string(),
        primitive_type: PrimitiveType::Binary,
        nullable,
        format: None,
    };

    let list_field = SchemaNode::List {
        name: "binary_list".to_string(),
        nullable: false,
        item: Box::new(binary_leaf("item", false)),
    };
    let struct_field = SchemaNode::Struct {
        name: "binary_struct".to_string(),
        nullable: false,
        fields: vec![binary_leaf("data1", false), binary_leaf("data2", true)],
    };
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![list_field, struct_field],
        })
        .build()
        .unwrap();

    let binary = |raw: Vec<u8>| ParquetValue::Bytes(Bytes::from(raw));

    // Inserts data1 before data2, matching the schema's field order.
    let record = |data1: ParquetValue, data2: ParquetValue| {
        let mut fields = IndexMap::new();
        fields.insert(Arc::from("data1"), data1);
        fields.insert(Arc::from("data2"), data2);
        ParquetValue::Record(fields)
    };

    let expected_rows = vec![
        vec![
            ParquetValue::List(vec![
                binary(vec![1, 2, 3]),
                binary(vec![4, 5, 6]),
                binary(vec![7, 8, 9]),
            ]),
            record(binary(vec![0xAA, 0xBB]), binary(vec![0xCC, 0xDD])),
        ],
        vec![
            ParquetValue::List(vec![binary(vec![])]),
            record(binary(vec![0xFF]), ParquetValue::Null),
        ],
    ];

    // Serialize into a Vec; the inner scope ends the writer's borrow of it.
    let mut encoded = Vec::new();
    {
        let mut sink = Writer::new(&mut encoded, schema).unwrap();
        sink.write_rows(expected_rows.clone()).unwrap();
        sink.close().unwrap();
    }

    // Decode everything that was just written.
    let reader = Reader::new(Bytes::from(encoded));
    let row_iter = reader.read_rows().unwrap();
    let decoded_rows: Vec<_> = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(decoded_rows.len(), expected_rows.len());

    expected_rows
        .iter()
        .zip(decoded_rows.iter())
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}