parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,547 @@
1
+ use bytes::Bytes;
2
+ use indexmap::IndexMap;
3
+ use parquet_core::*;
4
+ use std::sync::Arc;
5
+
6
+ mod test_helpers;
7
+
8
+ // ====== Schema Construction Errors ======
9
+
10
#[test]
fn test_schema_builder_error_cases() {
    // A builder that never received a root node must refuse to build.
    let built = SchemaBuilder::new().build();
    assert!(built.is_err());
    assert_eq!(built.unwrap_err(), "Schema must have a root node");
}
17
+
18
#[test]
fn test_empty_struct_unsupported() {
    // Parquet cannot represent a struct with zero fields; writer creation
    // should reject a schema that contains one, even when sibling fields
    // (a list and a map) are themselves valid.
    let list_field = SchemaNode::List {
        name: "empty_list".to_string(),
        nullable: false,
        item: Box::new(SchemaNode::Primitive {
            name: "item".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }),
    };
    let map_field = SchemaNode::Map {
        name: "empty_map".to_string(),
        nullable: false,
        key: Box::new(SchemaNode::Primitive {
            name: "key".to_string(),
            primitive_type: PrimitiveType::String,
            nullable: false,
            format: None,
        }),
        value: Box::new(SchemaNode::Primitive {
            name: "value".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }),
    };
    let fieldless_struct = SchemaNode::Struct {
        name: "empty_struct".to_string(),
        nullable: false,
        fields: vec![], // Empty struct - not supported by Parquet
    };

    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![list_field, map_field, fieldless_struct],
        })
        .build()
        .unwrap();

    // Writer construction is where the empty struct gets rejected.
    let mut sink = Vec::new();
    match Writer::new(&mut sink, schema.clone()) {
        Err(ParquetError::Parquet(e)) => {
            assert!(e.to_string().contains("empty struct"));
        }
        _ => panic!("Expected Parquet error about empty structs"),
    }
}
75
+
76
+ // ====== Field Count Validation Errors ======
77
+
78
#[test]
fn test_field_count_mismatch() {
    // Rows whose value count differs from the schema's field count must be
    // rejected by both Writer::new and WriterBuilder-built writers.
    let primitive = |name: &str, primitive_type: PrimitiveType| SchemaNode::Primitive {
        name: name.to_string(),
        primitive_type,
        nullable: false,
        format: None,
    };

    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                primitive("field1", PrimitiveType::Int32),
                primitive("field2", PrimitiveType::String),
                primitive("field3", PrimitiveType::Float64),
            ],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();

    // Via Writer::new: a row with too few values.
    {
        let mut writer = Writer::new(&mut sink, schema.clone()).unwrap();

        let short_row = vec![
            ParquetValue::Int32(1),
            ParquetValue::String(Arc::from("test")),
            // Missing third field
        ];
        let outcome = writer.write_rows(vec![short_row]);

        assert!(outcome.is_err());
        let err_msg = outcome.unwrap_err().to_string();
        assert!(
            err_msg.contains("Row has 2 values") && err_msg.contains("schema has 3 fields"),
            "Error message was: {}",
            err_msg
        );
    }

    // Via WriterBuilder: first too few values, then too many.
    {
        sink.clear();
        let mut writer = WriterBuilder::new()
            .build(&mut sink, schema.clone())
            .unwrap();

        let outcome = writer.write_row(vec![
            ParquetValue::Int32(42),
            // Missing second and third fields
        ]);
        assert!(outcome.is_err());
        assert!(outcome
            .unwrap_err()
            .to_string()
            .contains("Row has 1 values but schema has 3 fields"));

        let outcome = writer.write_row(vec![
            ParquetValue::Int32(42),
            ParquetValue::String(Arc::from("test")),
            ParquetValue::Float64(ordered_float::OrderedFloat(3.14)),
            ParquetValue::Boolean(true), // Extra field
        ]);
        assert!(outcome.is_err());
        assert!(outcome
            .unwrap_err()
            .to_string()
            .contains("Row has 4 values but schema has 3 fields"));
    }
}
164
+
165
+ // ====== Type Mismatch Errors ======
166
+
167
#[test]
fn test_type_mismatch() {
    // A value whose type disagrees with the schema column type must be rejected.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "int_field".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "string_field".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    // Both columns receive a value of the wrong type.
    let bad_row = vec![
        ParquetValue::String(Arc::from("not an int")), // Wrong type for int_field
        ParquetValue::Int32(123),                      // Wrong type for string_field
    ];
    let outcome = writer.write_rows(vec![bad_row]);

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(
        err_msg.contains("Type mismatch") && err_msg.contains("expected Int32"),
        "Error message was: {}",
        err_msg
    );
}
210
+
211
+ // ====== Null Validation Errors ======
212
+
213
#[test]
fn test_null_in_non_nullable_field() {
    // Writing a null into a non-nullable column must fail with a clear message.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "required_field".to_string(),
                primitive_type: PrimitiveType::Int32,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    let outcome = writer.write_rows(vec![vec![ParquetValue::Null]]);

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(
        err_msg.contains("null value") && err_msg.contains("non-nullable"),
        "Error message was: {}",
        err_msg
    );
}
245
+
246
+ // ====== Complex Type Validation Errors ======
247
+
248
#[test]
fn test_invalid_struct_fields() {
    // A struct value that omits one of its required fields must be rejected.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Struct {
                name: "nested".to_string(),
                nullable: false,
                fields: vec![
                    SchemaNode::Primitive {
                        name: "field1".to_string(),
                        primitive_type: PrimitiveType::Int32,
                        nullable: false,
                        format: None,
                    },
                    SchemaNode::Primitive {
                        name: "field2".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: false,
                        format: None,
                    },
                ],
            }],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    // Provide only field1; field2 is intentionally missing.
    let mut partial = IndexMap::new();
    partial.insert(Arc::from("field1"), ParquetValue::Int32(42));

    let outcome = writer.write_rows(vec![vec![ParquetValue::Record(partial)]]);

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(
        err_msg.contains("Required field") && err_msg.contains("field2"),
        "Error message was: {}",
        err_msg
    );
}
296
+
297
#[test]
fn test_invalid_list_element_type() {
    // List elements must match the declared item type.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::List {
                name: "int_list".to_string(),
                nullable: false,
                item: Box::new(SchemaNode::Primitive {
                    name: "item".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                }),
            }],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    // A string element inside a list declared as Int32.
    let bad_list = ParquetValue::List(vec![ParquetValue::String(Arc::from("not an int"))]);
    let outcome = writer.write_rows(vec![vec![bad_list]]);

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(
        err_msg.contains("Type mismatch") && err_msg.contains("expected Int32"),
        "Error message was: {}",
        err_msg
    );
}
335
+
336
#[test]
fn test_invalid_map_key_value_types() {
    // Map keys must match the declared key type (String here).
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Map {
                name: "string_int_map".to_string(),
                nullable: false,
                key: Box::new(SchemaNode::Primitive {
                    name: "key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                }),
                value: Box::new(SchemaNode::Primitive {
                    name: "value".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                }),
            }],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    // An Int32 key where a String key is required.
    let bad_map = ParquetValue::Map(vec![(
        ParquetValue::Int32(42), // Wrong key type
        ParquetValue::Int32(100),
    )]);
    let outcome = writer.write_rows(vec![vec![bad_map]]);

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(
        err_msg.contains("Type mismatch") && err_msg.contains("expected Utf8"),
        "Error message was: {}",
        err_msg
    );
}
381
+
382
+ // ====== Unsupported Features ======
383
+
384
#[test]
fn test_map_with_struct_values_unsupported() {
    // Maps with struct values were historically unsupported. This test accepts
    // either outcome: the documented Conversion error, or (if the limitation
    // has been lifted) a successful write that round-trips cleanly.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Map {
                name: "map_field".to_string(),
                nullable: false,
                key: Box::new(SchemaNode::Primitive {
                    name: "key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                }),
                value: Box::new(SchemaNode::Struct {
                    name: "value_struct".to_string(),
                    nullable: false,
                    fields: vec![SchemaNode::Primitive {
                        name: "field".to_string(),
                        primitive_type: PrimitiveType::Int32,
                        nullable: false,
                        format: None,
                    }],
                }),
            }],
        })
        .build()
        .unwrap();

    // One map entry whose value is a single-field struct.
    let mut struct_value = IndexMap::new();
    struct_value.insert(Arc::from("field"), ParquetValue::Int32(42));
    let row = vec![ParquetValue::Map(vec![(
        ParquetValue::String(Arc::from("key1")),
        ParquetValue::Record(struct_value),
    )])];

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    match writer.write_rows(vec![row]) {
        Err(ParquetError::Conversion(msg)) => {
            assert!(msg.contains("Maps with struct values are not yet supported"));
        }
        Err(_) => panic!("Expected Conversion error about maps with struct values"),
        Ok(_) => {
            // The limitation has been fixed; verify the data reads back.
            writer.close().unwrap();

            let bytes = Bytes::from(sink);
            let reader = Reader::new(bytes);

            let rows: Vec<_> = reader
                .read_rows()
                .unwrap()
                .collect::<Result<Vec<_>>>()
                .unwrap();

            assert_eq!(rows.len(), 1);
        }
    }
}
455
+
456
+ // ====== Writer State Errors ======
457
+
458
#[test]
fn test_writer_multiple_close() {
    // close() consumes the writer by value, so double-close or write-after-close
    // are compile-time errors; this exercises the single valid sequence.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "value".to_string(),
                primitive_type: PrimitiveType::Int32,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    let mut sink = Vec::new();
    let mut writer = Writer::new(&mut sink, schema).unwrap();

    // Write one row, then close once.
    writer
        .write_rows(vec![vec![ParquetValue::Int32(1)]])
        .unwrap();

    // Ownership moves into close(); any further use of `writer` would be
    // rejected by the borrow checker, which is the property under test.
    writer.close().unwrap();
}
489
+
490
+ // ====== Invalid Collection Schemas ======
491
+
492
#[test]
fn test_invalid_collection_schemas() {
    // Each case nests an empty struct (not representable in Parquet) inside a
    // collection. The original labels ("list_without_item", "map_without_value")
    // misdescribed the constructed schemas — both DO have an item/value, namely
    // an empty struct — so the labels now say what is actually built.
    let test_cases = vec![
        (
            "list_with_empty_struct_item",
            SchemaNode::List {
                name: "invalid_list".to_string(),
                nullable: false,
                item: Box::new(SchemaNode::Struct {
                    name: "empty".to_string(),
                    nullable: false,
                    fields: vec![],
                }),
            },
        ),
        (
            "map_with_empty_struct_value",
            SchemaNode::Map {
                name: "invalid_map".to_string(),
                nullable: false,
                key: Box::new(SchemaNode::Primitive {
                    name: "key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                }),
                value: Box::new(SchemaNode::Struct {
                    name: "empty".to_string(),
                    nullable: false,
                    fields: vec![],
                }),
            },
        ),
    ];

    for (name, invalid_node) in test_cases {
        let result = SchemaBuilder::new()
            .with_root(SchemaNode::Struct {
                name: "root".to_string(),
                nullable: false,
                fields: vec![invalid_node],
            })
            .build();

        // Document the observed behavior: some invalid collection schemas are
        // accepted at build time and only fail later at write time.
        match result {
            Ok(_) => {
                // Accepted by the builder; rejection (if any) is deferred to
                // the writer.
            }
            Err(e) => {
                println!("Schema validation for {}: {}", name, e);
            }
        }
    }
}