parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,540 @@
1
+ use bytes::Bytes;
2
+ use indexmap::IndexMap;
3
+ use parquet_core::*;
4
+ use triomphe::Arc;
5
+
6
+ mod test_helpers;
7
+
8
+ // ====== Schema Construction Errors ======
9
+
10
/// A builder that was never given a root node must refuse to build and
/// report the exact "must have a root node" message.
#[test]
fn test_schema_builder_error_cases() {
    let outcome = SchemaBuilder::new().build();

    assert!(outcome.is_err());
    let message = outcome.unwrap_err();
    assert_eq!(message, "Schema must have a root node");
}
17
+
18
/// A struct with no fields, placed next to otherwise valid list and map
/// fields, must make schema construction fail with an error naming the
/// offending path.
#[test]
fn test_empty_struct_unsupported() {
    let list_field = SchemaNode::List {
        name: "empty_list".to_string(),
        nullable: false,
        item: Box::new(SchemaNode::Primitive {
            name: "item".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }),
    };

    let map_field = SchemaNode::Map {
        name: "empty_map".to_string(),
        nullable: false,
        key: Box::new(SchemaNode::Primitive {
            name: "key".to_string(),
            primitive_type: PrimitiveType::String,
            nullable: false,
            format: None,
        }),
        value: Box::new(SchemaNode::Primitive {
            name: "value".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }),
    };

    // The only invalid node: a struct that declares no fields at all.
    let fieldless_struct = SchemaNode::Struct {
        name: "empty_struct".to_string(),
        nullable: false,
        fields: Vec::new(),
    };

    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![list_field, map_field, fieldless_struct],
    };

    let error = SchemaBuilder::new().with_root(root).build().unwrap_err();

    assert_eq!(
        error,
        "Struct field 'root.empty_struct' must contain at least one field"
    );
}
67
+
68
+ // ====== Field Count Validation Errors ======
69
+
70
/// Rows whose number of values differs from the schema's three fields must be
/// rejected, both through `Writer::new` + `write_rows` and through
/// `WriterBuilder` + `write_row`, with a message stating both counts.
#[test]
fn test_field_count_mismatch() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "field1".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "field2".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "field3".to_string(),
                    primitive_type: PrimitiveType::Float64,
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let mut buffer = Vec::new();

    // Test using Writer
    {
        let mut writer = Writer::new(&mut buffer, schema.clone()).unwrap();

        // Row with too few fields: the third value is missing.
        let result = writer.write_rows(vec![vec![
            ParquetValue::Int32(1),
            ParquetValue::String(Arc::from("test")),
        ]]);

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(
            err_msg.contains("Row has 2 values") && err_msg.contains("schema has 3 fields"),
            "Error message was: {}",
            err_msg
        );
    }

    // Test using WriterBuilder
    {
        buffer.clear();
        // Last use of `schema`, so it is moved rather than cloned.
        let mut writer = WriterBuilder::new().build(&mut buffer, schema).unwrap();

        // Row with too few fields: the second and third values are missing.
        let result = writer.write_row(vec![ParquetValue::Int32(42)]);

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(
            err_msg.contains("Row has 1 values but schema has 3 fields"),
            "Error message was: {}",
            err_msg
        );

        // Row with too many fields: the trailing boolean is extra.
        let result = writer.write_row(vec![
            ParquetValue::Int32(42),
            ParquetValue::String(Arc::from("test")),
            // Arbitrary float; deliberately not a truncated well-known constant
            // such as 3.14, which Clippy's `approx_constant` lint rejects.
            ParquetValue::Float64(ordered_float::OrderedFloat(2.5)),
            ParquetValue::Boolean(true),
        ]);

        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(
            err_msg.contains("Row has 4 values but schema has 3 fields"),
            "Error message was: {}",
            err_msg
        );
    }
}
156
+
157
+ // ====== Type Mismatch Errors ======
158
+
159
/// Writing a row whose values do not match the declared column types must
/// fail with a "Type mismatch" error that names the expected type.
#[test]
fn test_type_mismatch() {
    let columns = vec![
        SchemaNode::Primitive {
            name: "int_field".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        },
        SchemaNode::Primitive {
            name: "string_field".to_string(),
            primitive_type: PrimitiveType::String,
            nullable: false,
            format: None,
        },
    ];
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: columns,
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();

        // Each column receives a value of the other column's type.
        let swapped_row = vec![
            ParquetValue::String(Arc::from("not an int")),
            ParquetValue::Int32(123),
        ];
        let outcome = writer.write_rows(vec![swapped_row]);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("Type mismatch") && message.contains("expected Int32"),
            "Error message was: {}",
            message
        );
    }
}
202
+
203
+ // ====== Null Validation Errors ======
204
+
205
/// A `ParquetValue::Null` written into a column declared `nullable: false`
/// must be rejected with a message mentioning both "null value" and
/// "non-nullable".
#[test]
fn test_null_in_non_nullable_field() {
    let required_column = SchemaNode::Primitive {
        name: "required_field".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![required_column],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();

        let null_row = vec![ParquetValue::Null];
        let outcome = writer.write_rows(vec![null_row]);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("null value") && message.contains("non-nullable"),
            "Error message was: {}",
            message
        );
    }
}
237
+
238
+ // ====== Complex Type Validation Errors ======
239
+
240
/// A record value that omits a non-nullable field of a nested struct must be
/// rejected with a "Required field" error naming the missing field.
#[test]
fn test_invalid_struct_fields() {
    let nested = SchemaNode::Struct {
        name: "nested".to_string(),
        nullable: false,
        fields: vec![
            SchemaNode::Primitive {
                name: "field1".to_string(),
                primitive_type: PrimitiveType::Int32,
                nullable: false,
                format: None,
            },
            SchemaNode::Primitive {
                name: "field2".to_string(),
                primitive_type: PrimitiveType::String,
                nullable: false,
                format: None,
            },
        ],
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![nested],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();

        // Only `field1` is supplied; `field2` is intentionally left out.
        let mut partial_record = IndexMap::new();
        partial_record.insert(Arc::from("field1"), ParquetValue::Int32(42));

        let row = vec![ParquetValue::Record(partial_record)];
        let outcome = writer.write_rows(vec![row]);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("Required field") && message.contains("field2"),
            "Error message was: {}",
            message
        );
    }
}
288
+
289
/// A list declared with `Int32` items must reject a list containing a string
/// element, reporting a "Type mismatch" that names the expected type.
#[test]
fn test_invalid_list_element_type() {
    let element = SchemaNode::Primitive {
        name: "item".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let list_column = SchemaNode::List {
        name: "int_list".to_string(),
        nullable: false,
        item: Box::new(element),
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![list_column],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();

        // A single-element list whose element has the wrong type.
        let bad_list = ParquetValue::List(vec![ParquetValue::String(Arc::from("not an int"))]);
        let outcome = writer.write_rows(vec![vec![bad_list]]);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("Type mismatch") && message.contains("expected Int32"),
            "Error message was: {}",
            message
        );
    }
}
327
+
328
/// A map declared as string -> int32 must reject an entry whose key is an
/// integer, reporting a "Type mismatch" that names the expected key type.
///
/// Only the wrong-key case is exercised here; the value supplied is valid.
#[test]
fn test_invalid_map_key_value_types() {
    /// Builds a non-nullable primitive node without a format.
    fn required(name: &str, primitive_type: PrimitiveType) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable: false,
            format: None,
        }
    }

    let map_column = SchemaNode::Map {
        name: "string_int_map".to_string(),
        nullable: false,
        key: Box::new(required("key", PrimitiveType::String)),
        value: Box::new(required("value", PrimitiveType::Int32)),
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![map_column],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();

        // The key is an Int32 although the schema declares string keys.
        let entry = (ParquetValue::Int32(42), ParquetValue::Int32(100));
        let outcome = writer.write_rows(vec![vec![ParquetValue::Map(vec![entry])]]);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(
            message.contains("Type mismatch") && message.contains("expected Utf8"),
            "Error message was: {}",
            message
        );
    }
}
373
+
374
+ // ====== Unsupported Features ======
375
+
376
/// Exercises writing a map whose values are structs.
///
/// The test accepts two outcomes:
/// * the write fails with `ParquetError::Conversion` mentioning that maps with
///   struct values are not yet supported, or
/// * the write succeeds, in which case the file is closed and read back and
///   must contain exactly one row.
///
/// Any other error fails the test and reports the error that was returned.
#[test]
fn test_map_with_struct_values_unsupported() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Map {
                name: "map_field".to_string(),
                nullable: false,
                key: Box::new(SchemaNode::Primitive {
                    name: "key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                }),
                value: Box::new(SchemaNode::Struct {
                    name: "value_struct".to_string(),
                    nullable: false,
                    fields: vec![SchemaNode::Primitive {
                        name: "field".to_string(),
                        primitive_type: PrimitiveType::Int32,
                        nullable: false,
                        format: None,
                    }],
                }),
            }],
        })
        .build()
        .unwrap();

    // One row holding a single map entry: "key1" -> { field: 42 }.
    let mut struct_value = IndexMap::new();
    struct_value.insert(Arc::from("field"), ParquetValue::Int32(42));
    let row = vec![ParquetValue::Map(vec![(
        ParquetValue::String(Arc::from("key1")),
        ParquetValue::Record(struct_value),
    )])];

    let mut buffer = Vec::new();
    let mut writer = Writer::new(&mut buffer, schema).unwrap();
    let result = writer.write_rows(vec![row]);

    match result {
        // The limitation is still in place: it must be reported as a Conversion error.
        Err(ParquetError::Conversion(msg)) => {
            assert!(
                msg.contains("Maps with struct values are not yet supported"),
                "Error message was: {}",
                msg
            );
        }
        // Any other failure is unexpected; surface it instead of hiding it.
        Err(other) => panic!(
            "Expected Conversion error about maps with struct values, got: {:?}",
            other
        ),
        // The limitation has been lifted: verify the data can be read back.
        Ok(_) => {
            writer.close().unwrap();

            let bytes = Bytes::from(buffer);
            let reader = Reader::new(bytes);

            let read_rows: Vec<_> = reader
                .read_rows()
                .unwrap()
                .collect::<Result<Vec<_>>>()
                .unwrap();

            assert_eq!(read_rows.len(), 1);
        }
    }
}
447
+
448
+ // ====== Writer State Errors ======
449
+
450
/// Writes a single row and then closes the writer, expecting both steps to
/// succeed.
///
/// Per the original author's note, `close()` consumes the writer, so a second
/// close or a later write is ruled out at compile time; nothing further is
/// checked at runtime.
#[test]
fn test_writer_multiple_close() {
    let value_column = SchemaNode::Primitive {
        name: "value".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![value_column],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut buffer = Vec::new();
    let mut writer = Writer::new(&mut buffer, schema).unwrap();

    // One valid row so the writer has something to flush on close.
    let rows = vec![vec![ParquetValue::Int32(1)]];
    writer.write_rows(rows).unwrap();

    writer.close().unwrap();
}
481
+
482
+ // ====== Invalid Collection Schemas ======
483
+
484
/// A list or map whose element/value type is a struct without fields must be
/// rejected at schema-build time with an error naming the empty struct's path.
///
/// Each case carries its own expected message, so adding a case cannot fall
/// through to an unrelated branch; the case label is included in assertion
/// failures to identify which schema misbehaved.
#[test]
fn test_invalid_collection_schemas() {
    let test_cases = vec![
        (
            "list_without_item",
            SchemaNode::List {
                name: "invalid_list".to_string(),
                nullable: false,
                item: Box::new(SchemaNode::Struct {
                    name: "empty".to_string(),
                    nullable: false,
                    fields: vec![],
                }),
            },
            "Struct field 'root.invalid_list.empty' must contain at least one field",
        ),
        (
            "map_without_value",
            SchemaNode::Map {
                name: "invalid_map".to_string(),
                nullable: false,
                key: Box::new(SchemaNode::Primitive {
                    name: "key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                }),
                value: Box::new(SchemaNode::Struct {
                    name: "empty".to_string(),
                    nullable: false,
                    fields: vec![],
                }),
            },
            "Struct field 'root.invalid_map.empty' must contain at least one field",
        ),
    ];

    for (name, invalid_node, expected) in test_cases {
        let error = SchemaBuilder::new()
            .with_root(SchemaNode::Struct {
                name: "root".to_string(),
                nullable: false,
                fields: vec![invalid_node],
            })
            .build()
            .unwrap_err();

        assert_eq!(error, expected, "unexpected error for case '{}'", name);
    }
}