parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,518 @@
1
+ use bytes::Bytes;
2
+ use parquet_core::*;
3
+ use std::sync::Arc;
4
+
5
+ mod test_helpers;
6
+ use test_helpers::*;
7
+
8
+ #[test]
9
+ fn test_date_types() {
10
+ let schema = SchemaBuilder::new()
11
+ .with_root(SchemaNode::Struct {
12
+ name: "root".to_string(),
13
+ nullable: false,
14
+ fields: vec![
15
+ SchemaNode::Primitive {
16
+ name: "date32".to_string(),
17
+ primitive_type: PrimitiveType::Date32,
18
+ nullable: false,
19
+ format: None,
20
+ },
21
+ SchemaNode::Primitive {
22
+ name: "date64".to_string(),
23
+ primitive_type: PrimitiveType::Date64,
24
+ nullable: false,
25
+ format: None,
26
+ },
27
+ SchemaNode::Primitive {
28
+ name: "date32_nullable".to_string(),
29
+ primitive_type: PrimitiveType::Date32,
30
+ nullable: true,
31
+ format: None,
32
+ },
33
+ ],
34
+ })
35
+ .build()
36
+ .unwrap();
37
+
38
+ let epoch_date32 = 0; // 1970-01-01
39
+ let epoch_date64 = 0; // 1970-01-01
40
+ let today_date32 = 19000; // ~2022
41
+ let today_date64 = 19000 * 86400 * 1000; // Same day in milliseconds
42
+
43
+ let rows = vec![
44
+ vec![
45
+ ParquetValue::Date32(epoch_date32),
46
+ ParquetValue::Date64(epoch_date64),
47
+ ParquetValue::Date32(epoch_date32),
48
+ ],
49
+ vec![
50
+ ParquetValue::Date32(today_date32),
51
+ ParquetValue::Date64(today_date64),
52
+ ParquetValue::Date32(today_date32),
53
+ ],
54
+ vec![
55
+ ParquetValue::Date32(-365), // One year before epoch
56
+ ParquetValue::Date64(-365 * 86400 * 1000), // Same in milliseconds
57
+ ParquetValue::Null,
58
+ ],
59
+ ];
60
+
61
+ // Use test helper for roundtrip
62
+ test_roundtrip(rows, schema).unwrap();
63
+ }
64
+
65
+ #[test]
66
+ fn test_timestamp_types() {
67
+ let schema = SchemaBuilder::new()
68
+ .with_root(SchemaNode::Struct {
69
+ name: "root".to_string(),
70
+ nullable: false,
71
+ fields: vec![
72
+ SchemaNode::Primitive {
73
+ name: "ts_millis".to_string(),
74
+ primitive_type: PrimitiveType::TimestampMillis(None),
75
+ nullable: false,
76
+ format: None,
77
+ },
78
+ SchemaNode::Primitive {
79
+ name: "ts_micros".to_string(),
80
+ primitive_type: PrimitiveType::TimestampMicros(None),
81
+ nullable: false,
82
+ format: None,
83
+ },
84
+ SchemaNode::Primitive {
85
+ name: "ts_millis_tz".to_string(),
86
+ primitive_type: PrimitiveType::TimestampMillis(Some(Arc::from(
87
+ "America/New_York",
88
+ ))),
89
+ nullable: false,
90
+ format: None,
91
+ },
92
+ SchemaNode::Primitive {
93
+ name: "ts_micros_tz".to_string(),
94
+ primitive_type: PrimitiveType::TimestampMicros(Some(Arc::from(
95
+ "America/New_York",
96
+ ))),
97
+ nullable: false,
98
+ format: None,
99
+ },
100
+ ],
101
+ })
102
+ .build()
103
+ .unwrap();
104
+
105
+ // Test various timestamp values
106
+ let epoch_millis = 0;
107
+ let epoch_micros = 0;
108
+ let now_millis = 1_700_000_000_000; // Approximate timestamp for 2023
109
+ let now_micros = now_millis * 1000;
110
+ let tz = Some(Arc::from("America/New_York"));
111
+
112
+ let rows = vec![
113
+ vec![
114
+ ParquetValue::TimestampMillis(epoch_millis, None),
115
+ ParquetValue::TimestampMicros(epoch_micros, None),
116
+ ParquetValue::TimestampMillis(epoch_millis, tz.clone()),
117
+ ParquetValue::TimestampMicros(epoch_micros, tz.clone()),
118
+ ],
119
+ vec![
120
+ ParquetValue::TimestampMillis(now_millis, None),
121
+ ParquetValue::TimestampMicros(now_micros, None),
122
+ ParquetValue::TimestampMillis(now_millis, tz.clone()),
123
+ ParquetValue::TimestampMicros(now_micros, tz.clone()),
124
+ ],
125
+ vec![
126
+ ParquetValue::TimestampMillis(-86400000, None), // One day before epoch
127
+ ParquetValue::TimestampMicros(-86400000000, None),
128
+ ParquetValue::TimestampMillis(-86400000, Some(Arc::from("UTC"))),
129
+ ParquetValue::TimestampMicros(-86400000000, Some(Arc::from("UTC"))),
130
+ ],
131
+ ];
132
+
133
+ let mut buffer = Vec::new();
134
+ {
135
+ let mut writer = Writer::new(&mut buffer, schema).unwrap();
136
+ writer.write_rows(rows.clone()).unwrap();
137
+ writer.close().unwrap();
138
+ }
139
+
140
+ // Read back and verify
141
+ let bytes = Bytes::from(buffer);
142
+ let reader = Reader::new(bytes);
143
+
144
+ let read_rows: Vec<_> = reader
145
+ .read_rows()
146
+ .unwrap()
147
+ .collect::<Result<Vec<_>>>()
148
+ .unwrap();
149
+
150
+ assert_eq!(read_rows.len(), rows.len());
151
+
152
+ // Verify the timestamps match, accounting for the fact that field timezone overrides value timezone
153
+ for (row_idx, (expected_row, actual_row)) in rows.iter().zip(read_rows.iter()).enumerate() {
154
+ assert_eq!(expected_row.len(), actual_row.len());
155
+ for (col_idx, (expected_val, actual_val)) in
156
+ expected_row.iter().zip(actual_row.iter()).enumerate()
157
+ {
158
+ match (expected_val, actual_val) {
159
+ (
160
+ ParquetValue::TimestampMillis(e_ts, e_tz),
161
+ ParquetValue::TimestampMillis(a_ts, a_tz),
162
+ ) => {
163
+ assert_eq!(
164
+ e_ts, a_ts,
165
+ "Timestamp value mismatch at row {}, col {}",
166
+ row_idx, col_idx
167
+ );
168
+ // For columns with timezone in schema (col 2 and 3), the schema timezone wins
169
+ if col_idx >= 2 {
170
+ assert_eq!(
171
+ a_tz.as_deref(),
172
+ Some("UTC"),
173
+ "Timezone mismatch at row {}, col {}",
174
+ row_idx,
175
+ col_idx
176
+ );
177
+ } else {
178
+ assert_eq!(
179
+ e_tz, a_tz,
180
+ "Timezone mismatch at row {}, col {}",
181
+ row_idx, col_idx
182
+ );
183
+ }
184
+ }
185
+ (
186
+ ParquetValue::TimestampMicros(e_ts, e_tz),
187
+ ParquetValue::TimestampMicros(a_ts, a_tz),
188
+ ) => {
189
+ assert_eq!(
190
+ e_ts, a_ts,
191
+ "Timestamp value mismatch at row {}, col {}",
192
+ row_idx, col_idx
193
+ );
194
+ // For columns with timezone in schema (col 2 and 3), the schema timezone wins
195
+ if col_idx >= 2 {
196
+ assert_eq!(
197
+ a_tz.as_deref(),
198
+ Some("UTC"),
199
+ "Timezone mismatch at row {}, col {}",
200
+ row_idx,
201
+ col_idx
202
+ );
203
+ } else {
204
+ assert_eq!(
205
+ e_tz, a_tz,
206
+ "Timezone mismatch at row {}, col {}",
207
+ row_idx, col_idx
208
+ );
209
+ }
210
+ }
211
+ _ => panic!("Unexpected value types at row {}, col {}", row_idx, col_idx),
212
+ }
213
+ }
214
+ }
215
+ }
216
+
217
+ #[test]
218
+ fn test_time_types() {
219
+ let schema = SchemaBuilder::new()
220
+ .with_root(SchemaNode::Struct {
221
+ name: "root".to_string(),
222
+ nullable: false,
223
+ fields: vec![
224
+ SchemaNode::Primitive {
225
+ name: "time_millis".to_string(),
226
+ primitive_type: PrimitiveType::TimeMillis,
227
+ nullable: false,
228
+ format: None,
229
+ },
230
+ SchemaNode::Primitive {
231
+ name: "time_micros".to_string(),
232
+ primitive_type: PrimitiveType::TimeMicros,
233
+ nullable: false,
234
+ format: None,
235
+ },
236
+ SchemaNode::Primitive {
237
+ name: "time_millis_nullable".to_string(),
238
+ primitive_type: PrimitiveType::TimeMillis,
239
+ nullable: true,
240
+ format: None,
241
+ },
242
+ ],
243
+ })
244
+ .build()
245
+ .unwrap();
246
+
247
+ // Time values (milliseconds/microseconds since midnight)
248
+ let midnight = 0;
249
+ let noon_millis = 12 * 60 * 60 * 1000; // 12:00:00
250
+ let noon_micros = noon_millis as i64 * 1000;
251
+ let end_of_day_millis = 23 * 60 * 60 * 1000 + 59 * 60 * 1000 + 59 * 1000 + 999; // 23:59:59.999
252
+ let end_of_day_micros = end_of_day_millis as i64 * 1000 + 999; // 23:59:59.999999
253
+
254
+ let rows = vec![
255
+ vec![
256
+ ParquetValue::TimeMillis(midnight),
257
+ ParquetValue::TimeMicros(midnight as i64),
258
+ ParquetValue::TimeMillis(midnight),
259
+ ],
260
+ vec![
261
+ ParquetValue::TimeMillis(noon_millis),
262
+ ParquetValue::TimeMicros(noon_micros),
263
+ ParquetValue::TimeMillis(noon_millis),
264
+ ],
265
+ vec![
266
+ ParquetValue::TimeMillis(end_of_day_millis),
267
+ ParquetValue::TimeMicros(end_of_day_micros),
268
+ ParquetValue::Null,
269
+ ],
270
+ ];
271
+
272
+ // Use test helper for roundtrip
273
+ test_roundtrip(rows, schema).unwrap();
274
+ }
275
+
276
+ #[test]
277
+ fn test_temporal_types_in_collections() {
278
+ // Test temporal types within lists and maps
279
+ let schema = SchemaBuilder::new()
280
+ .with_root(SchemaNode::Struct {
281
+ name: "root".to_string(),
282
+ nullable: false,
283
+ fields: vec![
284
+ SchemaNode::List {
285
+ name: "timestamp_list".to_string(),
286
+ nullable: false,
287
+ item: Box::new(SchemaNode::Primitive {
288
+ name: "item".to_string(),
289
+ primitive_type: PrimitiveType::TimestampMillis(None),
290
+ nullable: false,
291
+ format: None,
292
+ }),
293
+ },
294
+ SchemaNode::Map {
295
+ name: "date_map".to_string(),
296
+ nullable: false,
297
+ key: Box::new(SchemaNode::Primitive {
298
+ name: "key".to_string(),
299
+ primitive_type: PrimitiveType::String,
300
+ nullable: false,
301
+ format: None,
302
+ }),
303
+ value: Box::new(SchemaNode::Primitive {
304
+ name: "value".to_string(),
305
+ primitive_type: PrimitiveType::Date32,
306
+ nullable: true,
307
+ format: None,
308
+ }),
309
+ },
310
+ ],
311
+ })
312
+ .build()
313
+ .unwrap();
314
+
315
+ let rows = vec![
316
+ vec![
317
+ ParquetValue::List(vec![
318
+ ParquetValue::TimestampMillis(1000000000000, None),
319
+ ParquetValue::TimestampMillis(1100000000000, None),
320
+ ParquetValue::TimestampMillis(1200000000000, None),
321
+ ]),
322
+ ParquetValue::Map(vec![
323
+ (
324
+ ParquetValue::String(Arc::from("start_date")),
325
+ ParquetValue::Date32(18000),
326
+ ),
327
+ (
328
+ ParquetValue::String(Arc::from("end_date")),
329
+ ParquetValue::Date32(18365),
330
+ ),
331
+ (
332
+ ParquetValue::String(Arc::from("milestone")),
333
+ ParquetValue::Null,
334
+ ),
335
+ ]),
336
+ ],
337
+ vec![ParquetValue::List(vec![]), ParquetValue::Map(vec![])],
338
+ ];
339
+
340
+ // Use test helper for roundtrip
341
+ test_roundtrip(rows, schema).unwrap();
342
+ }
343
+
344
+ #[test]
345
+ fn test_temporal_edge_cases() {
346
+ // Comprehensive test for edge cases of all temporal types
347
+ let schema = SchemaBuilder::new()
348
+ .with_root(SchemaNode::Struct {
349
+ name: "root".to_string(),
350
+ nullable: false,
351
+ fields: vec![
352
+ // All timestamp types
353
+ SchemaNode::Primitive {
354
+ name: "ts_sec".to_string(),
355
+ primitive_type: PrimitiveType::TimestampSecond(None),
356
+ nullable: true,
357
+ format: None,
358
+ },
359
+ SchemaNode::Primitive {
360
+ name: "ts_millis".to_string(),
361
+ primitive_type: PrimitiveType::TimestampMillis(None),
362
+ nullable: true,
363
+ format: None,
364
+ },
365
+ SchemaNode::Primitive {
366
+ name: "ts_micros".to_string(),
367
+ primitive_type: PrimitiveType::TimestampMicros(None),
368
+ nullable: true,
369
+ format: None,
370
+ },
371
+ SchemaNode::Primitive {
372
+ name: "ts_nanos".to_string(),
373
+ primitive_type: PrimitiveType::TimestampNanos(None),
374
+ nullable: true,
375
+ format: None,
376
+ },
377
+ // Date types
378
+ SchemaNode::Primitive {
379
+ name: "date32".to_string(),
380
+ primitive_type: PrimitiveType::Date32,
381
+ nullable: true,
382
+ format: None,
383
+ },
384
+ SchemaNode::Primitive {
385
+ name: "date64".to_string(),
386
+ primitive_type: PrimitiveType::Date64,
387
+ nullable: true,
388
+ format: None,
389
+ },
390
+ // Time types
391
+ SchemaNode::Primitive {
392
+ name: "time_millis".to_string(),
393
+ primitive_type: PrimitiveType::TimeMillis,
394
+ nullable: true,
395
+ format: None,
396
+ },
397
+ SchemaNode::Primitive {
398
+ name: "time_micros".to_string(),
399
+ primitive_type: PrimitiveType::TimeMicros,
400
+ nullable: true,
401
+ format: None,
402
+ },
403
+ ],
404
+ })
405
+ .build()
406
+ .unwrap();
407
+
408
+ let rows = vec![
409
+ // Minimum values
410
+ vec![
411
+ ParquetValue::TimestampSecond(i64::MIN, None),
412
+ ParquetValue::TimestampMillis(i64::MIN, None),
413
+ ParquetValue::TimestampMicros(i64::MIN, None),
414
+ ParquetValue::TimestampNanos(i64::MIN, None),
415
+ ParquetValue::Date32(i32::MIN),
416
+ ParquetValue::Date64(i64::MIN),
417
+ ParquetValue::TimeMillis(0), // Time can't be negative
418
+ ParquetValue::TimeMicros(0),
419
+ ],
420
+ // Maximum values
421
+ vec![
422
+ ParquetValue::TimestampSecond(i64::MAX, None),
423
+ ParquetValue::TimestampMillis(i64::MAX, None),
424
+ ParquetValue::TimestampMicros(i64::MAX, None),
425
+ ParquetValue::TimestampNanos(i64::MAX, None),
426
+ ParquetValue::Date32(i32::MAX),
427
+ ParquetValue::Date64(i64::MAX),
428
+ ParquetValue::TimeMillis(86399999), // 23:59:59.999
429
+ ParquetValue::TimeMicros(86399999999), // 23:59:59.999999
430
+ ],
431
+ // Zero values (Unix epoch / midnight)
432
+ vec![
433
+ ParquetValue::TimestampSecond(0, None),
434
+ ParquetValue::TimestampMillis(0, None),
435
+ ParquetValue::TimestampMicros(0, None),
436
+ ParquetValue::TimestampNanos(0, None),
437
+ ParquetValue::Date32(0),
438
+ ParquetValue::Date64(0),
439
+ ParquetValue::TimeMillis(0),
440
+ ParquetValue::TimeMicros(0),
441
+ ],
442
+ // Common timestamp (2025-01-01 00:00:00 UTC)
443
+ vec![
444
+ ParquetValue::TimestampSecond(1735689600, None),
445
+ ParquetValue::TimestampMillis(1735689600000, None),
446
+ ParquetValue::TimestampMicros(1735689600000000, None),
447
+ ParquetValue::TimestampNanos(1735689600000000000, None),
448
+ ParquetValue::Date32(19723), // Days since Unix epoch
449
+ ParquetValue::Date64(1735689600000), // Milliseconds since Unix epoch
450
+ ParquetValue::TimeMillis(0), // Midnight
451
+ ParquetValue::TimeMicros(0),
452
+ ],
453
+ // All nulls
454
+ vec![
455
+ ParquetValue::Null,
456
+ ParquetValue::Null,
457
+ ParquetValue::Null,
458
+ ParquetValue::Null,
459
+ ParquetValue::Null,
460
+ ParquetValue::Null,
461
+ ParquetValue::Null,
462
+ ParquetValue::Null,
463
+ ],
464
+ ];
465
+
466
+ let mut buffer = Vec::new();
467
+ {
468
+ let mut writer = Writer::new(&mut buffer, schema).unwrap();
469
+ writer.write_rows(rows.clone()).unwrap();
470
+ writer.close().unwrap();
471
+ }
472
+
473
+ // Read back and verify
474
+ let bytes = Bytes::from(buffer);
475
+ let reader = Reader::new(bytes);
476
+
477
+ let read_rows: Vec<_> = reader
478
+ .read_rows()
479
+ .unwrap()
480
+ .collect::<Result<Vec<_>>>()
481
+ .unwrap();
482
+
483
+ assert_eq!(read_rows.len(), rows.len());
484
+
485
+ // Verify values match exactly
486
+ for (i, (expected_row, actual_row)) in rows.iter().zip(read_rows.iter()).enumerate() {
487
+ for (j, (expected, actual)) in expected_row.iter().zip(actual_row.iter()).enumerate() {
488
+ match (expected, actual) {
489
+ (
490
+ ParquetValue::TimestampSecond(e_val, _),
491
+ ParquetValue::TimestampSecond(a_val, _),
492
+ ) => {
493
+ assert_eq!(e_val, a_val, "Row {} col {}: timestamp values differ", i, j);
494
+ }
495
+ (
496
+ ParquetValue::TimestampMillis(e_val, _),
497
+ ParquetValue::TimestampMillis(a_val, _),
498
+ ) => {
499
+ assert_eq!(e_val, a_val, "Row {} col {}: timestamp values differ", i, j);
500
+ }
501
+ (
502
+ ParquetValue::TimestampMicros(e_val, _),
503
+ ParquetValue::TimestampMicros(a_val, _),
504
+ ) => {
505
+ assert_eq!(e_val, a_val, "Row {} col {}: timestamp values differ", i, j);
506
+ }
507
+ (
508
+ ParquetValue::TimestampNanos(e_val, _),
509
+ ParquetValue::TimestampNanos(a_val, _),
510
+ ) => {
511
+ assert_eq!(e_val, a_val, "Row {} col {}: timestamp values differ", i, j);
512
+ }
513
+ (ParquetValue::Null, ParquetValue::Null) => {} // Both null is ok
514
+ _ => assert_eq!(expected, actual, "Row {} col {}: values differ", i, j),
515
+ }
516
+ }
517
+ }
518
+ }
@@ -0,0 +1,132 @@
1
+ use bytes::Bytes;
2
+ use parquet::basic::Compression;
3
+ use parquet::file::properties::WriterProperties;
4
+ use parquet_core::*;
5
+ use std::sync::Arc;
6
+
7
+ /// Create a test schema with common field types
8
+ pub fn create_test_schema() -> Schema {
9
+ SchemaBuilder::new()
10
+ .with_root(SchemaNode::Struct {
11
+ name: "root".to_string(),
12
+ nullable: false,
13
+ fields: vec![
14
+ SchemaNode::Primitive {
15
+ name: "id".to_string(),
16
+ primitive_type: PrimitiveType::Int32,
17
+ nullable: false,
18
+ format: None,
19
+ },
20
+ SchemaNode::Primitive {
21
+ name: "name".to_string(),
22
+ primitive_type: PrimitiveType::String,
23
+ nullable: true,
24
+ format: None,
25
+ },
26
+ SchemaNode::Primitive {
27
+ name: "value".to_string(),
28
+ primitive_type: PrimitiveType::Float64,
29
+ nullable: true,
30
+ format: None,
31
+ },
32
+ SchemaNode::Primitive {
33
+ name: "active".to_string(),
34
+ primitive_type: PrimitiveType::Boolean,
35
+ nullable: false,
36
+ format: None,
37
+ },
38
+ ],
39
+ })
40
+ .build()
41
+ .unwrap()
42
+ }
43
+
44
+ /// Generate test rows with sequential data
45
+ pub fn generate_test_rows(count: usize) -> Vec<Vec<ParquetValue>> {
46
+ (0..count)
47
+ .map(|i| {
48
+ vec![
49
+ ParquetValue::Int32(i as i32),
50
+ ParquetValue::String(Arc::from(format!("name_{}", i))),
51
+ ParquetValue::Float64(ordered_float::OrderedFloat(i as f64 * 1.5)),
52
+ ParquetValue::Boolean(i % 2 == 0),
53
+ ]
54
+ })
55
+ .collect()
56
+ }
57
+
58
+ /// Perform a roundtrip test and verify data integrity
59
+ pub fn test_roundtrip(
60
+ rows: Vec<Vec<ParquetValue>>,
61
+ schema: Schema,
62
+ ) -> std::result::Result<(), Box<dyn std::error::Error>> {
63
+ test_roundtrip_with_options(rows, schema, Compression::UNCOMPRESSED, None)
64
+ }
65
+
66
+ /// Perform a roundtrip test with custom writer options
67
+ pub fn test_roundtrip_with_options(
68
+ rows: Vec<Vec<ParquetValue>>,
69
+ schema: Schema,
70
+ compression: Compression,
71
+ batch_size: Option<usize>,
72
+ ) -> std::result::Result<(), Box<dyn std::error::Error>> {
73
+ use tempfile::NamedTempFile;
74
+
75
+ let temp_file = NamedTempFile::new()?;
76
+ let file_path = temp_file.path().to_str().unwrap();
77
+
78
+ // Write
79
+ let mut buffer = Vec::new();
80
+ {
81
+ let mut builder = WriterBuilder::new();
82
+
83
+ if let Some(size) = batch_size {
84
+ builder = builder.with_batch_size(size);
85
+ }
86
+
87
+ let props = WriterProperties::builder()
88
+ .set_compression(compression)
89
+ .build();
90
+
91
+ let mut writer = if batch_size.is_some() {
92
+ builder.build(&mut buffer, schema.clone())?
93
+ } else {
94
+ Writer::new_with_properties(&mut buffer, schema.clone(), props)?
95
+ };
96
+
97
+ writer.write_rows(rows.clone())?;
98
+ writer.close()?;
99
+ }
100
+
101
+ // Write to file for persistence
102
+ std::fs::write(file_path, &buffer)?;
103
+
104
+ // Read back
105
+ let bytes = Bytes::from(buffer);
106
+ let reader = Reader::new(bytes);
107
+
108
+ let read_rows: Vec<Vec<ParquetValue>> = reader.read_rows()?.collect::<Result<Vec<_>>>()?;
109
+
110
+ // Verify
111
+ assert_eq!(rows.len(), read_rows.len(), "Row count mismatch");
112
+
113
+ for (i, (original, read)) in rows.iter().zip(read_rows.iter()).enumerate() {
114
+ assert_eq!(original, read, "Row {} mismatch", i);
115
+ }
116
+
117
+ Ok(())
118
+ }
119
+
120
+ #[cfg(test)]
121
+ mod tests {
122
+ use super::*;
123
+
124
+ #[test]
125
+ fn test_helpers_work() {
126
+ let schema = create_test_schema();
127
+ let rows = generate_test_rows(10);
128
+ assert_eq!(rows.len(), 10);
129
+
130
+ test_roundtrip(rows, schema).unwrap();
131
+ }
132
+ }