parquet 0.5.12 → 0.6.0

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
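Most of the churn above is a restructuring: the monolithic Rust extension under data/ext/parquet/src is deleted and replaced by two new crates, parquet-core (a Ruby-agnostic Parquet reader/writer built on the Rust parquet/Arrow crates) and parquet-ruby-adapter (the Ruby-facing glue). The two hunks reproduced below are new parquet-core test files; both call a test_roundtrip helper defined in tests/test_helpers.rs, which this diff does not display. As a reading aid, here is a minimal sketch of what such a helper plausibly looks like, assuming the Writer/Reader API the tests exercise; the Schema type name and the decoded row type (Vec<ParquetValue>) are guesses from the call sites, not confirmed by this diff:

use bytes::Bytes;
use parquet_core::*; // assumed to export Writer, Reader, Schema, ParquetValue, Result

// Hypothetical reconstruction of the `test_roundtrip` helper used by the
// tests below; the real implementation is in tests/test_helpers.rs.
fn test_roundtrip(rows: Vec<Vec<ParquetValue>>, schema: Schema) -> Result<()> {
    // Write every row into an in-memory buffer...
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema)?;
        writer.write_rows(rows.clone())?;
        writer.close()?;
    }

    // ...then read the buffer back and require an exact match.
    // Assumes rows decode back as Vec<ParquetValue>.
    let reader = Reader::new(Bytes::from(buffer));
    let read_back: Vec<Vec<ParquetValue>> = reader.read_rows()?.collect::<Result<Vec<_>>>()?;
    assert_eq!(rows, read_back);
    Ok(())
}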
data/ext/parquet-core/tests/null_handling_tests.rs
@@ -0,0 +1,430 @@
+use bytes::Bytes;
+use indexmap::IndexMap;
+use parquet_core::*;
+use std::sync::Arc;
+
+mod test_helpers;
+use test_helpers::*;
+
+#[test]
+fn test_null_handling_all_types() {
+    // Test null handling for all nullable primitive types
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "bool_field".to_string(),
+                    primitive_type: PrimitiveType::Boolean,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "int32_field".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "int64_field".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "float32_field".to_string(),
+                    primitive_type: PrimitiveType::Float32,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "float64_field".to_string(),
+                    primitive_type: PrimitiveType::Float64,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "string_field".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "binary_field".to_string(),
+                    primitive_type: PrimitiveType::Binary,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "decimal128_field".to_string(),
+                    primitive_type: PrimitiveType::Decimal128(10, 2),
+                    nullable: true,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Test 1: All values present
+    let all_present = vec![
+        ParquetValue::Boolean(true),
+        ParquetValue::Int32(42),
+        ParquetValue::Int64(12345),
+        ParquetValue::Float32(ordered_float::OrderedFloat(3.14)),
+        ParquetValue::Float64(ordered_float::OrderedFloat(2.718)),
+        ParquetValue::String(Arc::from("test")),
+        ParquetValue::Bytes(Bytes::from(vec![1, 2, 3, 4])),
+        ParquetValue::Decimal128(12345, 2),
+    ];
+
+    // Test 2: All nulls
+    let all_nulls: Vec<ParquetValue> = (0..8).map(|_| ParquetValue::Null).collect();
+
+    // Test 3: Mixed nulls and values - alternating pattern
+    let mixed_alternating = vec![
+        ParquetValue::Boolean(true),
+        ParquetValue::Null,
+        ParquetValue::Int64(12345),
+        ParquetValue::Null,
+        ParquetValue::Float64(ordered_float::OrderedFloat(2.718)),
+        ParquetValue::Null,
+        ParquetValue::Bytes(Bytes::from(vec![1, 2, 3, 4])),
+        ParquetValue::Null,
+    ];
+
+    // Test 4: Mixed nulls and values - sparse pattern (mostly nulls)
+    let mixed_sparse = vec![
+        ParquetValue::Null,
+        ParquetValue::Null,
+        ParquetValue::Int64(12345),
+        ParquetValue::Null,
+        ParquetValue::Null,
+        ParquetValue::Null,
+        ParquetValue::Null,
+        ParquetValue::Decimal128(12345, 2),
+    ];
+
+    let test_rows = [
+        vec![all_present.clone()],
+        vec![all_nulls.clone()],
+        vec![mixed_alternating.clone()],
+        vec![mixed_sparse.clone()],
+        // Add multiple rows to test null patterns
+        (0..10)
+            .map(|i| {
+                if i % 3 == 0 {
+                    all_nulls.clone()
+                } else if i % 2 == 0 {
+                    mixed_alternating.clone()
+                } else {
+                    all_present.clone()
+                }
+            })
+            .collect::<Vec<_>>(),
+    ]
+    .concat();
+
+    // Use test helper for roundtrip
+    test_roundtrip(test_rows, schema).unwrap();
+}
+
+#[test]
+fn test_all_null_column() {
+    // Test handling of columns where all values are null
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "optional".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..100)
+        .map(|i| vec![ParquetValue::Int32(i), ParquetValue::Null])
+        .collect();
+
+    // Use test helper for roundtrip
+    test_roundtrip(rows, schema).unwrap();
+}
+
+#[test]
+fn test_null_patterns() {
+    let patterns: Vec<(&str, Box<dyn Fn(usize) -> bool>)> = vec![
+        ("alternating", Box::new(|i: usize| i % 2 == 0)),
+        ("sparse_90_percent", Box::new(|i: usize| i % 10 != 0)),
+        ("dense_10_percent", Box::new(|i: usize| i % 10 == 0)),
+        ("first_half", Box::new(|i: usize| i < 500)),
+        ("last_half", Box::new(|i: usize| i >= 500)),
+        ("blocks_of_10", Box::new(|i: usize| (i / 10) % 2 == 0)),
+    ];
+
+    for (pattern_name, is_null) in patterns {
+        // Test various null distribution patterns
+        let schema = SchemaBuilder::new()
+            .with_root(SchemaNode::Struct {
+                name: "root".to_string(),
+                nullable: false,
+                fields: vec![
+                    SchemaNode::Primitive {
+                        name: "id".to_string(),
+                        primitive_type: PrimitiveType::Int32,
+                        nullable: false,
+                        format: None,
+                    },
+                    SchemaNode::Primitive {
+                        name: "value".to_string(),
+                        primitive_type: PrimitiveType::String,
+                        nullable: true,
+                        format: None,
+                    },
+                ],
+            })
+            .build()
+            .unwrap();
+
+        let rows: Vec<Vec<ParquetValue>> = (0..1000)
+            .map(|i| {
+                vec![
+                    ParquetValue::Int32(i as i32),
+                    if is_null(i) {
+                        ParquetValue::Null
+                    } else {
+                        ParquetValue::String(Arc::from(format!("value_{}", i)))
+                    },
+                ]
+            })
+            .collect();
+
+        // Count nulls for verification
+        let null_count = rows
+            .iter()
+            .filter(|row| matches!(row[1], ParquetValue::Null))
+            .count();
+        println!("{} pattern - nulls: {}/1000", pattern_name, null_count);
+
+        // Use test helper for roundtrip
+        test_roundtrip(rows, schema).unwrap();
+    }
+}
+
+#[test]
+fn test_deeply_nested_nulls() {
+    // Test nulls at various levels of nesting
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Struct {
+                    name: "nested".to_string(),
+                    nullable: true,
+                    fields: vec![
+                        SchemaNode::Primitive {
+                            name: "value".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: true,
+                            format: None,
+                        },
+                        SchemaNode::List {
+                            name: "items".to_string(),
+                            nullable: true,
+                            item: Box::new(SchemaNode::Primitive {
+                                name: "item".to_string(),
+                                primitive_type: PrimitiveType::Int32,
+                                nullable: true,
+                                format: None,
+                            }),
+                        },
+                    ],
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows = vec![
+        // Entire struct is null
+        vec![ParquetValue::Int32(1), ParquetValue::Null],
+        // Struct with null value and null list
+        vec![
+            ParquetValue::Int32(2),
+            ParquetValue::Record({
+                let mut map = IndexMap::new();
+                map.insert(Arc::from("value"), ParquetValue::Null);
+                map.insert(Arc::from("items"), ParquetValue::Null);
+                map
+            }),
+        ],
+        // Struct with value and list containing nulls
+        vec![
+            ParquetValue::Int32(3),
+            ParquetValue::Record({
+                let mut map = IndexMap::new();
+                map.insert(Arc::from("value"), ParquetValue::String(Arc::from("test")));
+                map.insert(
+                    Arc::from("items"),
+                    ParquetValue::List(vec![
+                        ParquetValue::Int32(1),
+                        ParquetValue::Null,
+                        ParquetValue::Int32(3),
+                    ]),
+                );
+                map
+            }),
+        ],
+        // Struct with null value and empty list
+        vec![
+            ParquetValue::Int32(4),
+            ParquetValue::Record({
+                let mut map = IndexMap::new();
+                map.insert(Arc::from("value"), ParquetValue::Null);
+                map.insert(Arc::from("items"), ParquetValue::List(vec![]));
+                map
+            }),
+        ],
+    ];
+
+    // Use test helper for roundtrip
+    test_roundtrip(rows, schema).unwrap();
+}
+
+#[test]
+fn test_null_across_row_groups() {
+    // Test null handling when nulls span multiple row groups
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Create rows where entire row groups might be null
+    // Assuming default row group size, create patterns that span groups
+    let rows: Vec<Vec<ParquetValue>> = (0..10000)
+        .map(|i| {
+            vec![
+                ParquetValue::Int64(i),
+                // First 5000 rows: all null
+                // Next 2500 rows: all values
+                // Last 2500 rows: alternating
+                if i < 5000 {
+                    ParquetValue::Null
+                } else if i < 7500 {
+                    ParquetValue::String(Arc::from(format!("value_{}", i)))
+                } else if i % 2 == 0 {
+                    ParquetValue::Null
+                } else {
+                    ParquetValue::String(Arc::from(format!("value_{}", i)))
+                },
+            ]
+        })
+        .collect();
+
+    // Use test helper with specific batch size to control row groups
+    let result = test_roundtrip_with_options(
+        rows.clone(),
+        schema,
+        parquet::basic::Compression::UNCOMPRESSED,
+        Some(1000), // Force smaller row groups
+    );
+
+    assert!(result.is_ok());
+
+    // Additional verification - ensure the null pattern is preserved
+    let null_count = rows
+        .iter()
+        .filter(|row| matches!(row[1], ParquetValue::Null))
+        .count();
+    assert_eq!(null_count, 6250); // 5000 + 1250 = 6250 nulls
+}
+
+#[test]
+fn test_sparse_columns_with_compression() {
+    // Test compression effectiveness on sparse columns (95% null)
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "sparse_data".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..10000)
+        .map(|i| {
+            vec![
+                ParquetValue::Int32(i),
+                if i % 20 == 0 {
+                    ParquetValue::String(Arc::from(format!("rare_value_{}", i)))
+                } else {
+                    ParquetValue::Null
+                },
+            ]
+        })
+        .collect();
+
+    use parquet::basic::Compression;
+
+    let compressions = vec![
+        ("UNCOMPRESSED", Compression::UNCOMPRESSED),
+        ("SNAPPY", Compression::SNAPPY),
+        ("ZSTD", Compression::ZSTD(Default::default())),
+    ];
+
+    for (name, compression) in compressions {
+        let result = test_roundtrip_with_options(rows.clone(), schema.clone(), compression, None);
+
+        assert!(result.is_ok(), "Failed with {}: {:?}", name, result);
+        println!("Sparse column (95% null) with {} succeeded", name);
+    }
+}
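The row-group and compression tests above also call test_roundtrip_with_options, again from the undisplayed tests/test_helpers.rs, passing a compression codec and an optional batch size. A sketch of its plausible shape, inferred only from those call sites; how the codec reaches the underlying writer is not visible in this diff, so the sketch leaves it unused and only demonstrates the batched writing assumed to bound row-group size:

use bytes::Bytes;
use parquet::basic::Compression;
use parquet_core::*;

// Hypothetical sketch inferred from the call sites above. `_compression`
// would be wired into the writer's properties in the real helper; that
// plumbing is not shown in this diff, so it is left unused here.
fn test_roundtrip_with_options(
    rows: Vec<Vec<ParquetValue>>,
    schema: Schema,
    _compression: Compression,
    batch_size: Option<usize>,
) -> Result<()> {
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema)?;
        // Assumption: each write_rows call flushes at most one row group,
        // so Some(1000) in test_null_across_row_groups forces small groups.
        let batch = batch_size.unwrap_or(rows.len().max(1));
        for chunk in rows.chunks(batch) {
            writer.write_rows(chunk.to_vec())?;
        }
        writer.close()?;
    }

    let reader = Reader::new(Bytes::from(buffer));
    let read_back: Vec<Vec<ParquetValue>> = reader.read_rows()?.collect::<Result<Vec<_>>>()?;
    assert_eq!(rows, read_back);
    Ok(())
}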
data/ext/parquet-core/tests/performance_memory.rs
@@ -0,0 +1,181 @@
+use bytes::Bytes;
+use parquet_core::*;
+use std::time::Instant;
+
+#[test]
+fn test_iterator_early_termination() {
+    // Test that dropping an iterator early doesn't cause issues
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::Int32,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..100).map(|i| vec![ParquetValue::Int32(i)]).collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes.clone());
+
+    // Only read first 10 rows then drop iterator
+    let mut count = 0;
+    for row_result in reader.read_rows().unwrap() {
+        let _row = row_result.unwrap();
+        count += 1;
+        if count >= 10 {
+            break;
+        }
+    }
+
+    assert_eq!(count, 10);
+
+    // Ensure we can create a new iterator after dropping the previous one
+    let reader2 = Reader::new(bytes);
+    let all_rows: Vec<_> = reader2
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(all_rows.len(), 100);
+}
+
+#[test]
+fn test_performance_different_batch_sizes() {
+    // Test performance characteristics with different batch sizes
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::Int64,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let total_rows = 10000;
+    let batch_sizes = vec![1, 10, 100, 1000, 10000];
+
+    for batch_size in batch_sizes {
+        let mut buffer = Vec::new();
+
+        let write_start = Instant::now();
+        {
+            let mut writer = Writer::new(&mut buffer, schema.clone()).unwrap();
+
+            for batch_start in (0..total_rows).step_by(batch_size) {
+                let batch_end = (batch_start + batch_size).min(total_rows);
+                let rows: Vec<Vec<ParquetValue>> = (batch_start..batch_end)
+                    .map(|i| vec![ParquetValue::Int64(i as i64)])
+                    .collect();
+
+                writer.write_rows(rows).unwrap();
+            }
+
+            writer.close().unwrap();
+        }
+        let write_duration = write_start.elapsed();
+
+        // Read back
+        let bytes = Bytes::from(buffer);
+        let reader = Reader::new(bytes);
+
+        let read_start = Instant::now();
+        let count = reader.read_rows().unwrap().count();
+        let read_duration = read_start.elapsed();
+
+        assert_eq!(count, total_rows);
+
+        println!(
+            "Batch size {}: Write {:?}, Read {:?}",
+            batch_size, write_duration, read_duration
+        );
+    }
+}
+
+#[test]
+fn test_string_interning_efficiency() {
+    // Test efficiency when writing many repeated strings
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "category".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let categories = ["A", "B", "C", "D", "E"];
+    let rows: Vec<Vec<ParquetValue>> = (0..10000)
+        .map(|i| {
+            vec![
+                ParquetValue::String(categories[i % categories.len()].into()),
+                ParquetValue::Int32(i as i32),
+            ]
+        })
+        .collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    // The file should be efficiently encoded due to repeated strings
+    let file_size = buffer.len();
+    println!("File size with repeated strings: {} bytes", file_size);
+
+    // Verify we can read it back correctly
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    let read_rows: Vec<_> = reader
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_rows.len(), 10000);
+
+    // Verify the pattern
+    for (i, row) in read_rows.iter().enumerate() {
+        match &row[0] {
+            ParquetValue::String(s) => {
+                assert_eq!(*s, categories[i % categories.len()].into());
+            }
+            _ => panic!("Expected string value"),
+        }
+    }
+}