parquet 0.5.13 → 0.6.0

Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/tests/compression_tests.rs (new file)
@@ -0,0 +1,434 @@
+use bytes::Bytes;
+use parquet::basic::Compression;
+use parquet::file::properties::WriterProperties;
+use parquet_core::*;
+use std::sync::Arc;
+use std::time::Instant;
+
+#[test]
+fn test_compression_effectiveness() {
+    // Test compression ratios for different data patterns
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "repetitive".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "random".to_string(),
+                    primitive_type: PrimitiveType::Binary,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "sequential".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Create test data with different compressibility characteristics
+    let rows: Vec<Vec<ParquetValue>> = (0..1000)
+        .map(|i| {
+            vec![
+                // Highly repetitive data (should compress well)
+                ParquetValue::String(Arc::from("A".repeat(100))),
+                // Random data (should not compress well)
+                ParquetValue::Bytes(Bytes::from(
+                    (0..100)
+                        .map(|j| ((i * 31 + j * 17) % 256) as u8)
+                        .collect::<Vec<u8>>(),
+                )),
+                // Sequential data (should compress moderately)
+                ParquetValue::Int64(i as i64),
+            ]
+        })
+        .collect();
+
+    let compressions = vec![
+        ("UNCOMPRESSED", Compression::UNCOMPRESSED),
+        ("SNAPPY", Compression::SNAPPY),
+        ("GZIP", Compression::GZIP(Default::default())),
+        ("LZ4", Compression::LZ4),
+        ("ZSTD", Compression::ZSTD(Default::default())),
+    ];
+
+    let mut results = vec![];
+
+    for (name, compression) in compressions {
+        let mut buffer = Vec::new();
+
+        let start = Instant::now();
+        {
+            let props = WriterProperties::builder()
+                .set_compression(compression)
+                .build();
+
+            let mut writer =
+                Writer::new_with_properties(&mut buffer, schema.clone(), props).unwrap();
+            writer.write_rows(rows.clone()).unwrap();
+            writer.close().unwrap();
+        }
+        let write_time = start.elapsed();
+
+        let file_size = buffer.len();
+
+        // Test read performance
+        let bytes = Bytes::from(buffer);
+        let reader = Reader::new(bytes);
+
+        let start = Instant::now();
+        let read_count = reader.read_rows().unwrap().count();
+        let read_time = start.elapsed();
+
+        assert_eq!(read_count, 1000);
+
+        results.push((name, file_size, write_time, read_time));
+    }
+
+    // Print results
+    println!("\nCompression comparison:");
+    println!(
+        "{:<15} {:>12} {:>15} {:>15}",
+        "Compression", "Size (bytes)", "Write Time", "Read Time"
+    );
+    println!("{:-<60}", "");
+
+    let uncompressed_size = results[0].1;
+    for (name, size, write_time, read_time) in results {
+        let ratio = (uncompressed_size as f64 / size as f64 * 100.0) as u32;
+        println!(
+            "{:<15} {:>12} ({:>3}%) {:>15?} {:>15?}",
+            name, size, ratio, write_time, read_time
+        );
+    }
+}
+
+#[test]
+fn test_compression_with_nulls() {
+    // Test how null values affect compression
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "sparse_data".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "dense_data".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: true,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Create data with different null patterns
+    let sparse_rows: Vec<Vec<ParquetValue>> = (0..1000)
+        .map(|i| {
+            vec![
+                // 90% nulls
+                if i % 10 == 0 {
+                    ParquetValue::String(Arc::from(format!("Value {}", i)))
+                } else {
+                    ParquetValue::Null
+                },
+                // 10% nulls
+                if i % 10 == 0 {
+                    ParquetValue::Null
+                } else {
+                    ParquetValue::Int32(i)
+                },
+            ]
+        })
+        .collect();
+
+    let compressions = vec![
+        ("UNCOMPRESSED", Compression::UNCOMPRESSED),
+        ("SNAPPY", Compression::SNAPPY),
+        ("ZSTD", Compression::ZSTD(Default::default())),
+    ];
+
+    println!("\nNull compression comparison:");
+    for (name, compression) in compressions {
+        let mut buffer = Vec::new();
+        {
+            let props = WriterProperties::builder()
+                .set_compression(compression)
+                .build();
+
+            let mut writer =
+                Writer::new_with_properties(&mut buffer, schema.clone(), props).unwrap();
+            writer.write_rows(sparse_rows.clone()).unwrap();
+            writer.close().unwrap();
+        }
+
+        println!("{}: {} bytes", name, buffer.len());
+
+        // Verify nulls are preserved
+        let bytes = Bytes::from(buffer);
+        let reader = Reader::new(bytes);
+
+        let mut null_count = 0;
+        for row_result in reader.read_rows().unwrap() {
+            let row = row_result.unwrap();
+            for value in &row {
+                if matches!(value, ParquetValue::Null) {
+                    null_count += 1;
+                }
+            }
+        }
+
+        // Should have 900 + 100 = 1000 nulls total
+        assert_eq!(null_count, 1000);
+    }
+}
+
+#[test]
+fn test_compression_level_comparison() {
+    // Test different compression levels for GZIP and ZSTD
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "data".to_string(),
+                primitive_type: PrimitiveType::String,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    // Create moderately compressible data
+    let rows: Vec<Vec<ParquetValue>> = (0..1000)
+        .map(|i| {
+            vec![ParquetValue::String(Arc::from(format!(
+                "This is row number {} with some repeated text pattern pattern pattern",
+                i
+            )))]
+        })
+        .collect();
+
+    // Test various compression algorithms and their levels
+    let compression_configs = vec![
+        // GZIP levels
+        (
+            "GZIP_FAST",
+            Compression::GZIP(parquet::basic::GzipLevel::try_new(1).unwrap()),
+        ),
+        ("GZIP_DEFAULT", Compression::GZIP(Default::default())),
+        (
+            "GZIP_BEST",
+            Compression::GZIP(parquet::basic::GzipLevel::try_new(9).unwrap()),
+        ),
+        // ZSTD levels
+        (
+            "ZSTD_FAST",
+            Compression::ZSTD(parquet::basic::ZstdLevel::try_new(1).unwrap()),
+        ),
+        ("ZSTD_DEFAULT", Compression::ZSTD(Default::default())),
+        (
+            "ZSTD_BEST",
+            Compression::ZSTD(parquet::basic::ZstdLevel::try_new(10).unwrap()),
+        ),
+    ];
+
+    println!("\nCompression level comparison:");
+    println!(
+        "{:<15} {:>12} {:>15}",
+        "Compression", "Size (bytes)", "Time"
+    );
+    println!("{:-<45}", "");
+
+    let mut results = Vec::new();
+    for (name, compression) in compression_configs {
+        let mut buffer = Vec::new();
+        let start = Instant::now();
+        {
+            let props = WriterProperties::builder()
+                .set_compression(compression)
+                .build();
+
+            let mut writer =
+                Writer::new_with_properties(&mut buffer, schema.clone(), props).unwrap();
+            writer.write_rows(rows.clone()).unwrap();
+            writer.close().unwrap();
+        }
+        let duration = start.elapsed();
+
+        results.push((name, buffer.len(), duration));
+        println!("{:<15} {:>12} {:>15?}", name, buffer.len(), duration);
+
+        // Verify we can read the data back
+        let bytes = Bytes::from(buffer);
+        let reader = Reader::new(bytes);
+        let read_count = reader.read_rows().unwrap().count();
+        assert_eq!(
+            read_count, 1000,
+            "Failed to read back data compressed with {}",
+            name
+        );
+    }
+
+    // Basic validation that compression is working:
+    // GZIP_BEST and ZSTD_BEST should produce smaller files than their FAST counterparts
+    let gzip_fast_size = results
+        .iter()
+        .find(|(name, _, _)| *name == "GZIP_FAST")
+        .unwrap()
+        .1;
+    let gzip_best_size = results
+        .iter()
+        .find(|(name, _, _)| *name == "GZIP_BEST")
+        .unwrap()
+        .1;
+    assert!(
+        gzip_best_size <= gzip_fast_size,
+        "GZIP_BEST should produce smaller files than GZIP_FAST"
+    );
+
+    let zstd_fast_size = results
+        .iter()
+        .find(|(name, _, _)| *name == "ZSTD_FAST")
+        .unwrap()
+        .1;
+    let zstd_best_size = results
+        .iter()
+        .find(|(name, _, _)| *name == "ZSTD_BEST")
+        .unwrap()
+        .1;
+    assert!(
+        zstd_best_size <= zstd_fast_size,
+        "ZSTD_BEST should produce smaller files than ZSTD_FAST"
+    );
+}
+
+#[test]
+fn test_column_specific_compression() {
+    // Test applying different compression to different columns
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "highly_compressible".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "random_data".to_string(),
+                    primitive_type: PrimitiveType::Binary,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..500)
+        .map(|i| {
+            vec![
+                // Highly repetitive string
+                ParquetValue::String(Arc::from("AAAAAAAAAA".repeat(10))),
+                // Random binary data
+                ParquetValue::Bytes(Bytes::from(
+                    (0..100)
+                        .map(|j| ((i * 31 + j * 17) % 256) as u8)
+                        .collect::<Vec<u8>>(),
+                )),
+            ]
+        })
+        .collect();
+
+    // Write with default compression
+    let mut buffer = Vec::new();
+    {
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let mut writer = Writer::new_with_properties(&mut buffer, schema.clone(), props).unwrap();
+        writer.write_rows(rows.clone()).unwrap();
+        writer.close().unwrap();
+    }
+
+    let default_size = buffer.len();
+    println!("Default compression (SNAPPY): {} bytes", default_size);
+
+    // Ideally we'd set per-column compression, but if not supported,
+    // this test still validates the concept
+
+    // Verify data integrity
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    let read_count = reader.read_rows().unwrap().count();
+    assert_eq!(read_count, 500);
+}
+
+#[test]
+fn test_compression_via_writer_builder() {
+    let compressions = vec![
+        ("UNCOMPRESSED", Compression::UNCOMPRESSED),
+        ("SNAPPY", Compression::SNAPPY),
+        ("GZIP", Compression::GZIP(Default::default())),
+        ("ZSTD", Compression::ZSTD(Default::default())),
+        ("LZ4", Compression::LZ4),
+    ];
+
+    for (name, compression) in compressions {
+        let schema = SchemaBuilder::new()
+            .with_root(SchemaNode::Struct {
+                name: "root".to_string(),
+                nullable: false,
+                fields: vec![SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                }],
+            })
+            .build()
+            .unwrap();
+
+        let rows: Vec<Vec<ParquetValue>> =
+            (0..1000).map(|i| vec![ParquetValue::Int32(i)]).collect();
+
+        let mut buffer = Vec::new();
+        {
+            let mut writer = WriterBuilder::new()
+                .with_compression(compression)
+                .build(&mut buffer, schema)
+                .unwrap();
+
+            writer.write_rows(rows).unwrap();
+            writer.close().unwrap();
+        }
+
+        // Verify we can read it back
+        let bytes = Bytes::from(buffer);
+        let reader = Reader::new(bytes);
+        let read_count = reader.read_rows().unwrap().count();
+        assert_eq!(read_count, 1000, "Failed with compression: {}", name);
+    }
+}
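
Note: the comment in test_column_specific_compression admits that per-column codecs are never actually set; the test writes everything with SNAPPY. The underlying Rust parquet crate does expose per-column compression through WriterPropertiesBuilder::set_column_compression, so a minimal sketch of what that test could exercise — assuming parquet-core's Writer::new_with_properties forwards the properties unchanged, as the other tests here suggest — might look like:

    use parquet::basic::Compression;
    use parquet::file::properties::WriterProperties;
    use parquet::schema::types::ColumnPath;

    // Per-column codecs; the column paths mirror the leaf fields of the
    // schema built in test_column_specific_compression.
    let props = WriterProperties::builder()
        // Repetitive text compresses well, so spend codec effort on it
        .set_column_compression(
            ColumnPath::from("highly_compressible"),
            Compression::ZSTD(Default::default()),
        )
        // Pseudo-random bytes barely compress, so skip the codec work
        .set_column_compression(
            ColumnPath::from("random_data"),
            Compression::UNCOMPRESSED,
        )
        // Fallback for any column without an explicit override
        .set_compression(Compression::SNAPPY)
        .build();

Both calls are part of the parquet crate's public WriterProperties API; whether parquet-core's own WriterBuilder surfaces an equivalent per-column option is left open by this test.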