parquet 0.5.13 → 0.6.0

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
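The listing reflects a restructuring of the native extension: the monolithic sources under data/ext/parquet/src are removed, and two new crates take their place. A rough layout implied by the paths above (the role descriptions are inferred from file names, not taken from the crates' own documentation):

    data/ext/parquet/              extension crate, now a thin FFI layer (adapter_ffi.rs)
    data/ext/parquet-core/         new: Ruby-agnostic Parquet reader, writer, schema, and value types
    data/ext/parquet-ruby-adapter/ new: glue between Ruby and parquet-core (converter, io, metadata, writer)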
data/ext/parquet-core/tests/concurrent_access.rs (new file)
@@ -0,0 +1,430 @@
+ use bytes::Bytes;
+ use parquet_core::*;
+ use std::sync::{Arc, Mutex};
+ use std::thread;
+
+ #[test]
+ fn test_concurrent_readers() {
+     // Test multiple threads reading the same file simultaneously
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![
+                 SchemaNode::Primitive {
+                     name: "thread_id".to_string(),
+                     primitive_type: PrimitiveType::Int32,
+                     nullable: false,
+                     format: None,
+                 },
+                 SchemaNode::Primitive {
+                     name: "value".to_string(),
+                     primitive_type: PrimitiveType::String,
+                     nullable: false,
+                     format: None,
+                 },
+             ],
+         })
+         .build()
+         .unwrap();
+
+     // Create test data
+     let rows: Vec<Vec<ParquetValue>> = (0..1000)
+         .map(|i| {
+             vec![
+                 ParquetValue::Int32(i),
+                 ParquetValue::String(Arc::from(format!("Value {}", i))),
+             ]
+         })
+         .collect();
+
+     // Write to buffer
+     let mut buffer = Vec::new();
+     {
+         let mut writer = Writer::new(&mut buffer, schema).unwrap();
+         writer.write_rows(rows).unwrap();
+         writer.close().unwrap();
+     }
+
+     let bytes = Arc::new(Bytes::from(buffer));
+     let num_threads = 10;
+     let mut handles = vec![];
+
+     // Spawn multiple reader threads
+     for thread_id in 0..num_threads {
+         let bytes_clone = Arc::clone(&bytes);
+
+         let handle = thread::spawn(move || {
+             let reader = Reader::new((*bytes_clone).clone());
+
+             let mut row_count = 0;
+             let mut sum = 0i32;
+
+             for row_result in reader.read_rows().unwrap() {
+                 let row = row_result.unwrap();
+                 row_count += 1;
+
+                 if let ParquetValue::Int32(val) = &row[0] {
+                     sum += val;
+                 }
+             }
+
+             println!("Thread {} read {} rows, sum: {}", thread_id, row_count, sum);
+             (row_count, sum)
+         });
+
+         handles.push(handle);
+     }
+
+     // Wait for all threads to complete
+     let mut results = vec![];
+     for handle in handles {
+         results.push(handle.join().unwrap());
+     }
+
+     // Verify all threads read the same data
+     let expected_count = 1000;
+     let expected_sum: i32 = (0..1000).sum();
+
+     for (count, sum) in results {
+         assert_eq!(count, expected_count);
+         assert_eq!(sum, expected_sum);
+     }
+ }
+
+ #[test]
+ fn test_reader_independence() {
+     // Test that multiple readers don't interfere with each other
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![SchemaNode::Primitive {
+                 name: "value".to_string(),
+                 primitive_type: PrimitiveType::Int64,
+                 nullable: false,
+                 format: None,
+             }],
+         })
+         .build()
+         .unwrap();
+
+     let rows: Vec<Vec<ParquetValue>> = (0..100).map(|i| vec![ParquetValue::Int64(i)]).collect();
+
+     let mut buffer = Vec::new();
+     {
+         let mut writer = Writer::new(&mut buffer, schema).unwrap();
+         writer.write_rows(rows).unwrap();
+         writer.close().unwrap();
+     }
+
+     let bytes = Bytes::from(buffer);
+
+     // Create two readers
+     let reader1 = Reader::new(bytes.clone());
+     let reader2 = Reader::new(bytes.clone());
+
+     // Read alternately from both readers
+     let mut iter1 = reader1.read_rows().unwrap();
+     let mut iter2 = reader2.read_rows().unwrap();
+
+     let mut values1 = vec![];
+     let mut values2 = vec![];
+
+     // Read 10 from reader1
+     for _ in 0..10 {
+         if let Some(Ok(row)) = iter1.next() {
+             if let ParquetValue::Int64(val) = &row[0] {
+                 values1.push(*val);
+             }
+         }
+     }
+
+     // Read 20 from reader2
+     for _ in 0..20 {
+         if let Some(Ok(row)) = iter2.next() {
+             if let ParquetValue::Int64(val) = &row[0] {
+                 values2.push(*val);
+             }
+         }
+     }
+
+     // Continue reading from reader1
+     for row_result in iter1 {
+         let row = row_result.unwrap();
+         if let ParquetValue::Int64(val) = &row[0] {
+             values1.push(*val);
+         }
+     }
+
+     // Continue reading from reader2
+     for row_result in iter2 {
+         let row = row_result.unwrap();
+         if let ParquetValue::Int64(val) = &row[0] {
+             values2.push(*val);
+         }
+     }
+
+     // Verify both readers read all values independently
+     assert_eq!(values1.len(), 100);
+     assert_eq!(values2.len(), 100);
+
+     // Verify correct sequence
+     for (i, val) in values1.iter().enumerate() {
+         assert_eq!(*val, i as i64);
+     }
+     for (i, val) in values2.iter().enumerate() {
+         assert_eq!(*val, i as i64);
+     }
+ }
+
+ #[test]
+ fn test_concurrent_column_readers() {
+     // Test concurrent column-wise reading
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![
+                 SchemaNode::Primitive {
+                     name: "col1".to_string(),
+                     primitive_type: PrimitiveType::Int32,
+                     nullable: false,
+                     format: None,
+                 },
+                 SchemaNode::Primitive {
+                     name: "col2".to_string(),
+                     primitive_type: PrimitiveType::String,
+                     nullable: false,
+                     format: None,
+                 },
+                 SchemaNode::Primitive {
+                     name: "col3".to_string(),
+                     primitive_type: PrimitiveType::Float64,
+                     nullable: false,
+                     format: None,
+                 },
+             ],
+         })
+         .build()
+         .unwrap();
+
+     let rows: Vec<Vec<ParquetValue>> = (0..500)
+         .map(|i| {
+             vec![
+                 ParquetValue::Int32(i),
+                 ParquetValue::String(Arc::from(format!("String {}", i))),
+                 ParquetValue::Float64(ordered_float::OrderedFloat(i as f64 * 1.5)),
+             ]
+         })
+         .collect();
+
+     let mut buffer = Vec::new();
+     {
+         let mut writer = Writer::new(&mut buffer, schema).unwrap();
+         writer.write_rows(rows).unwrap();
+         writer.close().unwrap();
+     }
+
+     let bytes = Arc::new(Bytes::from(buffer));
+     let mut handles = vec![];
+
+     // Each thread reads a different column
+     let columns = ["col1", "col2", "col3"];
+
+     for (thread_id, column_name) in columns.iter().enumerate() {
+         let bytes_clone = Arc::clone(&bytes);
+         let column = column_name.to_string();
+
+         let handle = thread::spawn(move || {
+             let reader = Reader::new((*bytes_clone).clone());
+
+             let mut batch_count = 0;
+             let mut value_count = 0;
+
+             for batch_result in reader
+                 .read_columns_with_projection(&[column.clone()], None)
+                 .unwrap()
+             {
+                 let batch = batch_result.unwrap();
+                 batch_count += 1;
+
+                 // ColumnBatch has columns as Vec<(String, Vec<ParquetValue>)>
+                 for (col_name, values) in &batch.columns {
+                     if col_name == &column {
+                         value_count += values.len();
+                     }
+                 }
+             }
+
+             println!(
+                 "Thread {} read column '{}': {} batches, {} values",
+                 thread_id, column, batch_count, value_count
+             );
+
+             (batch_count, value_count)
+         });
+
+         handles.push(handle);
+     }
+
+     // Wait for all threads
+     let mut results = vec![];
+     for handle in handles {
+         results.push(handle.join().unwrap());
+     }
+
+     // Verify all threads read successfully
+     // At least one thread should have read values
+     let total_values: usize = results.iter().map(|(_, count)| count).sum();
+     assert!(total_values > 0, "No values read by any thread");
+
+     // Verify that the first column (col1) read all values
+     assert_eq!(results[0].1, 500, "Column col1 should have read 500 values");
+ }
+
+ #[test]
+ fn test_shared_writer_safety() {
+     // Test that writers cannot be safely shared between threads
+     // This test verifies that the API prevents unsafe concurrent writes
+
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![SchemaNode::Primitive {
+                 name: "value".to_string(),
+                 primitive_type: PrimitiveType::Int32,
+                 nullable: false,
+                 format: None,
+             }],
+         })
+         .build()
+         .unwrap();
+
+     // Writers should not implement Send/Sync, so wrapping in Arc<Mutex<>> is necessary
+     let buffer = Arc::new(Mutex::new(Vec::new()));
+
+     // Create a writer wrapped in Arc<Mutex<>>
+     {
+         let buffer_clone = Arc::clone(&buffer);
+         let mut buf = buffer_clone.lock().unwrap();
+
+         let mut writer = Writer::new(&mut *buf, schema).unwrap();
+
+         // Write some data
+         writer.write_row(vec![ParquetValue::Int32(42)]).unwrap();
+         writer.close().unwrap();
+     }
+
+     // Verify the write succeeded
+     let final_buffer = buffer.lock().unwrap();
+     assert!(!final_buffer.is_empty());
+ }
+
+ #[test]
+ fn test_reader_cloning() {
+     // Test that readers can be used independently after cloning bytes
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![SchemaNode::Primitive {
+                 name: "id".to_string(),
+                 primitive_type: PrimitiveType::Int32,
+                 nullable: false,
+                 format: None,
+             }],
+         })
+         .build()
+         .unwrap();
+
+     let rows: Vec<Vec<ParquetValue>> = (0..50).map(|i| vec![ParquetValue::Int32(i)]).collect();
+
+     let mut buffer = Vec::new();
+     {
+         let mut writer = Writer::new(&mut buffer, schema).unwrap();
+         writer.write_rows(rows).unwrap();
+         writer.close().unwrap();
+     }
+
+     let bytes = Bytes::from(buffer);
+
+     // Clone bytes multiple times
+     let bytes1 = bytes.clone();
+     let bytes2 = bytes.clone();
+     let bytes3 = bytes;
+
+     // Create readers from cloned bytes
+     let reader1 = Reader::new(bytes1);
+     let reader2 = Reader::new(bytes2);
+     let reader3 = Reader::new(bytes3);
+
+     // Read from all readers
+     let count1 = reader1.read_rows().unwrap().count();
+     let count2 = reader2.read_rows().unwrap().count();
+     let count3 = reader3.read_rows().unwrap().count();
+
+     assert_eq!(count1, 50);
+     assert_eq!(count2, 50);
+     assert_eq!(count3, 50);
+ }
+
+ #[test]
+ fn test_metadata_concurrent_access() {
+     // Test concurrent access to metadata
+     let schema = SchemaBuilder::new()
+         .with_root(SchemaNode::Struct {
+             name: "root".to_string(),
+             nullable: false,
+             fields: vec![SchemaNode::Primitive {
+                 name: "value".to_string(),
+                 primitive_type: PrimitiveType::String,
+                 nullable: false,
+                 format: None,
+             }],
+         })
+         .build()
+         .unwrap();
+
+     let rows: Vec<Vec<ParquetValue>> = (0..100)
+         .map(|i| vec![ParquetValue::String(Arc::from(format!("Value {}", i)))])
+         .collect();
+
+     let mut buffer = Vec::new();
+     {
+         let mut writer = Writer::new(&mut buffer, schema).unwrap();
+         writer.write_rows(rows).unwrap();
+         writer.close().unwrap();
+     }
+
+     let bytes = Arc::new(Bytes::from(buffer));
+     let mut handles = vec![];
+
+     // Multiple threads accessing metadata
+     for thread_id in 0..5 {
+         let bytes_clone = Arc::clone(&bytes);
+
+         let handle = thread::spawn(move || {
+             let mut reader = Reader::new((*bytes_clone).clone());
+
+             // Access metadata multiple times
+             for _ in 0..10 {
+                 let metadata = reader.metadata().unwrap();
+                 assert_eq!(metadata.num_rows(), 100);
+
+                 // Small delay to increase chance of concurrent access
+                 thread::yield_now();
+             }
+
+             println!("Thread {} successfully accessed metadata", thread_id);
+         });
+
+         handles.push(handle);
+     }
+
+     // Wait for all threads
+     for handle in handles {
+         handle.join().unwrap();
+     }
+ }
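A note on the pattern used throughout these tests: bytes::Bytes is a reference-counted buffer, so the clones each reader thread takes via (*bytes_clone).clone() share a single allocation instead of copying the file contents. A minimal standalone sketch of that property, using only the bytes crate (illustrative; not code from the gem):

    use bytes::Bytes;
    use std::thread;

    fn main() {
        let data = Bytes::from(vec![42u8; 1 << 20]); // 1 MiB of data
        // A clone shares the original allocation; only a refcount is bumped.
        assert_eq!(data.as_ptr(), data.clone().as_ptr());

        // Bytes is Send + Sync, so each thread can own its own cheap view.
        let handles: Vec<_> = (0..4)
            .map(|_| {
                let view = data.clone();
                thread::spawn(move || view.len())
            })
            .collect();
        for handle in handles {
            assert_eq!(handle.join().unwrap(), 1 << 20);
        }
    }

This is what lets every test thread construct its own Reader over the same in-memory file without further coordination.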