parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,431 @@
1
+ use bytes::Bytes;
2
+ use parquet_core::*;
3
+ use std::sync::{Arc as StdArc, Mutex};
4
+ use std::thread;
5
+ use triomphe::Arc;
6
+
7
#[test]
fn test_concurrent_readers() {
    // Ten threads each build their own reader over the same in-memory file and
    // must all observe the complete data set (1000 rows, ids summing to 0+..+999).

    // Builds a non-nullable primitive column with no format override.
    fn required_column(name: &str, primitive_type: PrimitiveType) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable: false,
            format: None,
        }
    }

    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![
            required_column("thread_id", PrimitiveType::Int32),
            required_column("value", PrimitiveType::String),
        ],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    // Fixture rows: (i, "Value i") for i in 0..1000.
    let mut rows: Vec<Vec<ParquetValue>> = Vec::with_capacity(1000);
    for i in 0..1000 {
        rows.push(vec![
            ParquetValue::Int32(i),
            ParquetValue::String(Arc::from(format!("Value {}", i))),
        ]);
    }

    // Serialize into a Vec; the inner scope ends the writer's borrow of `buffer`.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let shared_bytes = StdArc::new(Bytes::from(buffer));
    let num_threads = 10;

    // `collect` drives the iterator eagerly, so every thread is spawned before
    // any of them is joined.
    let workers: Vec<_> = (0..num_threads)
        .map(|thread_id| {
            let local_bytes = StdArc::clone(&shared_bytes);

            thread::spawn(move || {
                let reader = Reader::new((*local_bytes).clone());

                let mut rows_seen = 0;
                let mut id_total = 0i32;

                for row in reader.read_rows().unwrap().map(|item| item.unwrap()) {
                    rows_seen += 1;

                    if let ParquetValue::Int32(id) = &row[0] {
                        id_total += *id;
                    }
                }

                println!("Thread {} read {} rows, sum: {}", thread_id, rows_seen, id_total);
                (rows_seen, id_total)
            })
        })
        .collect();

    // Join every worker; a panic inside a thread surfaces here via `unwrap`.
    let results: Vec<_> = workers
        .into_iter()
        .map(|worker| worker.join().unwrap())
        .collect();

    // Each thread must have seen exactly the same, complete data.
    let expected_count = 1000;
    let expected_sum: i32 = (0..1000).sum();

    for (count, sum) in results {
        assert_eq!(count, expected_count);
        assert_eq!(sum, expected_sum);
    }
}
95
+
96
#[test]
fn test_reader_independence() {
    // Two readers over the same bytes are advanced in an interleaved order; each
    // one must still yield the full 0..100 sequence on its own.
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![SchemaNode::Primitive {
            name: "value".to_string(),
            primitive_type: PrimitiveType::Int64,
            nullable: false,
            format: None,
        }],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut rows: Vec<Vec<ParquetValue>> = Vec::with_capacity(100);
    for i in 0..100 {
        rows.push(vec![ParquetValue::Int64(i)]);
    }

    // The inner scope ends the writer's borrow of `buffer` before it is moved.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let bytes = Bytes::from(buffer);

    // One reader per clone of the byte handle.
    let reader_a = Reader::new(bytes.clone());
    let reader_b = Reader::new(bytes.clone());

    let mut stream_a = reader_a.read_rows().unwrap();
    let mut stream_b = reader_b.read_rows().unwrap();

    let mut seen_a = vec![];
    let mut seen_b = vec![];

    // Phase 1: pull up to 10 rows from A. A missing or failed row is skipped
    // here and would be caught by the length assertions below.
    for _ in 0..10 {
        if let Some(Ok(row)) = stream_a.next() {
            if let ParquetValue::Int64(value) = &row[0] {
                seen_a.push(*value);
            }
        }
    }

    // Phase 2: pull up to 20 rows from B while A is paused mid-stream.
    for _ in 0..20 {
        if let Some(Ok(row)) = stream_b.next() {
            if let ParquetValue::Int64(value) = &row[0] {
                seen_b.push(*value);
            }
        }
    }

    // Phase 3: drain the remainder of A; read errors now fail the test.
    for row in stream_a.map(|item| item.unwrap()) {
        if let ParquetValue::Int64(value) = &row[0] {
            seen_a.push(*value);
        }
    }

    // Phase 4: drain the remainder of B.
    for row in stream_b.map(|item| item.unwrap()) {
        if let ParquetValue::Int64(value) = &row[0] {
            seen_b.push(*value);
        }
    }

    // Both readers must have produced every row...
    assert_eq!(seen_a.len(), 100);
    assert_eq!(seen_b.len(), 100);

    // ...in ascending order starting from zero.
    for (expected, actual) in (0i64..).zip(seen_a.iter()) {
        assert_eq!(*actual, expected);
    }
    for (expected, actual) in (0i64..).zip(seen_b.iter()) {
        assert_eq!(*actual, expected);
    }
}
181
+
182
#[test]
fn test_concurrent_column_readers() {
    // Three threads each read a single projected column from the same file at
    // the same time.

    // Builds a non-nullable primitive column with no format override.
    fn required_column(name: &str, primitive_type: PrimitiveType) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable: false,
            format: None,
        }
    }

    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![
            required_column("col1", PrimitiveType::Int32),
            required_column("col2", PrimitiveType::String),
            required_column("col3", PrimitiveType::Float64),
        ],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    // Fixture rows: (i, "String i", i * 1.5) for i in 0..500.
    let mut rows: Vec<Vec<ParquetValue>> = Vec::with_capacity(500);
    for i in 0..500 {
        rows.push(vec![
            ParquetValue::Int32(i),
            ParquetValue::String(Arc::from(format!("String {}", i))),
            ParquetValue::Float64(ordered_float::OrderedFloat(i as f64 * 1.5)),
        ]);
    }

    // The inner scope ends the writer's borrow of `buffer` before it is moved.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let shared_bytes = StdArc::new(Bytes::from(buffer));

    // One worker per column; `collect` spawns all of them before any join.
    let columns = ["col1", "col2", "col3"];
    let workers: Vec<_> = columns
        .iter()
        .enumerate()
        .map(|(thread_id, column_name)| {
            let local_bytes = StdArc::clone(&shared_bytes);
            let column = column_name.to_string();

            thread::spawn(move || {
                let reader = Reader::new((*local_bytes).clone());

                let mut batch_count = 0;
                let mut value_count = 0;

                for batch_result in reader
                    .read_columns_with_projection(&[column.clone()], None)
                    .unwrap()
                {
                    let batch = batch_result.unwrap();
                    batch_count += 1;

                    // Only count values belonging to the column this thread asked for.
                    for (name, values) in &batch.columns {
                        if *name == column {
                            value_count += values.len();
                        }
                    }
                }

                println!(
                    "Thread {} read column '{}': {} batches, {} values",
                    thread_id, column, batch_count, value_count
                );

                (batch_count, value_count)
            })
        })
        .collect();

    // Join every worker; a panic inside a thread surfaces here via `unwrap`.
    let results: Vec<_> = workers
        .into_iter()
        .map(|worker| worker.join().unwrap())
        .collect();

    // Across all threads, at least some values must have been read.
    let total_values: usize = results.iter().map(|&(_, values)| values).sum();
    assert!(total_values > 0, "No values read by any thread");

    // Only the first worker (col1) is checked for an exact count.
    // NOTE(review): col2/col3 counts are not asserted — confirm whether that is intentional.
    assert_eq!(results[0].1, 500, "Column col1 should have read 500 values");
}
286
+
287
#[test]
fn test_shared_writer_safety() {
    // Writes one row through a writer whose sink is a Vec held inside an
    // Arc<Mutex<..>>, then checks that the sink received some bytes.
    // NOTE(review): everything here runs on a single thread; this test does not
    // itself check whether `Writer` is Send/Sync or reject concurrent writes —
    // confirm the intended coverage.
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![SchemaNode::Primitive {
            name: "value".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let shared_buffer = StdArc::new(Mutex::new(Vec::new()));

    // Hold the lock only while the writer is alive; the guard is released at
    // the end of this scope so the buffer can be locked again below.
    {
        let mut sink = shared_buffer.lock().unwrap();
        let mut writer = Writer::new(&mut *sink, schema).unwrap();

        writer.write_row(vec![ParquetValue::Int32(42)]).unwrap();
        writer.close().unwrap();
    }

    // The closed writer must have emitted output into the shared buffer.
    let written = shared_buffer.lock().unwrap();
    assert!(!written.is_empty());
}
325
+
326
#[test]
fn test_reader_cloning() {
    // Three readers, each built from its own handle to the same bytes, must each
    // yield all 50 rows.
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![SchemaNode::Primitive {
            name: "id".to_string(),
            primitive_type: PrimitiveType::Int32,
            nullable: false,
            format: None,
        }],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut rows: Vec<Vec<ParquetValue>> = Vec::with_capacity(50);
    for i in 0..50 {
        rows.push(vec![ParquetValue::Int32(i)]);
    }

    // The inner scope ends the writer's borrow of `buffer` before it is moved.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let bytes = Bytes::from(buffer);

    // Two clones plus the original handle, which is moved into the last reader.
    let readers = vec![
        Reader::new(bytes.clone()),
        Reader::new(bytes.clone()),
        Reader::new(bytes),
    ];

    // Count the rows of every reader first, then verify the counts.
    let counts: Vec<usize> = readers
        .into_iter()
        .map(|reader| reader.read_rows().unwrap().count())
        .collect();

    for count in counts {
        assert_eq!(count, 50);
    }
}
373
+
374
#[test]
fn test_metadata_concurrent_access() {
    // Five threads each build their own reader over the same bytes and query the
    // file metadata repeatedly; every query must report 100 rows.
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![SchemaNode::Primitive {
            name: "value".to_string(),
            primitive_type: PrimitiveType::String,
            nullable: false,
            format: None,
        }],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let mut rows: Vec<Vec<ParquetValue>> = Vec::with_capacity(100);
    for i in 0..100 {
        rows.push(vec![ParquetValue::String(Arc::from(format!("Value {}", i)))]);
    }

    // The inner scope ends the writer's borrow of `buffer` before it is moved.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let shared_bytes = StdArc::new(Bytes::from(buffer));

    // `collect` spawns all workers before any of them is joined.
    let workers: Vec<_> = (0..5)
        .map(|thread_id| {
            let local_bytes = StdArc::clone(&shared_bytes);

            thread::spawn(move || {
                let mut reader = Reader::new((*local_bytes).clone());

                // Query the metadata ten times, yielding between queries to
                // encourage interleaving with the other threads.
                for _ in 0..10 {
                    assert_eq!(reader.metadata().unwrap().num_rows(), 100);
                    thread::yield_now();
                }

                println!("Thread {} successfully accessed metadata", thread_id);
            })
        })
        .collect();

    // A failed assertion inside a worker surfaces here as a join error.
    for worker in workers {
        worker.join().unwrap();
    }
}