parquet 0.5.13 → 0.6.0
This diff covers publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/tests/concurrent_access.rs

```diff
@@ -0,0 +1,430 @@
+use bytes::Bytes;
+use parquet_core::*;
+use std::sync::{Arc, Mutex};
+use std::thread;
+
+#[test]
+fn test_concurrent_readers() {
+    // Test multiple threads reading the same file simultaneously
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "thread_id".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Create test data
+    let rows: Vec<Vec<ParquetValue>> = (0..1000)
+        .map(|i| {
+            vec![
+                ParquetValue::Int32(i),
+                ParquetValue::String(Arc::from(format!("Value {}", i))),
+            ]
+        })
+        .collect();
+
+    // Write to buffer
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Arc::new(Bytes::from(buffer));
+    let num_threads = 10;
+    let mut handles = vec![];
+
+    // Spawn multiple reader threads
+    for thread_id in 0..num_threads {
+        let bytes_clone = Arc::clone(&bytes);
+
+        let handle = thread::spawn(move || {
+            let reader = Reader::new((*bytes_clone).clone());
+
+            let mut row_count = 0;
+            let mut sum = 0i32;
+
+            for row_result in reader.read_rows().unwrap() {
+                let row = row_result.unwrap();
+                row_count += 1;
+
+                if let ParquetValue::Int32(val) = &row[0] {
+                    sum += val;
+                }
+            }
+
+            println!("Thread {} read {} rows, sum: {}", thread_id, row_count, sum);
+            (row_count, sum)
+        });
+
+        handles.push(handle);
+    }
+
+    // Wait for all threads to complete
+    let mut results = vec![];
+    for handle in handles {
+        results.push(handle.join().unwrap());
+    }
+
+    // Verify all threads read the same data
+    let expected_count = 1000;
+    let expected_sum: i32 = (0..1000).sum();
+
+    for (count, sum) in results {
+        assert_eq!(count, expected_count);
+        assert_eq!(sum, expected_sum);
+    }
+}
+
+#[test]
+fn test_reader_independence() {
+    // Test that multiple readers don't interfere with each other
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::Int64,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..100).map(|i| vec![ParquetValue::Int64(i)]).collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Bytes::from(buffer);
+
+    // Create two readers
+    let reader1 = Reader::new(bytes.clone());
+    let reader2 = Reader::new(bytes.clone());
+
+    // Read alternately from both readers
+    let mut iter1 = reader1.read_rows().unwrap();
+    let mut iter2 = reader2.read_rows().unwrap();
+
+    let mut values1 = vec![];
+    let mut values2 = vec![];
+
+    // Read 10 from reader1
+    for _ in 0..10 {
+        if let Some(Ok(row)) = iter1.next() {
+            if let ParquetValue::Int64(val) = &row[0] {
+                values1.push(*val);
+            }
+        }
+    }
+
+    // Read 20 from reader2
+    for _ in 0..20 {
+        if let Some(Ok(row)) = iter2.next() {
+            if let ParquetValue::Int64(val) = &row[0] {
+                values2.push(*val);
+            }
+        }
+    }
+
+    // Continue reading from reader1
+    for row_result in iter1 {
+        let row = row_result.unwrap();
+        if let ParquetValue::Int64(val) = &row[0] {
+            values1.push(*val);
+        }
+    }
+
+    // Continue reading from reader2
+    for row_result in iter2 {
+        let row = row_result.unwrap();
+        if let ParquetValue::Int64(val) = &row[0] {
+            values2.push(*val);
+        }
+    }
+
+    // Verify both readers read all values independently
+    assert_eq!(values1.len(), 100);
+    assert_eq!(values2.len(), 100);
+
+    // Verify correct sequence
+    for (i, val) in values1.iter().enumerate() {
+        assert_eq!(*val, i as i64);
+    }
+    for (i, val) in values2.iter().enumerate() {
+        assert_eq!(*val, i as i64);
+    }
+}
+
+#[test]
+fn test_concurrent_column_readers() {
+    // Test concurrent column-wise reading
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "col1".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "col2".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "col3".to_string(),
+                    primitive_type: PrimitiveType::Float64,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..500)
+        .map(|i| {
+            vec![
+                ParquetValue::Int32(i),
+                ParquetValue::String(Arc::from(format!("String {}", i))),
+                ParquetValue::Float64(ordered_float::OrderedFloat(i as f64 * 1.5)),
+            ]
+        })
+        .collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Arc::new(Bytes::from(buffer));
+    let mut handles = vec![];
+
+    // Each thread reads a different column
+    let columns = ["col1", "col2", "col3"];
+
+    for (thread_id, column_name) in columns.iter().enumerate() {
+        let bytes_clone = Arc::clone(&bytes);
+        let column = column_name.to_string();
+
+        let handle = thread::spawn(move || {
+            let reader = Reader::new((*bytes_clone).clone());
+
+            let mut batch_count = 0;
+            let mut value_count = 0;
+
+            for batch_result in reader
+                .read_columns_with_projection(&[column.clone()], None)
+                .unwrap()
+            {
+                let batch = batch_result.unwrap();
+                batch_count += 1;
+
+                // ColumnBatch has columns as Vec<(String, Vec<ParquetValue>)>
+                for (col_name, values) in &batch.columns {
+                    if col_name == &column {
+                        value_count += values.len();
+                    }
+                }
+            }
+
+            println!(
+                "Thread {} read column '{}': {} batches, {} values",
+                thread_id, column, batch_count, value_count
+            );
+
+            (batch_count, value_count)
+        });
+
+        handles.push(handle);
+    }
+
+    // Wait for all threads
+    let mut results = vec![];
+    for handle in handles {
+        results.push(handle.join().unwrap());
+    }
+
+    // Verify all threads read successfully
+    // At least one thread should have read values
+    let total_values: usize = results.iter().map(|(_, count)| count).sum();
+    assert!(total_values > 0, "No values read by any thread");
+
+    // Verify that the first column (col1) read all values
+    assert_eq!(results[0].1, 500, "Column col1 should have read 500 values");
+}
+
+#[test]
+fn test_shared_writer_safety() {
+    // Test that writers cannot be safely shared between threads
+    // This test verifies that the API prevents unsafe concurrent writes
+
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::Int32,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    // Writers should not implement Send/Sync, so wrapping in Arc<Mutex<>> is necessary
+    let buffer = Arc::new(Mutex::new(Vec::new()));
+
+    // Create a writer wrapped in Arc<Mutex<>>
+    {
+        let buffer_clone = Arc::clone(&buffer);
+        let mut buf = buffer_clone.lock().unwrap();
+
+        let mut writer = Writer::new(&mut *buf, schema).unwrap();
+
+        // Write some data
+        writer.write_row(vec![ParquetValue::Int32(42)]).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Verify the write succeeded
+    let final_buffer = buffer.lock().unwrap();
+    assert!(!final_buffer.is_empty());
+}
+
+#[test]
+fn test_reader_cloning() {
+    // Test that readers can be used independently after cloning bytes
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "id".to_string(),
+                primitive_type: PrimitiveType::Int32,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..50).map(|i| vec![ParquetValue::Int32(i)]).collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Bytes::from(buffer);
+
+    // Clone bytes multiple times
+    let bytes1 = bytes.clone();
+    let bytes2 = bytes.clone();
+    let bytes3 = bytes;
+
+    // Create readers from cloned bytes
+    let reader1 = Reader::new(bytes1);
+    let reader2 = Reader::new(bytes2);
+    let reader3 = Reader::new(bytes3);
+
+    // Read from all readers
+    let count1 = reader1.read_rows().unwrap().count();
+    let count2 = reader2.read_rows().unwrap().count();
+    let count3 = reader3.read_rows().unwrap().count();
+
+    assert_eq!(count1, 50);
+    assert_eq!(count2, 50);
+    assert_eq!(count3, 50);
+}
+
+#[test]
+fn test_metadata_concurrent_access() {
+    // Test concurrent access to metadata
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::String,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..100)
+        .map(|i| vec![ParquetValue::String(Arc::from(format!("Value {}", i)))])
+        .collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    let bytes = Arc::new(Bytes::from(buffer));
+    let mut handles = vec![];
+
+    // Multiple threads accessing metadata
+    for thread_id in 0..5 {
+        let bytes_clone = Arc::clone(&bytes);
+
+        let handle = thread::spawn(move || {
+            let mut reader = Reader::new((*bytes_clone).clone());
+
+            // Access metadata multiple times
+            for _ in 0..10 {
+                let metadata = reader.metadata().unwrap();
+                assert_eq!(metadata.num_rows(), 100);
+
+                // Small delay to increase chance of concurrent access
+                thread::yield_now();
+            }
+
+            println!("Thread {} successfully accessed metadata", thread_id);
+        });
+
+        handles.push(handle);
+    }
+
+    // Wait for all threads
+    for handle in handles {
+        handle.join().unwrap();
+    }
+}
```
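Throughout these tests, each thread gets its own `Reader` over the same `bytes::Bytes` handle. That is cheap because cloning `Bytes` only bumps a reference count on a shared allocation rather than copying the file contents, which is what makes the pattern above practical. A standalone check of that property, independent of the parquet crates:

```rust
use bytes::Bytes;
use std::thread;

fn main() {
    let original = Bytes::from(vec![1u8, 2, 3, 4]);
    let clone = original.clone(); // bumps a refcount; no byte copy

    // Both handles view the same backing memory.
    assert_eq!(original.as_ptr(), clone.as_ptr());

    // Clones move freely to other threads, as the reader tests rely on.
    let handle = thread::spawn(move || clone.len());
    assert_eq!(handle.join().unwrap(), 4);
}
```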