parquet 0.5.12 → 0.6.0
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/tests/null_handling_tests.rs
@@ -0,0 +1,430 @@
use bytes::Bytes;
use indexmap::IndexMap;
use parquet_core::*;
use std::sync::Arc;

mod test_helpers;
use test_helpers::*;

#[test]
fn test_null_handling_all_types() {
    // Test null handling for all nullable primitive types
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "bool_field".to_string(),
                    primitive_type: PrimitiveType::Boolean,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "int32_field".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "int64_field".to_string(),
                    primitive_type: PrimitiveType::Int64,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "float32_field".to_string(),
                    primitive_type: PrimitiveType::Float32,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "float64_field".to_string(),
                    primitive_type: PrimitiveType::Float64,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "string_field".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "binary_field".to_string(),
                    primitive_type: PrimitiveType::Binary,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "decimal128_field".to_string(),
                    primitive_type: PrimitiveType::Decimal128(10, 2),
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Test 1: All values present
    let all_present = vec![
        ParquetValue::Boolean(true),
        ParquetValue::Int32(42),
        ParquetValue::Int64(12345),
        ParquetValue::Float32(ordered_float::OrderedFloat(3.14)),
        ParquetValue::Float64(ordered_float::OrderedFloat(2.718)),
        ParquetValue::String(Arc::from("test")),
        ParquetValue::Bytes(Bytes::from(vec![1, 2, 3, 4])),
        ParquetValue::Decimal128(12345, 2),
    ];

    // Test 2: All nulls
    let all_nulls: Vec<ParquetValue> = (0..8).map(|_| ParquetValue::Null).collect();

    // Test 3: Mixed nulls and values - alternating pattern
    let mixed_alternating = vec![
        ParquetValue::Boolean(true),
        ParquetValue::Null,
        ParquetValue::Int64(12345),
        ParquetValue::Null,
        ParquetValue::Float64(ordered_float::OrderedFloat(2.718)),
        ParquetValue::Null,
        ParquetValue::Bytes(Bytes::from(vec![1, 2, 3, 4])),
        ParquetValue::Null,
    ];

    // Test 4: Mixed nulls and values - sparse pattern (mostly nulls)
    let mixed_sparse = vec![
        ParquetValue::Null,
        ParquetValue::Null,
        ParquetValue::Int64(12345),
        ParquetValue::Null,
        ParquetValue::Null,
        ParquetValue::Null,
        ParquetValue::Null,
        ParquetValue::Decimal128(12345, 2),
    ];

    let test_rows = [
        vec![all_present.clone()],
        vec![all_nulls.clone()],
        vec![mixed_alternating.clone()],
        vec![mixed_sparse.clone()],
        // Add multiple rows to test null patterns
        (0..10)
            .map(|i| {
                if i % 3 == 0 {
                    all_nulls.clone()
                } else if i % 2 == 0 {
                    mixed_alternating.clone()
                } else {
                    all_present.clone()
                }
            })
            .collect::<Vec<_>>(),
    ]
    .concat();

    // Use test helper for roundtrip
    test_roundtrip(test_rows, schema).unwrap();
}

#[test]
fn test_all_null_column() {
    // Test handling of columns where all values are null
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "optional".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let rows: Vec<Vec<ParquetValue>> = (0..100)
        .map(|i| vec![ParquetValue::Int32(i), ParquetValue::Null])
        .collect();

    // Use test helper for roundtrip
    test_roundtrip(rows, schema).unwrap();
}

#[test]
fn test_null_patterns() {
    let patterns: Vec<(&str, Box<dyn Fn(usize) -> bool>)> = vec![
        ("alternating", Box::new(|i: usize| i % 2 == 0)),
        ("sparse_90_percent", Box::new(|i: usize| i % 10 != 0)),
        ("dense_10_percent", Box::new(|i: usize| i % 10 == 0)),
        ("first_half", Box::new(|i: usize| i < 500)),
        ("last_half", Box::new(|i: usize| i >= 500)),
        ("blocks_of_10", Box::new(|i: usize| (i / 10) % 2 == 0)),
    ];

    for (pattern_name, is_null) in patterns {
        // Test various null distribution patterns
        let schema = SchemaBuilder::new()
            .with_root(SchemaNode::Struct {
                name: "root".to_string(),
                nullable: false,
                fields: vec![
                    SchemaNode::Primitive {
                        name: "id".to_string(),
                        primitive_type: PrimitiveType::Int32,
                        nullable: false,
                        format: None,
                    },
                    SchemaNode::Primitive {
                        name: "value".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: true,
                        format: None,
                    },
                ],
            })
            .build()
            .unwrap();

        let rows: Vec<Vec<ParquetValue>> = (0..1000)
            .map(|i| {
                vec![
                    ParquetValue::Int32(i as i32),
                    if is_null(i) {
                        ParquetValue::Null
                    } else {
                        ParquetValue::String(Arc::from(format!("value_{}", i)))
                    },
                ]
            })
            .collect();

        // Count nulls for verification
        let null_count = rows
            .iter()
            .filter(|row| matches!(row[1], ParquetValue::Null))
            .count();
        println!("{} pattern - nulls: {}/1000", pattern_name, null_count);

        // Use test helper for roundtrip
        test_roundtrip(rows, schema).unwrap();
    }
}

#[test]
fn test_deeply_nested_nulls() {
    // Test nulls at various levels of nesting
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Struct {
                    name: "nested".to_string(),
                    nullable: true,
                    fields: vec![
                        SchemaNode::Primitive {
                            name: "value".to_string(),
                            primitive_type: PrimitiveType::String,
                            nullable: true,
                            format: None,
                        },
                        SchemaNode::List {
                            name: "items".to_string(),
                            nullable: true,
                            item: Box::new(SchemaNode::Primitive {
                                name: "item".to_string(),
                                primitive_type: PrimitiveType::Int32,
                                nullable: true,
                                format: None,
                            }),
                        },
                    ],
                },
            ],
        })
        .build()
        .unwrap();

    let rows = vec![
        // Entire struct is null
        vec![ParquetValue::Int32(1), ParquetValue::Null],
        // Struct with null value and null list
        vec![
            ParquetValue::Int32(2),
            ParquetValue::Record({
                let mut map = IndexMap::new();
                map.insert(Arc::from("value"), ParquetValue::Null);
                map.insert(Arc::from("items"), ParquetValue::Null);
                map
            }),
        ],
        // Struct with value and list containing nulls
        vec![
            ParquetValue::Int32(3),
            ParquetValue::Record({
                let mut map = IndexMap::new();
                map.insert(Arc::from("value"), ParquetValue::String(Arc::from("test")));
                map.insert(
                    Arc::from("items"),
                    ParquetValue::List(vec![
                        ParquetValue::Int32(1),
                        ParquetValue::Null,
                        ParquetValue::Int32(3),
                    ]),
                );
                map
            }),
        ],
        // Struct with null value and empty list
        vec![
            ParquetValue::Int32(4),
            ParquetValue::Record({
                let mut map = IndexMap::new();
                map.insert(Arc::from("value"), ParquetValue::Null);
                map.insert(Arc::from("items"), ParquetValue::List(vec![]));
                map
            }),
        ],
    ];

    // Use test helper for roundtrip
    test_roundtrip(rows, schema).unwrap();
}

#[test]
fn test_null_across_row_groups() {
    // Test null handling when nulls span multiple row groups
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int64,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "value".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Create rows where entire row groups might be null
    // Assuming default row group size, create patterns that span groups
    let rows: Vec<Vec<ParquetValue>> = (0..10000)
        .map(|i| {
            vec![
                ParquetValue::Int64(i),
                // First 5000 rows: all null
                // Next 2500 rows: all values
                // Last 2500 rows: alternating
                if i < 5000 {
                    ParquetValue::Null
                } else if i < 7500 {
                    ParquetValue::String(Arc::from(format!("value_{}", i)))
                } else if i % 2 == 0 {
                    ParquetValue::Null
                } else {
                    ParquetValue::String(Arc::from(format!("value_{}", i)))
                },
            ]
        })
        .collect();

    // Use test helper with specific batch size to control row groups
    let result = test_roundtrip_with_options(
        rows.clone(),
        schema,
        parquet::basic::Compression::UNCOMPRESSED,
        Some(1000), // Force smaller row groups
    );

    assert!(result.is_ok());

    // Additional verification - ensure the null pattern is preserved
    let null_count = rows
        .iter()
        .filter(|row| matches!(row[1], ParquetValue::Null))
        .count();
    assert_eq!(null_count, 6250); // 5000 + 1250 = 6250 nulls
}

#[test]
fn test_sparse_columns_with_compression() {
    // Test compression effectiveness on sparse columns (95% null)
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "sparse_data".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let rows: Vec<Vec<ParquetValue>> = (0..10000)
        .map(|i| {
            vec![
                ParquetValue::Int32(i),
                if i % 20 == 0 {
                    ParquetValue::String(Arc::from(format!("rare_value_{}", i)))
                } else {
                    ParquetValue::Null
                },
            ]
        })
        .collect();

    use parquet::basic::Compression;

    let compressions = vec![
        ("UNCOMPRESSED", Compression::UNCOMPRESSED),
        ("SNAPPY", Compression::SNAPPY),
        ("ZSTD", Compression::ZSTD(Default::default())),
    ];

    for (name, compression) in compressions {
        let result = test_roundtrip_with_options(rows.clone(), schema.clone(), compression, None);

        assert!(result.is_ok(), "Failed with {}: {:?}", name, result);
        println!("Sparse column (95% null) with {} succeeded", name);
    }
}
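The roundtrip helpers used throughout these tests, test_roundtrip and test_roundtrip_with_options, come from data/ext/parquet-core/tests/test_helpers.rs (+132 in the summary above), which is not expanded in this diff. For orientation, here is a minimal sketch of the shape such a helper could take, assuming the Writer/Reader API visible in the tests, a Schema type produced by SchemaBuilder::build, and a PartialEq impl on ParquetValue; the name, signature, and behavior are assumptions, not the gem's actual code:

use bytes::Bytes;
use parquet_core::*;

// Hypothetical sketch, not the shipped helper: write rows into an
// in-memory buffer, read them back, and assert value-level equality.
// The real helper also accepts compression and batch-size options.
fn test_roundtrip(rows: Vec<Vec<ParquetValue>>, schema: Schema) -> Result<()> {
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema)?;
        writer.write_rows(rows.clone())?;
        writer.close()?;
    }

    let reader = Reader::new(Bytes::from(buffer));
    let read_back: Vec<Vec<ParquetValue>> =
        reader.read_rows()?.collect::<Result<Vec<_>>>()?;
    assert_eq!(rows, read_back, "roundtrip produced different rows");
    Ok(())
}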
data/ext/parquet-core/tests/performance_memory.rs
@@ -0,0 +1,181 @@
use bytes::Bytes;
use parquet_core::*;
use std::time::Instant;

#[test]
fn test_iterator_early_termination() {
    // Test that dropping an iterator early doesn't cause issues
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "value".to_string(),
                primitive_type: PrimitiveType::Int32,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    let rows: Vec<Vec<ParquetValue>> = (0..100).map(|i| vec![ParquetValue::Int32(i)]).collect();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes.clone());

    // Only read first 10 rows then drop iterator
    let mut count = 0;
    for row_result in reader.read_rows().unwrap() {
        let _row = row_result.unwrap();
        count += 1;
        if count >= 10 {
            break;
        }
    }

    assert_eq!(count, 10);

    // Ensure we can create a new iterator after dropping the previous one
    let reader2 = Reader::new(bytes);
    let all_rows: Vec<_> = reader2
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(all_rows.len(), 100);
}

#[test]
fn test_performance_different_batch_sizes() {
    // Test performance characteristics with different batch sizes
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "value".to_string(),
                primitive_type: PrimitiveType::Int64,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    let total_rows = 10000;
    let batch_sizes = vec![1, 10, 100, 1000, 10000];

    for batch_size in batch_sizes {
        let mut buffer = Vec::new();

        let write_start = Instant::now();
        {
            let mut writer = Writer::new(&mut buffer, schema.clone()).unwrap();

            for batch_start in (0..total_rows).step_by(batch_size) {
                let batch_end = (batch_start + batch_size).min(total_rows);
                let rows: Vec<Vec<ParquetValue>> = (batch_start..batch_end)
                    .map(|i| vec![ParquetValue::Int64(i as i64)])
                    .collect();

                writer.write_rows(rows).unwrap();
            }

            writer.close().unwrap();
        }
        let write_duration = write_start.elapsed();

        // Read back
        let bytes = Bytes::from(buffer);
        let reader = Reader::new(bytes);

        let read_start = Instant::now();
        let count = reader.read_rows().unwrap().count();
        let read_duration = read_start.elapsed();

        assert_eq!(count, total_rows);

        println!(
            "Batch size {}: Write {:?}, Read {:?}",
            batch_size, write_duration, read_duration
        );
    }
}

#[test]
fn test_string_interning_efficiency() {
    // Test efficiency when writing many repeated strings
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "category".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "value".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let categories = ["A", "B", "C", "D", "E"];
    let rows: Vec<Vec<ParquetValue>> = (0..10000)
        .map(|i| {
            vec![
                ParquetValue::String(categories[i % categories.len()].into()),
                ParquetValue::Int32(i as i32),
            ]
        })
        .collect();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // The file should be efficiently encoded due to repeated strings
    let file_size = buffer.len();
    println!("File size with repeated strings: {} bytes", file_size);

    // Verify we can read it back correctly
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), 10000);

    // Verify the pattern
    for (i, row) in read_rows.iter().enumerate() {
        match &row[0] {
            ParquetValue::String(s) => {
                assert_eq!(*s, categories[i % categories.len()].into());
            }
            _ => panic!("Expected string value"),
        }
    }
}
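Note that test_string_interning_efficiency only checks efficient encoding of the repeated strings indirectly, through the printed file size. If one wanted to confirm it directly, the underlying parquet crate exposes the encodings chosen per column chunk in the file metadata. A hypothetical check along those lines (not part of the shipped tests; column_uses_dictionary is an illustrative name):

use parquet::basic::Encoding;
use parquet::file::reader::{FileReader, SerializedFileReader};

// Illustrative only: returns true if the first column chunk of the first
// row group was written with a dictionary encoding, which is what we'd
// expect for a low-cardinality string column like "category".
fn column_uses_dictionary(bytes: bytes::Bytes) -> bool {
    let reader = SerializedFileReader::new(bytes).unwrap();
    reader
        .metadata()
        .row_group(0)
        .column(0)
        .encodings()
        .iter()
        .any(|e| matches!(*e, Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY))
}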