parquet 0.5.12 → 0.6.0
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
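The file list reflects a restructuring of the native extension: the reader, writer, and type-conversion modules that previously lived under data/ext/parquet/src are removed, and their logic moves into two new crates, parquet-core (Ruby-agnostic Parquet reading and writing, with an extensive test suite) and parquet-ruby-adapter (the Ruby-facing layer), with data/ext/parquet reduced to FFI glue. The hunk below is one of the new parquet-core test files. For orientation, here is a minimal write/read round trip in the style those tests use; this is a sketch assuming only the names visible in the hunk (SchemaBuilder, SchemaNode, PrimitiveType, Writer, Reader, ParquetValue), and the function name roundtrip_sketch is illustrative, not part of the crate:

use bytes::Bytes;
use parquet_core::*;

// A minimal write-then-read round trip, mirroring how the tests below
// drive the parquet-core API.
fn roundtrip_sketch() -> Result<()> {
    // Build a schema with a single non-nullable Int64 column.
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "id".to_string(),
                primitive_type: PrimitiveType::Int64,
                nullable: false,
                format: None,
            }],
        })
        .build()?;

    // Write rows of ParquetValue into an in-memory buffer; the scoped
    // block ends the writer's borrow of the buffer, as in the tests.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema)?;
        writer.write_rows(vec![vec![ParquetValue::Int64(1)]])?;
        writer.close()?;
    }

    // Read the rows back through the Bytes-based Reader.
    let reader = Reader::new(Bytes::from(buffer));
    let rows: Vec<_> = reader.read_rows()?.collect::<Result<Vec<_>>>()?;
    assert_eq!(rows, vec![vec![ParquetValue::Int64(1)]]);
    Ok(())
}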
@@ -0,0 +1,557 @@
+use bytes::Bytes;
+use indexmap::IndexMap;
+use ordered_float::OrderedFloat;
+use parquet_core::*;
+use std::sync::Arc;
+
+#[test]
+fn test_read_with_missing_columns() {
+    // Test reading a file when the reader expects more columns than exist
+    let write_schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "name".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..10)
+        .map(|i| {
+            vec![
+                ParquetValue::Int64(i),
+                ParquetValue::String(Arc::from(format!("Name {}", i))),
+            ]
+        })
+        .collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, write_schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read with projection asking for a column that doesn't exist
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    // Try to read non-existent column
+    let projection = vec!["id".to_string(), "name".to_string(), "age".to_string()];
+    let read_rows: Vec<_> = reader
+        .read_rows_with_projection(&projection)
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    // Should only get the columns that exist
+    assert_eq!(read_rows.len(), 10);
+    for row in &read_rows {
+        assert_eq!(row.len(), 2); // Only id and name
+    }
+}
+
+#[test]
+fn test_read_with_extra_columns() {
+    // Test reading a file that has more columns than the reader expects
+    let write_schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "name".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "age".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "salary".to_string(),
+                    primitive_type: PrimitiveType::Float64,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = (0..10)
+        .map(|i| {
+            vec![
+                ParquetValue::Int64(i),
+                ParquetValue::String(Arc::from(format!("Person {}", i))),
+                ParquetValue::Int32(25 + i as i32),
+                ParquetValue::Float64(OrderedFloat(50000.0 + i as f64 * 1000.0)),
+            ]
+        })
+        .collect();
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, write_schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read only subset of columns
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    let projection = vec!["id".to_string(), "name".to_string()];
+    let read_rows: Vec<_> = reader
+        .read_rows_with_projection(&projection)
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_rows.len(), 10);
+    for (i, row) in read_rows.iter().enumerate() {
+        assert_eq!(row.len(), 2);
+        assert_eq!(row[0], ParquetValue::Int64(i as i64));
+        assert_eq!(
+            row[1],
+            ParquetValue::String(Arc::from(format!("Person {}", i)))
+        );
+    }
+}
+
+#[test]
+fn test_nullable_field_evolution() {
+    // Test reading files where field nullability has changed
+
+    // First, write with non-nullable field
+    let schema_v1 = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false, // Non-nullable in v1
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows_v1: Vec<Vec<ParquetValue>> = (0..5)
+        .map(|i| {
+            vec![
+                ParquetValue::Int64(i),
+                ParquetValue::String(Arc::from(format!("Value {}", i))),
+            ]
+        })
+        .collect();
+
+    let mut buffer_v1 = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer_v1, schema_v1).unwrap();
+        writer.write_rows(rows_v1).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Now write with nullable field
+    let schema_v2 = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: true, // Nullable in v2
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows_v2: Vec<Vec<ParquetValue>> = vec![
+        vec![
+            ParquetValue::Int64(5),
+            ParquetValue::String(Arc::from("Value 5")),
+        ],
+        vec![ParquetValue::Int64(6), ParquetValue::Null],
+        vec![
+            ParquetValue::Int64(7),
+            ParquetValue::String(Arc::from("Value 7")),
+        ],
+    ];
+
+    let mut buffer_v2 = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer_v2, schema_v2).unwrap();
+        writer.write_rows(rows_v2).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read both files and verify
+    let bytes_v1 = Bytes::from(buffer_v1);
+    let reader_v1 = Reader::new(bytes_v1);
+
+    let read_v1: Vec<_> = reader_v1
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_v1.len(), 5);
+    for row in &read_v1 {
+        assert!(!matches!(row[1], ParquetValue::Null));
+    }
+
+    let bytes_v2 = Bytes::from(buffer_v2);
+    let reader_v2 = Reader::new(bytes_v2);
+
+    let read_v2: Vec<_> = reader_v2
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_v2.len(), 3);
+    assert!(matches!(read_v2[1][1], ParquetValue::Null));
+}
+
+#[test]
+fn test_type_promotion_compatibility() {
+    // Test reading files where numeric types have been promoted
+    // e.g., Int32 -> Int64, Float32 -> Float64
+
+    let schema_int32 = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![SchemaNode::Primitive {
+                name: "value".to_string(),
+                primitive_type: PrimitiveType::Int32,
+                nullable: false,
+                format: None,
+            }],
+        })
+        .build()
+        .unwrap();
+
+    let rows_int32: Vec<Vec<ParquetValue>> = vec![
+        vec![ParquetValue::Int32(42)],
+        vec![ParquetValue::Int32(i32::MAX)],
+        vec![ParquetValue::Int32(i32::MIN)],
+    ];
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema_int32).unwrap();
+        writer.write_rows(rows_int32.clone()).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read back and verify values are preserved
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    let read_rows: Vec<_> = reader
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_rows.len(), 3);
+    assert_eq!(read_rows[0][0], ParquetValue::Int32(42));
+    assert_eq!(read_rows[1][0], ParquetValue::Int32(i32::MAX));
+    assert_eq!(read_rows[2][0], ParquetValue::Int32(i32::MIN));
+}
+
+#[test]
+fn test_column_reordering() {
+    // Test reading files where column order has changed
+    let schema = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "a".to_string(),
+                    primitive_type: PrimitiveType::Int32,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "b".to_string(),
+                    primitive_type: PrimitiveType::String,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Primitive {
+                    name: "c".to_string(),
+                    primitive_type: PrimitiveType::Float64,
+                    nullable: false,
+                    format: None,
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    let rows: Vec<Vec<ParquetValue>> = vec![
+        vec![
+            ParquetValue::Int32(1),
+            ParquetValue::String(Arc::from("one")),
+            ParquetValue::Float64(OrderedFloat(1.1)),
+        ],
+        vec![
+            ParquetValue::Int32(2),
+            ParquetValue::String(Arc::from("two")),
+            ParquetValue::Float64(OrderedFloat(2.2)),
+        ],
+    ];
+
+    let mut buffer = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer, schema).unwrap();
+        writer.write_rows(rows).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read columns in different order
+    let bytes = Bytes::from(buffer);
+    let reader = Reader::new(bytes);
+
+    // Request columns in different order: c, a, b
+    let projection = vec!["c".to_string(), "a".to_string(), "b".to_string()];
+    let read_rows: Vec<_> = reader
+        .read_rows_with_projection(&projection)
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_rows.len(), 2);
+
+    // Verify values are returned (columns may be in schema order, not projection order)
+    // The projection filters which columns are returned, but doesn't necessarily reorder them
+    assert_eq!(read_rows[0].len(), 3); // All 3 requested columns
+
+    // Find the values regardless of order
+    let has_int32_1 = read_rows[0]
+        .iter()
+        .any(|v| matches!(v, ParquetValue::Int32(1)));
+    let has_float_1_1 = read_rows[0]
+        .iter()
+        .any(|v| matches!(v, ParquetValue::Float64(f) if f.0 == 1.1));
+    let has_string_one = read_rows[0]
+        .iter()
+        .any(|v| matches!(v, ParquetValue::String(s) if *s == Arc::from("one")));
+
+    assert!(has_int32_1, "Should have Int32(1) for column 'a'");
+    assert!(has_float_1_1, "Should have Float64(1.1) for column 'c'");
+    assert!(has_string_one, "Should have String('one') for column 'b'");
+}
+
+#[test]
+fn test_nested_schema_evolution() {
+    // Test evolution of nested structures
+
+    // V1: Simple struct
+    let schema_v1 = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Struct {
+                    name: "address".to_string(),
+                    nullable: false,
+                    fields: vec![
+                        SchemaNode::Primitive {
+                            name: "street".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: false,
+                            format: None,
+                        },
+                        SchemaNode::Primitive {
+                            name: "city".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: false,
+                            format: None,
+                        },
+                    ],
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // V2: Extended struct with additional field
+    let schema_v2 = SchemaBuilder::new()
+        .with_root(SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: vec![
+                SchemaNode::Primitive {
+                    name: "id".to_string(),
+                    primitive_type: PrimitiveType::Int64,
+                    nullable: false,
+                    format: None,
+                },
+                SchemaNode::Struct {
+                    name: "address".to_string(),
+                    nullable: false,
+                    fields: vec![
+                        SchemaNode::Primitive {
+                            name: "street".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: false,
+                            format: None,
+                        },
+                        SchemaNode::Primitive {
+                            name: "city".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: false,
+                            format: None,
+                        },
+                        SchemaNode::Primitive {
+                            name: "zip".to_string(),
+                            primitive_type: PrimitiveType::String,
+                            nullable: true, // New nullable field
+                            format: None,
+                        },
+                    ],
+                },
+            ],
+        })
+        .build()
+        .unwrap();
+
+    // Write v1 data
+    let rows_v1 = vec![vec![
+        ParquetValue::Int64(1),
+        ParquetValue::Record({
+            let mut map = IndexMap::new();
+            map.insert(
+                Arc::from("street"),
+                ParquetValue::String(Arc::from("123 Main St")),
+            );
+            map.insert(
+                Arc::from("city"),
+                ParquetValue::String(Arc::from("Springfield")),
+            );
+            map
+        }),
+    ]];
+
+    let mut buffer_v1 = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer_v1, schema_v1).unwrap();
+        writer.write_rows(rows_v1).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Write v2 data
+    let rows_v2 = vec![vec![
+        ParquetValue::Int64(2),
+        ParquetValue::Record({
+            let mut map = IndexMap::new();
+            map.insert(
+                Arc::from("street"),
+                ParquetValue::String(Arc::from("456 Oak Ave")),
+            );
+            map.insert(
+                Arc::from("city"),
+                ParquetValue::String(Arc::from("Shelbyville")),
+            );
+            map.insert(Arc::from("zip"), ParquetValue::String(Arc::from("12345")));
+            map
+        }),
+    ]];
+
+    let mut buffer_v2 = Vec::new();
+    {
+        let mut writer = Writer::new(&mut buffer_v2, schema_v2).unwrap();
+        writer.write_rows(rows_v2).unwrap();
+        writer.close().unwrap();
+    }
+
+    // Read both files and verify
+    let bytes_v1 = Bytes::from(buffer_v1);
+    let reader_v1 = Reader::new(bytes_v1);
+
+    let read_v1: Vec<_> = reader_v1
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_v1.len(), 1);
+    match &read_v1[0][1] {
+        ParquetValue::Record(map) => {
+            assert_eq!(map.len(), 2); // Only street and city
+            assert!(map.contains_key("street"));
+            assert!(map.contains_key("city"));
+            assert!(!map.contains_key("zip"));
+        }
+        _ => panic!("Expected record"),
+    }
+
+    let bytes_v2 = Bytes::from(buffer_v2);
+    let reader_v2 = Reader::new(bytes_v2);
+
+    let read_v2: Vec<_> = reader_v2
+        .read_rows()
+        .unwrap()
+        .collect::<Result<Vec<_>>>()
+        .unwrap();
+
+    assert_eq!(read_v2.len(), 1);
+    match &read_v2[0][1] {
+        ParquetValue::Record(map) => {
+            assert_eq!(map.len(), 3); // street, city, and zip
+            assert!(map.contains_key("zip"));
+        }
+        _ => panic!("Expected record"),
+    }
+}