parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,518 @@
|
|
1
|
+
use bytes::Bytes;
|
2
|
+
use parquet_core::*;
|
3
|
+
use std::sync::Arc;
|
4
|
+
|
5
|
+
mod test_helpers;
|
6
|
+
use test_helpers::*;
|
7
|
+
|
8
|
+
#[test]
fn test_date_types() {
    // Small builder closure to avoid repeating the primitive-field boilerplate.
    let date_field = |field_name: &str, primitive_type: PrimitiveType, nullable: bool| {
        SchemaNode::Primitive {
            name: field_name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    };

    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                date_field("date32", PrimitiveType::Date32, false),
                date_field("date64", PrimitiveType::Date64, false),
                date_field("date32_nullable", PrimitiveType::Date32, true),
            ],
        })
        .build()
        .unwrap();

    // Date32 counts days since the Unix epoch; Date64 counts milliseconds.
    let epoch_days = 0; // 1970-01-01
    let epoch_ms = 0; // 1970-01-01
    let recent_days = 19000; // ~2022
    let recent_ms = 19000 * 86400 * 1000; // Same day in milliseconds

    let rows = vec![
        vec![
            ParquetValue::Date32(epoch_days),
            ParquetValue::Date64(epoch_ms),
            ParquetValue::Date32(epoch_days),
        ],
        vec![
            ParquetValue::Date32(recent_days),
            ParquetValue::Date64(recent_ms),
            ParquetValue::Date32(recent_days),
        ],
        vec![
            ParquetValue::Date32(-365), // One year before epoch
            ParquetValue::Date64(-365 * 86400 * 1000), // Same in milliseconds
            ParquetValue::Null, // Exercises the nullable column
        ],
    ];

    // Use test helper for roundtrip
    test_roundtrip(rows, schema).unwrap();
}
|
64
|
+
|
65
|
+
/// Roundtrip millis/micros timestamps, with and without a schema-level
/// timezone, and verify both the tick values and the timezone handling
/// on read-back.
#[test]
fn test_timestamp_types() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "ts_millis".to_string(),
                    primitive_type: PrimitiveType::TimestampMillis(None),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_micros".to_string(),
                    primitive_type: PrimitiveType::TimestampMicros(None),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_millis_tz".to_string(),
                    primitive_type: PrimitiveType::TimestampMillis(Some(Arc::from(
                        "America/New_York",
                    ))),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_micros_tz".to_string(),
                    primitive_type: PrimitiveType::TimestampMicros(Some(Arc::from(
                        "America/New_York",
                    ))),
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Test various timestamp values
    let epoch_millis = 0;
    let epoch_micros = 0;
    let now_millis = 1_700_000_000_000; // Approximate timestamp for 2023
    let now_micros = now_millis * 1000;
    let tz = Some(Arc::from("America/New_York"));

    let rows = vec![
        vec![
            ParquetValue::TimestampMillis(epoch_millis, None),
            ParquetValue::TimestampMicros(epoch_micros, None),
            ParquetValue::TimestampMillis(epoch_millis, tz.clone()),
            ParquetValue::TimestampMicros(epoch_micros, tz.clone()),
        ],
        vec![
            ParquetValue::TimestampMillis(now_millis, None),
            ParquetValue::TimestampMicros(now_micros, None),
            ParquetValue::TimestampMillis(now_millis, tz.clone()),
            ParquetValue::TimestampMicros(now_micros, tz.clone()),
        ],
        vec![
            ParquetValue::TimestampMillis(-86400000, None), // One day before epoch
            ParquetValue::TimestampMicros(-86400000000, None),
            ParquetValue::TimestampMillis(-86400000, Some(Arc::from("UTC"))),
            ParquetValue::TimestampMicros(-86400000000, Some(Arc::from("UTC"))),
        ],
    ];

    // Write into an in-memory buffer.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Read back and verify
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), rows.len());

    // Verify the timestamps match, accounting for the fact that field
    // timezone overrides value timezone. The millis and micros arms were
    // previously two byte-identical copies of the same assertions; an
    // or-pattern keeps the logic in one place.
    for (row_idx, (expected_row, actual_row)) in rows.iter().zip(read_rows.iter()).enumerate() {
        assert_eq!(expected_row.len(), actual_row.len());
        for (col_idx, (expected_val, actual_val)) in
            expected_row.iter().zip(actual_row.iter()).enumerate()
        {
            match (expected_val, actual_val) {
                (
                    ParquetValue::TimestampMillis(e_ts, e_tz),
                    ParquetValue::TimestampMillis(a_ts, a_tz),
                )
                | (
                    ParquetValue::TimestampMicros(e_ts, e_tz),
                    ParquetValue::TimestampMicros(a_ts, a_tz),
                ) => {
                    assert_eq!(
                        e_ts, a_ts,
                        "Timestamp value mismatch at row {}, col {}",
                        row_idx, col_idx
                    );
                    if col_idx >= 2 {
                        // Columns 2 and 3 declare a timezone in the schema;
                        // the schema timezone wins and reads back as UTC.
                        assert_eq!(
                            a_tz.as_deref(),
                            Some("UTC"),
                            "Timezone mismatch at row {}, col {}",
                            row_idx,
                            col_idx
                        );
                    } else {
                        assert_eq!(
                            e_tz, a_tz,
                            "Timezone mismatch at row {}, col {}",
                            row_idx, col_idx
                        );
                    }
                }
                _ => panic!("Unexpected value types at row {}, col {}", row_idx, col_idx),
            }
        }
    }
}
|
216
|
+
|
217
|
+
#[test]
fn test_time_types() {
    // Small builder closure to avoid repeating the primitive-field boilerplate.
    let time_field = |field_name: &str, primitive_type: PrimitiveType, nullable: bool| {
        SchemaNode::Primitive {
            name: field_name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    };

    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                time_field("time_millis", PrimitiveType::TimeMillis, false),
                time_field("time_micros", PrimitiveType::TimeMicros, false),
                time_field("time_millis_nullable", PrimitiveType::TimeMillis, true),
            ],
        })
        .build()
        .unwrap();

    // Time-of-day values (milliseconds/microseconds since midnight).
    let start_of_day = 0;
    let midday_ms = 12 * 60 * 60 * 1000; // 12:00:00
    let midday_us = midday_ms as i64 * 1000;
    let last_instant_ms = 23 * 60 * 60 * 1000 + 59 * 60 * 1000 + 59 * 1000 + 999; // 23:59:59.999
    let last_instant_us = last_instant_ms as i64 * 1000 + 999; // 23:59:59.999999

    let rows = vec![
        vec![
            ParquetValue::TimeMillis(start_of_day),
            ParquetValue::TimeMicros(start_of_day as i64),
            ParquetValue::TimeMillis(start_of_day),
        ],
        vec![
            ParquetValue::TimeMillis(midday_ms),
            ParquetValue::TimeMicros(midday_us),
            ParquetValue::TimeMillis(midday_ms),
        ],
        vec![
            ParquetValue::TimeMillis(last_instant_ms),
            ParquetValue::TimeMicros(last_instant_us),
            ParquetValue::Null, // Exercises the nullable column
        ],
    ];

    // Use test helper for roundtrip
    test_roundtrip(rows, schema).unwrap();
}
|
275
|
+
|
276
|
+
#[test]
fn test_temporal_types_in_collections() {
    // Test temporal types within lists and maps.
    let timestamp_item = SchemaNode::Primitive {
        name: "item".to_string(),
        primitive_type: PrimitiveType::TimestampMillis(None),
        nullable: false,
        format: None,
    };
    let map_key = SchemaNode::Primitive {
        name: "key".to_string(),
        primitive_type: PrimitiveType::String,
        nullable: false,
        format: None,
    };
    let map_value = SchemaNode::Primitive {
        name: "value".to_string(),
        primitive_type: PrimitiveType::Date32,
        nullable: true,
        format: None,
    };

    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::List {
                    name: "timestamp_list".to_string(),
                    nullable: false,
                    item: Box::new(timestamp_item),
                },
                SchemaNode::Map {
                    name: "date_map".to_string(),
                    nullable: false,
                    key: Box::new(map_key),
                    value: Box::new(map_value),
                },
            ],
        })
        .build()
        .unwrap();

    // Row 1: populated list and map (including a null map value).
    let timestamps = ParquetValue::List(vec![
        ParquetValue::TimestampMillis(1000000000000, None),
        ParquetValue::TimestampMillis(1100000000000, None),
        ParquetValue::TimestampMillis(1200000000000, None),
    ]);
    let dates = ParquetValue::Map(vec![
        (
            ParquetValue::String(Arc::from("start_date")),
            ParquetValue::Date32(18000),
        ),
        (
            ParquetValue::String(Arc::from("end_date")),
            ParquetValue::Date32(18365),
        ),
        (
            ParquetValue::String(Arc::from("milestone")),
            ParquetValue::Null,
        ),
    ]);

    // Row 2: both collections empty.
    let rows = vec![
        vec![timestamps, dates],
        vec![ParquetValue::List(vec![]), ParquetValue::Map(vec![])],
    ];

    // Use test helper for roundtrip
    test_roundtrip(rows, schema).unwrap();
}
|
343
|
+
|
344
|
+
/// Comprehensive edge cases (min / max / zero / common value / null) for
/// all temporal types: four timestamp granularities, two date widths, and
/// two time-of-day widths. Every column is nullable.
#[test]
fn test_temporal_edge_cases() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                // All timestamp types
                SchemaNode::Primitive {
                    name: "ts_sec".to_string(),
                    primitive_type: PrimitiveType::TimestampSecond(None),
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_millis".to_string(),
                    primitive_type: PrimitiveType::TimestampMillis(None),
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_micros".to_string(),
                    primitive_type: PrimitiveType::TimestampMicros(None),
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "ts_nanos".to_string(),
                    primitive_type: PrimitiveType::TimestampNanos(None),
                    nullable: true,
                    format: None,
                },
                // Date types
                SchemaNode::Primitive {
                    name: "date32".to_string(),
                    primitive_type: PrimitiveType::Date32,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "date64".to_string(),
                    primitive_type: PrimitiveType::Date64,
                    nullable: true,
                    format: None,
                },
                // Time types
                SchemaNode::Primitive {
                    name: "time_millis".to_string(),
                    primitive_type: PrimitiveType::TimeMillis,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "time_micros".to_string(),
                    primitive_type: PrimitiveType::TimeMicros,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let rows = vec![
        // Minimum values
        vec![
            ParquetValue::TimestampSecond(i64::MIN, None),
            ParquetValue::TimestampMillis(i64::MIN, None),
            ParquetValue::TimestampMicros(i64::MIN, None),
            ParquetValue::TimestampNanos(i64::MIN, None),
            ParquetValue::Date32(i32::MIN),
            ParquetValue::Date64(i64::MIN),
            ParquetValue::TimeMillis(0), // Time can't be negative
            ParquetValue::TimeMicros(0),
        ],
        // Maximum values
        vec![
            ParquetValue::TimestampSecond(i64::MAX, None),
            ParquetValue::TimestampMillis(i64::MAX, None),
            ParquetValue::TimestampMicros(i64::MAX, None),
            ParquetValue::TimestampNanos(i64::MAX, None),
            ParquetValue::Date32(i32::MAX),
            ParquetValue::Date64(i64::MAX),
            ParquetValue::TimeMillis(86399999), // 23:59:59.999
            ParquetValue::TimeMicros(86399999999), // 23:59:59.999999
        ],
        // Zero values (Unix epoch / midnight)
        vec![
            ParquetValue::TimestampSecond(0, None),
            ParquetValue::TimestampMillis(0, None),
            ParquetValue::TimestampMicros(0, None),
            ParquetValue::TimestampNanos(0, None),
            ParquetValue::Date32(0),
            ParquetValue::Date64(0),
            ParquetValue::TimeMillis(0),
            ParquetValue::TimeMicros(0),
        ],
        // Common timestamp (2025-01-01 00:00:00 UTC)
        vec![
            ParquetValue::TimestampSecond(1735689600, None),
            ParquetValue::TimestampMillis(1735689600000, None),
            ParquetValue::TimestampMicros(1735689600000000, None),
            ParquetValue::TimestampNanos(1735689600000000000, None),
            ParquetValue::Date32(19723), // Days since Unix epoch
            ParquetValue::Date64(1735689600000), // Milliseconds since Unix epoch
            ParquetValue::TimeMillis(0), // Midnight
            ParquetValue::TimeMicros(0),
        ],
        // All nulls
        vec![
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
            ParquetValue::Null,
        ],
    ];

    // Write into an in-memory buffer.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Read back and verify
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), rows.len());

    // Verify values match exactly. The four timestamp variants previously
    // had four identical match arms; one or-pattern covers them all.
    // Timezones are deliberately ignored (`_`) — only the tick value is
    // compared for timestamps.
    for (i, (expected_row, actual_row)) in rows.iter().zip(read_rows.iter()).enumerate() {
        for (j, (expected, actual)) in expected_row.iter().zip(actual_row.iter()).enumerate() {
            match (expected, actual) {
                (
                    ParquetValue::TimestampSecond(e_val, _),
                    ParquetValue::TimestampSecond(a_val, _),
                )
                | (
                    ParquetValue::TimestampMillis(e_val, _),
                    ParquetValue::TimestampMillis(a_val, _),
                )
                | (
                    ParquetValue::TimestampMicros(e_val, _),
                    ParquetValue::TimestampMicros(a_val, _),
                )
                | (
                    ParquetValue::TimestampNanos(e_val, _),
                    ParquetValue::TimestampNanos(a_val, _),
                ) => {
                    assert_eq!(e_val, a_val, "Row {} col {}: timestamp values differ", i, j);
                }
                (ParquetValue::Null, ParquetValue::Null) => {} // Both null is ok
                _ => assert_eq!(expected, actual, "Row {} col {}: values differ", i, j),
            }
        }
    }
}
|
@@ -0,0 +1,132 @@
|
|
1
|
+
use bytes::Bytes;
|
2
|
+
use parquet::basic::Compression;
|
3
|
+
use parquet::file::properties::WriterProperties;
|
4
|
+
use parquet_core::*;
|
5
|
+
use std::sync::Arc;
|
6
|
+
|
7
|
+
/// Create a test schema with common field types
|
8
|
+
pub fn create_test_schema() -> Schema {
|
9
|
+
SchemaBuilder::new()
|
10
|
+
.with_root(SchemaNode::Struct {
|
11
|
+
name: "root".to_string(),
|
12
|
+
nullable: false,
|
13
|
+
fields: vec![
|
14
|
+
SchemaNode::Primitive {
|
15
|
+
name: "id".to_string(),
|
16
|
+
primitive_type: PrimitiveType::Int32,
|
17
|
+
nullable: false,
|
18
|
+
format: None,
|
19
|
+
},
|
20
|
+
SchemaNode::Primitive {
|
21
|
+
name: "name".to_string(),
|
22
|
+
primitive_type: PrimitiveType::String,
|
23
|
+
nullable: true,
|
24
|
+
format: None,
|
25
|
+
},
|
26
|
+
SchemaNode::Primitive {
|
27
|
+
name: "value".to_string(),
|
28
|
+
primitive_type: PrimitiveType::Float64,
|
29
|
+
nullable: true,
|
30
|
+
format: None,
|
31
|
+
},
|
32
|
+
SchemaNode::Primitive {
|
33
|
+
name: "active".to_string(),
|
34
|
+
primitive_type: PrimitiveType::Boolean,
|
35
|
+
nullable: false,
|
36
|
+
format: None,
|
37
|
+
},
|
38
|
+
],
|
39
|
+
})
|
40
|
+
.build()
|
41
|
+
.unwrap()
|
42
|
+
}
|
43
|
+
|
44
|
+
/// Generate test rows with sequential data
|
45
|
+
pub fn generate_test_rows(count: usize) -> Vec<Vec<ParquetValue>> {
|
46
|
+
(0..count)
|
47
|
+
.map(|i| {
|
48
|
+
vec![
|
49
|
+
ParquetValue::Int32(i as i32),
|
50
|
+
ParquetValue::String(Arc::from(format!("name_{}", i))),
|
51
|
+
ParquetValue::Float64(ordered_float::OrderedFloat(i as f64 * 1.5)),
|
52
|
+
ParquetValue::Boolean(i % 2 == 0),
|
53
|
+
]
|
54
|
+
})
|
55
|
+
.collect()
|
56
|
+
}
|
57
|
+
|
58
|
+
/// Perform a roundtrip test and verify data integrity
|
59
|
+
pub fn test_roundtrip(
|
60
|
+
rows: Vec<Vec<ParquetValue>>,
|
61
|
+
schema: Schema,
|
62
|
+
) -> std::result::Result<(), Box<dyn std::error::Error>> {
|
63
|
+
test_roundtrip_with_options(rows, schema, Compression::UNCOMPRESSED, None)
|
64
|
+
}
|
65
|
+
|
66
|
+
/// Perform a roundtrip test with custom writer options
|
67
|
+
pub fn test_roundtrip_with_options(
|
68
|
+
rows: Vec<Vec<ParquetValue>>,
|
69
|
+
schema: Schema,
|
70
|
+
compression: Compression,
|
71
|
+
batch_size: Option<usize>,
|
72
|
+
) -> std::result::Result<(), Box<dyn std::error::Error>> {
|
73
|
+
use tempfile::NamedTempFile;
|
74
|
+
|
75
|
+
let temp_file = NamedTempFile::new()?;
|
76
|
+
let file_path = temp_file.path().to_str().unwrap();
|
77
|
+
|
78
|
+
// Write
|
79
|
+
let mut buffer = Vec::new();
|
80
|
+
{
|
81
|
+
let mut builder = WriterBuilder::new();
|
82
|
+
|
83
|
+
if let Some(size) = batch_size {
|
84
|
+
builder = builder.with_batch_size(size);
|
85
|
+
}
|
86
|
+
|
87
|
+
let props = WriterProperties::builder()
|
88
|
+
.set_compression(compression)
|
89
|
+
.build();
|
90
|
+
|
91
|
+
let mut writer = if batch_size.is_some() {
|
92
|
+
builder.build(&mut buffer, schema.clone())?
|
93
|
+
} else {
|
94
|
+
Writer::new_with_properties(&mut buffer, schema.clone(), props)?
|
95
|
+
};
|
96
|
+
|
97
|
+
writer.write_rows(rows.clone())?;
|
98
|
+
writer.close()?;
|
99
|
+
}
|
100
|
+
|
101
|
+
// Write to file for persistence
|
102
|
+
std::fs::write(file_path, &buffer)?;
|
103
|
+
|
104
|
+
// Read back
|
105
|
+
let bytes = Bytes::from(buffer);
|
106
|
+
let reader = Reader::new(bytes);
|
107
|
+
|
108
|
+
let read_rows: Vec<Vec<ParquetValue>> = reader.read_rows()?.collect::<Result<Vec<_>>>()?;
|
109
|
+
|
110
|
+
// Verify
|
111
|
+
assert_eq!(rows.len(), read_rows.len(), "Row count mismatch");
|
112
|
+
|
113
|
+
for (i, (original, read)) in rows.iter().zip(read_rows.iter()).enumerate() {
|
114
|
+
assert_eq!(original, read, "Row {} mismatch", i);
|
115
|
+
}
|
116
|
+
|
117
|
+
Ok(())
|
118
|
+
}
|
119
|
+
|
120
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the helpers in this module roundtrip cleanly together.
    #[test]
    fn test_helpers_work() {
        let row_count = 10;
        let schema = create_test_schema();
        let rows = generate_test_rows(row_count);
        assert_eq!(rows.len(), row_count);
        test_roundtrip(rows, schema).unwrap();
    }
}
|