parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,777 @@
|
|
1
|
+
use bytes::Bytes;
|
2
|
+
use indexmap::IndexMap;
|
3
|
+
use ordered_float::OrderedFloat;
|
4
|
+
use parquet::basic::Compression;
|
5
|
+
use parquet::file::properties::WriterProperties;
|
6
|
+
use parquet_core::*;
|
7
|
+
use std::sync::Arc;
|
8
|
+
|
9
|
+
/// Round-trips one simulated day of application events through `Writer` and
/// `Reader`.
///
/// Each row holds a millisecond timestamp, an event id, an event type, a
/// nullable user id and a string-to-string `properties` map whose values may be
/// null. After reading the file back the test checks the total row count and
/// spot-checks the event type of the first and fifth rows.
#[test]
fn test_event_log_pattern() {
    // Common pattern: event logs with timestamps, IDs, and JSON-like data
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "timestamp".to_string(),
                    primitive_type: PrimitiveType::TimestampMillis(None),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "event_id".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "event_type".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "user_id".to_string(),
                    primitive_type: PrimitiveType::Int64,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Map {
                    name: "properties".to_string(),
                    nullable: false,
                    key: Box::new(SchemaNode::Primitive {
                        name: "key".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: false,
                        format: None,
                    }),
                    value: Box::new(SchemaNode::Primitive {
                        name: "value".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: true,
                        format: None,
                    }),
                },
            ],
        })
        .build()
        .unwrap();

    // Simulate a day's worth of events: 24 hours * 60 minutes * 5 events.
    let event_types = ["page_view", "click", "purchase", "signup", "logout"];
    let base_timestamp = 1735689600000i64; // 2025-01-01 00:00:00 UTC, in milliseconds
    let mut rows = Vec::with_capacity(24 * 60 * event_types.len());

    for hour in 0..24 {
        for minute in 0..60 {
            for event_idx in 0..5 {
                // Events within the same minute are spaced 100 ms apart.
                let timestamp =
                    base_timestamp + (hour * 3600 + minute * 60) * 1000 + event_idx * 100;
                let event_type = event_types[(event_idx as usize) % event_types.len()];
                let event_id = format!("evt_{:016x}", timestamp + event_idx);

                // Logouts, and every event in each tenth minute, carry no user id
                // so that the nullable Int64 column gets real nulls.
                let user_id = if event_type == "logout" || minute % 10 == 0 {
                    ParquetValue::Null
                } else {
                    ParquetValue::Int64(1000000 + (hour * 1000 + minute))
                };

                let mut properties = vec![
                    (
                        ParquetValue::String(Arc::from("page")),
                        ParquetValue::String(Arc::from(format!("/page_{}", minute % 10))),
                    ),
                    (
                        ParquetValue::String(Arc::from("referrer")),
                        // Null map values exercise the nullable value column.
                        if minute % 3 == 0 {
                            ParquetValue::Null
                        } else {
                            ParquetValue::String(Arc::from("https://search.example.com"))
                        },
                    ),
                ];

                // Purchases get an extra entry, so map sizes vary between rows.
                if event_type == "purchase" {
                    properties.push((
                        ParquetValue::String(Arc::from("amount")),
                        ParquetValue::String(Arc::from(format!(
                            "{:.2}",
                            10.0 + (minute as f64) * 1.5
                        ))),
                    ));
                }

                rows.push(vec![
                    ParquetValue::TimestampMillis(timestamp, None),
                    ParquetValue::String(Arc::from(event_id)),
                    ParquetValue::String(Arc::from(event_type)),
                    user_id,
                    ParquetValue::Map(properties),
                ]);
            }
        }
    }

    // Write with appropriate settings for time-series data
    let mut buffer = Vec::new();
    {
        let props = WriterProperties::builder()
            .set_compression(Compression::SNAPPY)
            .set_dictionary_enabled(true) // Good for repeated event types
            // The limit is larger than the 7,200 rows generated above; at this
            // event rate 100,000 rows would be roughly two weeks of data.
            .set_max_row_group_size(100000)
            .build();

        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
        // `rows` is not needed after the write, so hand it over without cloning.
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Verify data integrity
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), 24 * 60 * 5); // 24 hours * 60 minutes * 5 events

    // Spot check some values: column 2 is `event_type`, and the first five rows
    // cycle through `event_types` in order.
    assert_eq!(
        read_rows[0][2],
        ParquetValue::String(Arc::from("page_view"))
    );
    assert_eq!(read_rows[4][2], ParquetValue::String(Arc::from("logout")));
}
|
149
|
+
|
150
|
+
/// Round-trips a sparse sales fact table (date, product, store, segment,
/// units, revenue, cost, discount) through `Writer` and `Reader`.
///
/// Monetary columns are `Decimal128(18, 2)` and are stored as a count of cents.
/// After reading the file back, the test checks the row count and that the
/// aggregate profit margin derived from the decimal columns lies between 35%
/// and 45%.
#[test]
fn test_analytics_fact_table() {
    // Common pattern: fact table with dimensions and metrics
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "date".to_string(),
                    primitive_type: PrimitiveType::Date32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "product_id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "store_id".to_string(),
                    primitive_type: PrimitiveType::Int16,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "customer_segment".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "units_sold".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "revenue".to_string(),
                    primitive_type: PrimitiveType::Decimal128(18, 2),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "cost".to_string(),
                    primitive_type: PrimitiveType::Decimal128(18, 2),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "discount_pct".to_string(),
                    primitive_type: PrimitiveType::Float32,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Generate realistic fact table data
    let segments = ["Premium", "Regular", "Budget", "Corporate"];
    let mut rows = Vec::new();

    // Scale-2 decimals hold a count of cents. Round at cent precision rather
    // than truncating to whole currency units first, so the fractional digits
    // of the decimal columns are actually exercised.
    let to_cents = |amount: f64| (amount * 100.0).round() as i128;

    // Simulate 30 days of data
    for day in 0..30 {
        // 100 products
        for product_id in 1..=100 {
            // 10 stores
            for store_id in 1..=10 {
                // Skip some combinations (sparse data)
                if (day + product_id + store_id) % 7 == 0 {
                    continue;
                }

                let units = (product_id * store_id + day) % 50 + 1;
                let unit_price = 10.0 + f64::from(product_id) * 0.5;
                let discount: Option<f64> = if day % 7 == 0 {
                    // Weekly promotion: every seventh day
                    Some(0.15)
                } else if product_id % 10 == 0 {
                    // Special product discount
                    Some(0.10)
                } else {
                    None
                };

                let gross = f64::from(units) * unit_price;
                let revenue_cents = to_cents(gross * (1.0 - discount.unwrap_or(0.0)));
                // Cost is 60% of the undiscounted price, which is where the
                // margin of roughly 40% checked below comes from.
                let cost_cents = to_cents(gross * 0.6);

                // Segment is derived from volume; low-volume sales outside the
                // first three stores have no segment (null).
                let segment = if units > 30 {
                    Some(segments[0])
                } else if units > 20 {
                    Some(segments[1])
                } else if units > 10 {
                    Some(segments[2])
                } else if store_id <= 3 {
                    Some(segments[3])
                } else {
                    None
                };

                rows.push(vec![
                    ParquetValue::Date32(19000 + day), // Days since epoch
                    ParquetValue::Int32(product_id),
                    ParquetValue::Int16(store_id as i16), // store_id is 1..=10, always fits
                    segment
                        .map(|s| ParquetValue::String(Arc::from(s)))
                        .unwrap_or(ParquetValue::Null),
                    ParquetValue::Int32(units),
                    ParquetValue::Decimal128(revenue_cents, 2),
                    ParquetValue::Decimal128(cost_cents, 2),
                    discount
                        .map(|d| ParquetValue::Float32(OrderedFloat(d as f32)))
                        .unwrap_or(ParquetValue::Null),
                ]);
            }
        }
    }

    // Remember the count so the rows can be moved into the writer, not cloned.
    let expected_rows = rows.len();

    // Write with settings optimized for analytics
    let mut buffer = Vec::new();
    {
        let props = WriterProperties::builder()
            .set_compression(Compression::ZSTD(Default::default()))
            .set_dictionary_enabled(true)
            .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Chunk)
            .build();

        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Read and verify
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), expected_rows);

    // Verify data patterns: columns 5 and 6 are `revenue` and `cost`, and both
    // must come back as scale-2 decimals.
    let mut total_revenue = 0i128;
    let mut total_cost = 0i128;

    for row in &read_rows {
        match &row[5] {
            ParquetValue::Decimal128(rev, 2) => total_revenue += rev,
            _ => panic!("Expected decimal revenue"),
        }
        match &row[6] {
            ParquetValue::Decimal128(cost, 2) => total_cost += cost,
            _ => panic!("Expected decimal cost"),
        }
    }

    // Profit margin should be around 40% (slightly lower because of discounts)
    let profit_margin = (total_revenue - total_cost) as f64 / total_revenue as f64;
    assert!(
        profit_margin > 0.35 && profit_margin < 0.45,
        "Unexpected profit margin: {}",
        profit_margin
    );
}
|
322
|
+
|
323
|
+
#[test]
|
324
|
+
fn test_iot_sensor_data() {
|
325
|
+
// Common pattern: IoT sensor data with nested readings
|
326
|
+
let schema = SchemaBuilder::new()
|
327
|
+
.with_root(SchemaNode::Struct {
|
328
|
+
name: "root".to_string(),
|
329
|
+
nullable: false,
|
330
|
+
fields: vec![
|
331
|
+
SchemaNode::Primitive {
|
332
|
+
name: "device_id".to_string(),
|
333
|
+
primitive_type: PrimitiveType::String,
|
334
|
+
nullable: false,
|
335
|
+
format: None,
|
336
|
+
},
|
337
|
+
SchemaNode::Primitive {
|
338
|
+
name: "timestamp".to_string(),
|
339
|
+
primitive_type: PrimitiveType::TimestampMicros(None),
|
340
|
+
nullable: false,
|
341
|
+
format: None,
|
342
|
+
},
|
343
|
+
SchemaNode::Struct {
|
344
|
+
name: "location".to_string(),
|
345
|
+
nullable: true,
|
346
|
+
fields: vec![
|
347
|
+
SchemaNode::Primitive {
|
348
|
+
name: "latitude".to_string(),
|
349
|
+
primitive_type: PrimitiveType::Float64,
|
350
|
+
nullable: false,
|
351
|
+
format: None,
|
352
|
+
},
|
353
|
+
SchemaNode::Primitive {
|
354
|
+
name: "longitude".to_string(),
|
355
|
+
primitive_type: PrimitiveType::Float64,
|
356
|
+
nullable: false,
|
357
|
+
format: None,
|
358
|
+
},
|
359
|
+
SchemaNode::Primitive {
|
360
|
+
name: "altitude".to_string(),
|
361
|
+
primitive_type: PrimitiveType::Float32,
|
362
|
+
nullable: true,
|
363
|
+
format: None,
|
364
|
+
},
|
365
|
+
],
|
366
|
+
},
|
367
|
+
SchemaNode::List {
|
368
|
+
name: "readings".to_string(),
|
369
|
+
nullable: false,
|
370
|
+
item: Box::new(SchemaNode::Struct {
|
371
|
+
name: "reading".to_string(),
|
372
|
+
nullable: false,
|
373
|
+
fields: vec![
|
374
|
+
SchemaNode::Primitive {
|
375
|
+
name: "sensor_type".to_string(),
|
376
|
+
primitive_type: PrimitiveType::String,
|
377
|
+
nullable: false,
|
378
|
+
format: None,
|
379
|
+
},
|
380
|
+
SchemaNode::Primitive {
|
381
|
+
name: "value".to_string(),
|
382
|
+
primitive_type: PrimitiveType::Float64,
|
383
|
+
nullable: false,
|
384
|
+
format: None,
|
385
|
+
},
|
386
|
+
SchemaNode::Primitive {
|
387
|
+
name: "unit".to_string(),
|
388
|
+
primitive_type: PrimitiveType::String,
|
389
|
+
nullable: false,
|
390
|
+
format: None,
|
391
|
+
},
|
392
|
+
SchemaNode::Primitive {
|
393
|
+
name: "quality".to_string(),
|
394
|
+
primitive_type: PrimitiveType::Int8,
|
395
|
+
nullable: true,
|
396
|
+
format: None,
|
397
|
+
},
|
398
|
+
],
|
399
|
+
}),
|
400
|
+
},
|
401
|
+
SchemaNode::Primitive {
|
402
|
+
name: "battery_level".to_string(),
|
403
|
+
primitive_type: PrimitiveType::Float32,
|
404
|
+
nullable: true,
|
405
|
+
format: None,
|
406
|
+
},
|
407
|
+
],
|
408
|
+
})
|
409
|
+
.build()
|
410
|
+
.unwrap();
|
411
|
+
|
412
|
+
// Generate sensor data
|
413
|
+
let mut rows = Vec::new();
|
414
|
+
let base_timestamp = 1735689600000000i64; // microseconds
|
415
|
+
|
416
|
+
// 10 devices
|
417
|
+
for device_idx in 0..10 {
|
418
|
+
let device_id: Arc<str> = Arc::from(format!("sensor_{:04}", device_idx));
|
419
|
+
let base_lat = 37.7749 + (device_idx as f64) * 0.01;
|
420
|
+
let base_lon = -122.4194 + (device_idx as f64) * 0.01;
|
421
|
+
|
422
|
+
// 1 hour of data, reading every minute
|
423
|
+
for minute in 0..60 {
|
424
|
+
let timestamp = base_timestamp + (minute as i64 * 60 * 1000000);
|
425
|
+
|
426
|
+
// Location (some devices lose GPS occasionally)
|
427
|
+
let location = if minute % 15 == 0 && device_idx % 3 == 0 {
|
428
|
+
// When struct is null, represent it as a record with all null fields
|
429
|
+
ParquetValue::Null
|
430
|
+
} else {
|
431
|
+
ParquetValue::Record({
|
432
|
+
let mut map = IndexMap::new();
|
433
|
+
map.insert(
|
434
|
+
Arc::from("latitude"),
|
435
|
+
ParquetValue::Float64(OrderedFloat(base_lat + (minute as f64) * 0.0001)),
|
436
|
+
);
|
437
|
+
map.insert(
|
438
|
+
Arc::from("longitude"),
|
439
|
+
ParquetValue::Float64(OrderedFloat(base_lon + (minute as f64) * 0.0001)),
|
440
|
+
);
|
441
|
+
map.insert(
|
442
|
+
Arc::from("altitude"),
|
443
|
+
if device_idx < 5 {
|
444
|
+
ParquetValue::Float32(OrderedFloat(100.0 + (minute as f32) * 0.1))
|
445
|
+
} else {
|
446
|
+
ParquetValue::Null
|
447
|
+
},
|
448
|
+
);
|
449
|
+
map
|
450
|
+
})
|
451
|
+
};
|
452
|
+
|
453
|
+
// Sensor readings
|
454
|
+
let mut readings = vec![];
|
455
|
+
|
456
|
+
// Temperature
|
457
|
+
readings.push(ParquetValue::Record({
|
458
|
+
let mut map = IndexMap::new();
|
459
|
+
map.insert(
|
460
|
+
Arc::from("sensor_type"),
|
461
|
+
ParquetValue::String(Arc::from("temperature")),
|
462
|
+
);
|
463
|
+
map.insert(
|
464
|
+
Arc::from("value"),
|
465
|
+
ParquetValue::Float64(OrderedFloat(
|
466
|
+
20.0 + (minute as f64) * 0.1 + device_idx as f64,
|
467
|
+
)),
|
468
|
+
);
|
469
|
+
map.insert(
|
470
|
+
Arc::from("unit"),
|
471
|
+
ParquetValue::String(Arc::from("celsius")),
|
472
|
+
);
|
473
|
+
map.insert(Arc::from("quality"), ParquetValue::Int8(100));
|
474
|
+
map
|
475
|
+
}));
|
476
|
+
|
477
|
+
// Humidity
|
478
|
+
readings.push(ParquetValue::Record({
|
479
|
+
let mut map = IndexMap::new();
|
480
|
+
map.insert(
|
481
|
+
Arc::from("sensor_type"),
|
482
|
+
ParquetValue::String(Arc::from("humidity")),
|
483
|
+
);
|
484
|
+
map.insert(
|
485
|
+
Arc::from("value"),
|
486
|
+
ParquetValue::Float64(OrderedFloat(45.0 + (minute as f64) * 0.2)),
|
487
|
+
);
|
488
|
+
map.insert(
|
489
|
+
Arc::from("unit"),
|
490
|
+
ParquetValue::String(Arc::from("percent")),
|
491
|
+
);
|
492
|
+
map.insert(
|
493
|
+
Arc::from("quality"),
|
494
|
+
if minute % 10 == 0 {
|
495
|
+
ParquetValue::Null // Missing quality score
|
496
|
+
} else {
|
497
|
+
ParquetValue::Int8(95)
|
498
|
+
},
|
499
|
+
);
|
500
|
+
map
|
501
|
+
}));
|
502
|
+
|
503
|
+
// Some devices have additional sensors
|
504
|
+
if device_idx % 2 == 0 {
|
505
|
+
readings.push(ParquetValue::Record({
|
506
|
+
let mut map = IndexMap::new();
|
507
|
+
map.insert(
|
508
|
+
Arc::from("sensor_type"),
|
509
|
+
ParquetValue::String(Arc::from("pressure")),
|
510
|
+
);
|
511
|
+
map.insert(
|
512
|
+
Arc::from("value"),
|
513
|
+
ParquetValue::Float64(OrderedFloat(1013.25 + (minute as f64) * 0.01)),
|
514
|
+
);
|
515
|
+
map.insert(Arc::from("unit"), ParquetValue::String(Arc::from("hPa")));
|
516
|
+
map.insert(Arc::from("quality"), ParquetValue::Int8(90));
|
517
|
+
map
|
518
|
+
}));
|
519
|
+
}
|
520
|
+
|
521
|
+
// Battery level decreases over time
|
522
|
+
let battery = if minute == 0 {
|
523
|
+
ParquetValue::Float32(OrderedFloat(100.0))
|
524
|
+
} else {
|
525
|
+
ParquetValue::Float32(OrderedFloat(100.0 - (minute as f32) * 0.1))
|
526
|
+
};
|
527
|
+
|
528
|
+
rows.push(vec![
|
529
|
+
ParquetValue::String(device_id.clone()),
|
530
|
+
ParquetValue::TimestampMicros(timestamp, None),
|
531
|
+
location,
|
532
|
+
ParquetValue::List(readings),
|
533
|
+
battery,
|
534
|
+
]);
|
535
|
+
}
|
536
|
+
}
|
537
|
+
|
538
|
+
// Write with settings for time-series IoT data
|
539
|
+
let mut buffer = Vec::new();
|
540
|
+
{
|
541
|
+
let props = WriterProperties::builder()
|
542
|
+
.set_compression(Compression::SNAPPY)
|
543
|
+
.set_dictionary_enabled(true)
|
544
|
+
.build();
|
545
|
+
|
546
|
+
let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
|
547
|
+
writer.write_rows(rows.clone()).unwrap();
|
548
|
+
writer.close().unwrap();
|
549
|
+
}
|
550
|
+
|
551
|
+
// Verify
|
552
|
+
let bytes = Bytes::from(buffer);
|
553
|
+
let reader = Reader::new(bytes);
|
554
|
+
|
555
|
+
let read_rows: Vec<_> = reader
|
556
|
+
.read_rows()
|
557
|
+
.unwrap()
|
558
|
+
.collect::<Result<Vec<_>>>()
|
559
|
+
.unwrap();
|
560
|
+
|
561
|
+
assert_eq!(read_rows.len(), 10 * 60); // 10 devices * 60 minutes
|
562
|
+
|
563
|
+
// Check first and last readings
|
564
|
+
match &read_rows[0][0] {
|
565
|
+
ParquetValue::String(id) => assert_eq!(id.as_ref(), "sensor_0000"),
|
566
|
+
_ => panic!("Expected device ID"),
|
567
|
+
}
|
568
|
+
|
569
|
+
match &read_rows[0][3] {
|
570
|
+
ParquetValue::List(readings) => assert!(readings.len() >= 2),
|
571
|
+
_ => panic!("Expected readings list"),
|
572
|
+
}
|
573
|
+
}
|
574
|
+
|
575
|
+
/// Round-trips 1,000 change-data-capture events through `Writer` and `Reader`.
///
/// Each row describes one operation (`INSERT`, `UPDATE` or `DELETE`) with
/// nullable `before` and `after` images stored as string-to-string maps:
/// inserts have no `before`, deletes have no `after`, updates have both. After
/// reading the file back the test checks the row count and the number of rows
/// per operation.
#[test]
fn test_change_data_capture() {
    // Common pattern: CDC (Change Data Capture) events
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "operation".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "timestamp".to_string(),
                    primitive_type: PrimitiveType::TimestampMillis(None),
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "database".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "table".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "primary_key".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Map {
                    name: "before".to_string(),
                    nullable: true,
                    key: Box::new(SchemaNode::Primitive {
                        name: "column".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: false,
                        format: None,
                    }),
                    value: Box::new(SchemaNode::Primitive {
                        name: "value".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: true,
                        format: None,
                    }),
                },
                SchemaNode::Map {
                    name: "after".to_string(),
                    nullable: true,
                    key: Box::new(SchemaNode::Primitive {
                        name: "column".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: false,
                        format: None,
                    }),
                    value: Box::new(SchemaNode::Primitive {
                        name: "value".to_string(),
                        primitive_type: PrimitiveType::String,
                        nullable: true,
                        format: None,
                    }),
                },
            ],
        })
        .build()
        .unwrap();

    // Generate CDC events
    let operations = ["INSERT", "UPDATE", "DELETE"];
    let tables = ["users", "orders", "products"];
    let base_timestamp = 1735689600000i64;
    let mut rows = Vec::with_capacity(1000);

    // One (column name, column value) entry of a row image.
    let entry = |column: &str, value: String| {
        (
            ParquetValue::String(Arc::from(column)),
            ParquetValue::String(Arc::from(value)),
        )
    };

    for i in 0..1000 {
        // Operations rotate every event; the table changes every ten events.
        let operation = operations[i % operations.len()];
        let table = tables[(i / 10) % tables.len()];
        let timestamp = base_timestamp + (i * 1000) as i64; // one event per second
        let primary_key = format!("{}_id:{}", table, i);

        let (before, after) = match operation {
            "INSERT" => (
                ParquetValue::Null,
                ParquetValue::Map(vec![
                    entry("id", i.to_string()),
                    entry("name", format!("{} {}", table, i)),
                    entry("created_at", timestamp.to_string()),
                ]),
            ),
            "UPDATE" => (
                ParquetValue::Map(vec![
                    entry("id", i.to_string()),
                    entry("name", format!("old_{} {}", table, i)),
                    // The previous version was written one day (86,400,000 ms) earlier.
                    entry("updated_at", (timestamp - 86400000).to_string()),
                ]),
                ParquetValue::Map(vec![
                    entry("id", i.to_string()),
                    entry("name", format!("new_{} {}", table, i)),
                    entry("updated_at", timestamp.to_string()),
                ]),
            ),
            "DELETE" => (
                ParquetValue::Map(vec![
                    entry("id", i.to_string()),
                    entry("name", format!("{} {}", table, i)),
                ]),
                ParquetValue::Null,
            ),
            // `operation` is always taken from `operations` above.
            _ => unreachable!(),
        };

        rows.push(vec![
            ParquetValue::String(Arc::from(operation)),
            ParquetValue::TimestampMillis(timestamp, None),
            ParquetValue::String(Arc::from("production")),
            ParquetValue::String(Arc::from(table)),
            ParquetValue::String(Arc::from(primary_key)),
            before,
            after,
        ]);
    }

    // Write
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        // `rows` is not needed after the write, so hand it over without cloning.
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Verify
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), 1000);

    // Count operations
    let mut insert_count = 0;
    let mut update_count = 0;
    let mut delete_count = 0;

    for row in &read_rows {
        match &row[0] {
            ParquetValue::String(op) => match op.as_ref() {
                "INSERT" => insert_count += 1,
                "UPDATE" => update_count += 1,
                "DELETE" => delete_count += 1,
                _ => panic!("Unexpected operation"),
            },
            _ => panic!("Expected operation string"),
        }
    }

    // Operations rotate with `i % 3` over 0..1000, so the split is exact:
    // 334 inserts (i = 0, 3, ..., 999) and 333 each of updates and deletes.
    assert_eq!(insert_count, 334);
    assert_eq!(update_count, 333);
    assert_eq!(delete_count, 333);
}
|