parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,777 @@
1
+ use bytes::Bytes;
2
+ use indexmap::IndexMap;
3
+ use ordered_float::OrderedFloat;
4
+ use parquet::basic::Compression;
5
+ use parquet::file::properties::WriterProperties;
6
+ use parquet_core::*;
7
+ use std::sync::Arc;
8
+
9
/// Round-trips one synthetic day of event-log rows through `Writer` and
/// `Reader`.
///
/// Each row holds a millisecond timestamp, an event ID, an event type, an
/// optional user ID and a string-to-string property map. Five events are
/// generated for every minute of the day (7,200 rows). After reading the data
/// back, the test checks the row count and spot-checks the event type of two
/// rows.
#[test]
fn test_event_log_pattern() {
    /// Builds a primitive column without a format hint.
    fn primitive(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Wraps text in a `ParquetValue::String`.
    fn text(value: &str) -> ParquetValue {
        ParquetValue::String(Arc::from(value))
    }

    const EVENT_TYPES: [&str; 5] = ["page_view", "click", "purchase", "signup", "logout"];
    const BASE_TIMESTAMP_MS: i64 = 1735689600000; // 2025-01-01 00:00:00
    const EXPECTED_ROWS: usize = 24 * 60 * 5; // 24 hours * 60 minutes * 5 events

    // Common pattern: event logs with timestamps, IDs, and JSON-like data
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                primitive("timestamp", PrimitiveType::TimestampMillis(None), false),
                primitive("event_id", PrimitiveType::String, false),
                primitive("event_type", PrimitiveType::String, false),
                primitive("user_id", PrimitiveType::Int64, true),
                SchemaNode::Map {
                    name: "properties".to_string(),
                    nullable: false,
                    key: Box::new(primitive("key", PrimitiveType::String, false)),
                    value: Box::new(primitive("value", PrimitiveType::String, true)),
                },
            ],
        })
        .build()
        .unwrap();

    // Simulate a day's worth of events: one event of each type per minute.
    let mut rows = Vec::with_capacity(EXPECTED_ROWS);
    for hour in 0..24i64 {
        for minute in 0..60i64 {
            for (event_idx, event_type) in (0i64..).zip(EVENT_TYPES.iter().copied()) {
                let timestamp =
                    BASE_TIMESTAMP_MS + (hour * 3600 + minute * 60) * 1000 + event_idx * 100;
                let event_id = format!("evt_{:016x}", timestamp + event_idx);

                // Logout events, and every event in each 10th minute, carry no user.
                let user_id = if event_type == "logout" || minute % 10 == 0 {
                    ParquetValue::Null
                } else {
                    ParquetValue::Int64(1000000 + (hour * 1000 + minute))
                };

                // The referrer value is null in every third minute.
                let referrer = if minute % 3 == 0 {
                    ParquetValue::Null
                } else {
                    text("https://search.example.com")
                };

                let mut properties = vec![
                    (text("page"), text(&format!("/page_{}", minute % 10))),
                    (text("referrer"), referrer),
                ];

                // Only purchases carry an amount, stored as formatted text.
                if event_type == "purchase" {
                    properties.push((
                        text("amount"),
                        text(&format!("{:.2}", 10.0 + (minute as f64) * 1.5)),
                    ));
                }

                rows.push(vec![
                    ParquetValue::TimestampMillis(timestamp, None),
                    text(&event_id),
                    text(event_type),
                    user_id,
                    ParquetValue::Map(properties),
                ]);
            }
        }
    }

    // Write with appropriate settings for time-series data
    let mut buffer = Vec::new();
    {
        let props = WriterProperties::builder()
            .set_compression(Compression::SNAPPY)
            .set_dictionary_enabled(true) // Good for repeated event types
            // The row-count cap is well above the 7,200 rows generated here.
            .set_max_row_group_size(100000)
            .build();

        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
        // The rows are not needed after writing, so hand them over without cloning.
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Verify data integrity
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), EXPECTED_ROWS);

    // Spot check some values: the first and fifth events of the first minute.
    assert_eq!(read_rows[0][2], text("page_view"));
    assert_eq!(read_rows[4][2], text("logout"));
}
149
+
150
/// Round-trips a sparse sales fact table (date, product, store, segment,
/// units, revenue, cost, discount) through `Writer` and `Reader`.
///
/// Revenue and cost are stored as `Decimal128` values with scale 2 (cents).
/// After reading the data back, the test checks the row count and that the
/// aggregate profit margin lies between 35% and 45%.
#[test]
fn test_analytics_fact_table() {
    /// Builds a primitive column without a format hint.
    fn primitive(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Wraps text in a `ParquetValue::String`.
    fn text(value: &str) -> ParquetValue {
        ParquetValue::String(Arc::from(value))
    }

    const SEGMENTS: [&str; 4] = ["Premium", "Regular", "Budget", "Corporate"];

    // Common pattern: fact table with dimensions and metrics
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                primitive("date", PrimitiveType::Date32, false),
                primitive("product_id", PrimitiveType::Int32, false),
                primitive("store_id", PrimitiveType::Int16, false),
                primitive("customer_segment", PrimitiveType::String, true),
                primitive("units_sold", PrimitiveType::Int32, false),
                primitive("revenue", PrimitiveType::Decimal128(18, 2), false),
                primitive("cost", PrimitiveType::Decimal128(18, 2), false),
                primitive("discount_pct", PrimitiveType::Float32, true),
            ],
        })
        .build()
        .unwrap();

    // Generate fact table data: 30 days x 100 products x 10 stores.
    let mut rows = Vec::new();
    for day in 0..30i32 {
        for product_id in 1..=100i32 {
            for store_id in 1..=10i16 {
                let store = i32::from(store_id);

                // Skip some combinations (sparse data)
                if (day + product_id + store) % 7 == 0 {
                    continue;
                }

                let units = (product_id * store + day) % 50 + 1;
                let unit_price = 10.0 + f64::from(product_id) * 0.5;

                let discount: Option<f64> = if day % 7 == 0 {
                    // Weekly promotion: every 7th day
                    Some(0.15)
                } else if product_id % 10 == 0 {
                    // Special product discount
                    Some(0.10)
                } else {
                    None
                };

                // Convert to cents *before* dropping the fraction, so the
                // scale-2 decimals keep their two fractional digits.
                let gross = f64::from(units) * unit_price;
                let revenue_cents =
                    (gross * (1.0 - discount.unwrap_or(0.0)) * 100.0).round() as i128;
                let cost_cents = (gross * 0.6 * 100.0).round() as i128;

                let segment = if units > 30 {
                    Some(SEGMENTS[0])
                } else if units > 20 {
                    Some(SEGMENTS[1])
                } else if units > 10 {
                    Some(SEGMENTS[2])
                } else if store_id <= 3 {
                    Some(SEGMENTS[3])
                } else {
                    None
                };

                rows.push(vec![
                    ParquetValue::Date32(19000 + day), // Days since epoch
                    ParquetValue::Int32(product_id),
                    ParquetValue::Int16(store_id),
                    segment.map_or(ParquetValue::Null, text),
                    ParquetValue::Int32(units),
                    ParquetValue::Decimal128(revenue_cents, 2),
                    ParquetValue::Decimal128(cost_cents, 2),
                    discount.map_or(ParquetValue::Null, |d| {
                        ParquetValue::Float32(OrderedFloat(d as f32))
                    }),
                ]);
            }
        }
    }

    // Remember the count so the rows can be moved into the writer, not cloned.
    let expected_rows = rows.len();

    // Write with settings optimized for analytics
    let mut buffer = Vec::new();
    {
        let props = WriterProperties::builder()
            .set_compression(Compression::ZSTD(Default::default()))
            .set_dictionary_enabled(true)
            .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Chunk)
            .build();

        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Read and verify
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), expected_rows);

    // Verify data patterns: sum revenue (column 5) and cost (column 6).
    let mut total_revenue = 0i128;
    let mut total_cost = 0i128;

    for row in &read_rows {
        match &row[5] {
            ParquetValue::Decimal128(rev, 2) => total_revenue += *rev,
            _ => panic!("Expected decimal revenue"),
        }
        match &row[6] {
            ParquetValue::Decimal128(cost, 2) => total_cost += *cost,
            _ => panic!("Expected decimal cost"),
        }
    }

    // Cost is 60% of the undiscounted price, so the margin is 40% before
    // discounts and slightly lower once they are applied.
    let profit_margin = (total_revenue - total_cost) as f64 / total_revenue as f64;
    assert!(
        profit_margin > 0.35 && profit_margin < 0.45,
        "Unexpected profit margin: {}",
        profit_margin
    );
}
322
+
323
/// Round-trips nested IoT sensor rows through `Writer` and `Reader`.
///
/// Each row holds a device ID, a microsecond timestamp, a nullable `location`
/// struct, a list of `reading` structs and a battery level. Ten devices each
/// produce one row per minute for an hour (600 rows). After reading the data
/// back, the test checks the row count, the device ID of the first and last
/// rows, and that the first row has at least two readings.
#[test]
fn test_iot_sensor_data() {
    /// Builds a primitive column without a format hint.
    fn primitive(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Wraps text in a `ParquetValue::String`.
    fn text(value: &str) -> ParquetValue {
        ParquetValue::String(Arc::from(value))
    }

    /// Wraps a number in a `ParquetValue::Float64`.
    fn float64(value: f64) -> ParquetValue {
        ParquetValue::Float64(OrderedFloat(value))
    }

    /// Builds a `ParquetValue::Record`, inserting the fields in the order given.
    fn record(fields: Vec<(&str, ParquetValue)>) -> ParquetValue {
        let map: IndexMap<Arc<str>, ParquetValue> = fields
            .into_iter()
            .map(|(name, value)| (Arc::<str>::from(name), value))
            .collect();
        ParquetValue::Record(map)
    }

    /// Builds one entry of the `readings` list.
    fn reading(sensor_type: &str, value: f64, unit: &str, quality: ParquetValue) -> ParquetValue {
        record(vec![
            ("sensor_type", text(sensor_type)),
            ("value", float64(value)),
            ("unit", text(unit)),
            ("quality", quality),
        ])
    }

    const BASE_TIMESTAMP_US: i64 = 1735689600000000; // microseconds
    const DEVICE_COUNT: i32 = 10;
    const MINUTES: i32 = 60;

    // Common pattern: IoT sensor data with nested readings
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                primitive("device_id", PrimitiveType::String, false),
                primitive("timestamp", PrimitiveType::TimestampMicros(None), false),
                SchemaNode::Struct {
                    name: "location".to_string(),
                    nullable: true,
                    fields: vec![
                        primitive("latitude", PrimitiveType::Float64, false),
                        primitive("longitude", PrimitiveType::Float64, false),
                        primitive("altitude", PrimitiveType::Float32, true),
                    ],
                },
                SchemaNode::List {
                    name: "readings".to_string(),
                    nullable: false,
                    item: Box::new(SchemaNode::Struct {
                        name: "reading".to_string(),
                        nullable: false,
                        fields: vec![
                            primitive("sensor_type", PrimitiveType::String, false),
                            primitive("value", PrimitiveType::Float64, false),
                            primitive("unit", PrimitiveType::String, false),
                            primitive("quality", PrimitiveType::Int8, true),
                        ],
                    }),
                },
                primitive("battery_level", PrimitiveType::Float32, true),
            ],
        })
        .build()
        .unwrap();

    // Generate sensor data: 1 hour of data per device, one row every minute.
    let mut rows = Vec::new();
    for device_idx in 0..DEVICE_COUNT {
        let device_id: Arc<str> = Arc::from(format!("sensor_{:04}", device_idx));
        let base_lat = 37.7749 + f64::from(device_idx) * 0.01;
        let base_lon = -122.4194 + f64::from(device_idx) * 0.01;

        for minute in 0..MINUTES {
            let minute_f64 = f64::from(minute);
            let timestamp = BASE_TIMESTAMP_US + i64::from(minute) * 60 * 1000000;

            // Location (some devices lose GPS occasionally). A lost fix is
            // written as a null for the whole nullable `location` struct.
            let location = if minute % 15 == 0 && device_idx % 3 == 0 {
                ParquetValue::Null
            } else {
                // Only the first five devices report an altitude.
                let altitude = if device_idx < 5 {
                    ParquetValue::Float32(OrderedFloat(100.0 + (minute as f32) * 0.1))
                } else {
                    ParquetValue::Null
                };
                record(vec![
                    ("latitude", float64(base_lat + minute_f64 * 0.0001)),
                    ("longitude", float64(base_lon + minute_f64 * 0.0001)),
                    ("altitude", altitude),
                ])
            };

            // Every device reports temperature and humidity.
            let humidity_quality = if minute % 10 == 0 {
                ParquetValue::Null // Missing quality score
            } else {
                ParquetValue::Int8(95)
            };
            let mut readings = vec![
                reading(
                    "temperature",
                    20.0 + minute_f64 * 0.1 + f64::from(device_idx),
                    "celsius",
                    ParquetValue::Int8(100),
                ),
                reading(
                    "humidity",
                    45.0 + minute_f64 * 0.2,
                    "percent",
                    humidity_quality,
                ),
            ];

            // Some devices have additional sensors
            if device_idx % 2 == 0 {
                readings.push(reading(
                    "pressure",
                    1013.25 + minute_f64 * 0.01,
                    "hPa",
                    ParquetValue::Int8(90),
                ));
            }

            // Battery level decreases over time; minute 0 yields exactly
            // 100.0, so no special case is needed.
            let battery = ParquetValue::Float32(OrderedFloat(100.0 - (minute as f32) * 0.1));

            rows.push(vec![
                ParquetValue::String(device_id.clone()),
                ParquetValue::TimestampMicros(timestamp, None),
                location,
                ParquetValue::List(readings),
                battery,
            ]);
        }
    }

    // Write with settings for time-series IoT data
    let mut buffer = Vec::new();
    {
        let props = WriterProperties::builder()
            .set_compression(Compression::SNAPPY)
            .set_dictionary_enabled(true)
            .build();

        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();
        // The rows are not needed after writing, so hand them over without cloning.
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Verify
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), 10 * 60); // 10 devices * 60 minutes

    // Check first and last rows
    match &read_rows[0][0] {
        ParquetValue::String(id) => assert_eq!(id.as_ref(), "sensor_0000"),
        _ => panic!("Expected device ID"),
    }

    match &read_rows[read_rows.len() - 1][0] {
        ParquetValue::String(id) => assert_eq!(id.as_ref(), "sensor_0009"),
        _ => panic!("Expected device ID"),
    }

    match &read_rows[0][3] {
        ParquetValue::List(readings) => assert!(readings.len() >= 2),
        _ => panic!("Expected readings list"),
    }
}
574
+
575
/// Round-trips 1,000 synthetic CDC (Change Data Capture) events through
/// `Writer` and `Reader`.
///
/// Each row holds the operation, a millisecond timestamp, database, table and
/// primary key, plus nullable `before`/`after` column snapshots stored as
/// string-to-string maps. Operations cycle INSERT, UPDATE, DELETE. After
/// reading the data back, the test checks the row count and the exact number
/// of rows per operation.
#[test]
fn test_change_data_capture() {
    /// Builds a primitive column without a format hint.
    fn primitive(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Builds a nullable map column from column name to nullable string value.
    fn snapshot_column(name: &str) -> SchemaNode {
        SchemaNode::Map {
            name: name.to_string(),
            nullable: true,
            key: Box::new(primitive("column", PrimitiveType::String, false)),
            value: Box::new(primitive("value", PrimitiveType::String, true)),
        }
    }

    /// Wraps text in a `ParquetValue::String`.
    fn text(value: &str) -> ParquetValue {
        ParquetValue::String(Arc::from(value))
    }

    /// Builds a `ParquetValue::Map` snapshot from `(column, value)` pairs.
    fn snapshot(columns: Vec<(&str, String)>) -> ParquetValue {
        ParquetValue::Map(
            columns
                .into_iter()
                .map(|(column, value)| (text(column), text(&value)))
                .collect(),
        )
    }

    const OPERATIONS: [&str; 3] = ["INSERT", "UPDATE", "DELETE"];
    const TABLES: [&str; 3] = ["users", "orders", "products"];
    const BASE_TIMESTAMP_MS: i64 = 1735689600000;
    const EVENT_COUNT: usize = 1000;

    // Common pattern: CDC (Change Data Capture) events
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                primitive("operation", PrimitiveType::String, false),
                primitive("timestamp", PrimitiveType::TimestampMillis(None), false),
                primitive("database", PrimitiveType::String, false),
                primitive("table", PrimitiveType::String, false),
                primitive("primary_key", PrimitiveType::String, false),
                snapshot_column("before"),
                snapshot_column("after"),
            ],
        })
        .build()
        .unwrap();

    // Generate CDC events: the operation changes every event, the table every 10.
    let mut rows = Vec::with_capacity(EVENT_COUNT);
    for i in 0..EVENT_COUNT {
        let operation = OPERATIONS[i % OPERATIONS.len()];
        let table = TABLES[(i / 10) % TABLES.len()];
        let offset_ms = i64::try_from(i * 1000).expect("event offset fits in i64");
        let timestamp = BASE_TIMESTAMP_MS + offset_ms;
        let primary_key = format!("{}_id:{}", table, i);
        let id = i.to_string();

        // INSERT has no `before`, DELETE has no `after`, UPDATE has both.
        let (before, after) = match operation {
            "INSERT" => (
                ParquetValue::Null,
                snapshot(vec![
                    ("id", id),
                    ("name", format!("{} {}", table, i)),
                    ("created_at", timestamp.to_string()),
                ]),
            ),
            "UPDATE" => (
                snapshot(vec![
                    ("id", id.clone()),
                    ("name", format!("old_{} {}", table, i)),
                    // The previous version is dated 86,400,000 ms (one day) earlier.
                    ("updated_at", (timestamp - 86400000).to_string()),
                ]),
                snapshot(vec![
                    ("id", id),
                    ("name", format!("new_{} {}", table, i)),
                    ("updated_at", timestamp.to_string()),
                ]),
            ),
            "DELETE" => (
                snapshot(vec![("id", id), ("name", format!("{} {}", table, i))]),
                ParquetValue::Null,
            ),
            _ => unreachable!(),
        };

        rows.push(vec![
            text(operation),
            ParquetValue::TimestampMillis(timestamp, None),
            text("production"),
            text(table),
            text(&primary_key),
            before,
            after,
        ]);
    }

    // Write
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        // The rows are not needed after writing, so hand them over without cloning.
        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Verify
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), EVENT_COUNT);

    // Count operations
    let mut insert_count = 0;
    let mut update_count = 0;
    let mut delete_count = 0;

    for row in &read_rows {
        match &row[0] {
            ParquetValue::String(op) => match op.as_ref() {
                "INSERT" => insert_count += 1,
                "UPDATE" => update_count += 1,
                "DELETE" => delete_count += 1,
                _ => panic!("Unexpected operation"),
            },
            _ => panic!("Expected operation string"),
        }
    }

    // Indices 0..1000 cycle through three operations, so index % 3 == 0
    // (INSERT) occurs 334 times and the other two 333 times each.
    assert_eq!(insert_count, 334);
    assert_eq!(update_count, 333);
    assert_eq!(delete_count, 333);
}