parquet 0.5.12 → 0.6.0
This diff covers the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/tests/writer_tests.rs (new file, @@ -0,0 +1,545 @@):

```rust
use bytes::Bytes;
use ordered_float::OrderedFloat;
use parquet::basic::Compression;
use parquet::file::properties::WriterProperties;
use parquet_core::*;
use std::sync::Arc;

mod test_helpers;
use test_helpers::*;

// =============================================================================
// Basic Writer Functionality Tests
// =============================================================================

#[test]
fn test_writer_basic_functionality() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "name".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let rows = vec![
        vec![ParquetValue::Int32(1), ParquetValue::String("Alice".into())],
        vec![
            ParquetValue::Int32(2),
            ParquetValue::Null, // nullable field
        ],
    ];

    test_roundtrip(rows, schema).unwrap();
}

// =============================================================================
// Batch Size Configuration Tests
// =============================================================================

#[test]
fn test_writer_fixed_batch_sizes() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int64,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "data".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Test different batch sizes
    let batch_sizes = vec![10, 100, 1000, 5000];

    for batch_size in batch_sizes {
        // Generate test data
        let rows: Vec<Vec<ParquetValue>> = (0..10000)
            .map(|i| {
                vec![
                    ParquetValue::Int64(i),
                    ParquetValue::String(Arc::from(format!("Row {}", i))),
                ]
            })
            .collect();

        // Use test_roundtrip_with_options for batch size testing
        let result = test_roundtrip_with_options(
            rows,
            schema.clone(),
            Compression::UNCOMPRESSED,
            Some(batch_size),
        );

        assert!(
            result.is_ok(),
            "Batch size {} failed: {:?}",
            batch_size,
            result
        );
    }
}

#[test]
fn test_writer_adaptive_batch_sizing() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "variable_string".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: false,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    let mut buffer = Vec::new();
    {
        // Don't set a fixed batch size - let it adapt
        let mut writer = WriterBuilder::new()
            .with_sample_size(50)
            .build(&mut buffer, schema)
            .unwrap();

        // Write rows with varying sizes
        for i in 0..1000 {
            let string_size = if i % 100 == 0 {
                10000 // Large string every 100 rows
            } else {
                100 // Normal string
            };

            let row = vec![
                ParquetValue::Int32(i),
                ParquetValue::String(Arc::from("x".repeat(string_size))),
            ];

            writer.write_row(row).unwrap();
        }

        writer.close().unwrap();
    }

    // Verify all data was written
    let bytes = Bytes::from(buffer);
    let reader = Reader::new(bytes);

    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), 1000);

    // Verify variable string sizes
    for (i, row) in read_rows.iter().enumerate() {
        match &row[1] {
            ParquetValue::String(s) => {
                let expected_len = if i % 100 == 0 { 10000 } else { 100 };
                assert_eq!(s.len(), expected_len, "Wrong string length at row {}", i);
            }
            _ => panic!("Expected string value"),
        }
    }
}

// =============================================================================
// Memory Management Tests
// =============================================================================

#[test]
fn test_memory_threshold_configuration() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "large_string".to_string(),
                primitive_type: PrimitiveType::String,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    // Test with different memory thresholds
    let thresholds = vec![
        1024 * 1024,      // 1MB
        10 * 1024 * 1024, // 10MB
        50 * 1024 * 1024, // 50MB
    ];

    for threshold in thresholds {
        let mut buffer = Vec::new();
        {
            let mut writer = WriterBuilder::new()
                .with_memory_threshold(threshold)
                .build(&mut buffer, schema.clone())
                .unwrap();

            // Write large strings that will trigger memory-based flushing
            let large_string: Arc<str> = Arc::from("x".repeat(1024)); // 1KB string
            let rows: Vec<Vec<ParquetValue>> = (0..5000)
                .map(|_| vec![ParquetValue::String(large_string.clone())])
                .collect();

            writer.write_rows(rows).unwrap();
            writer.close().unwrap();
        }

        // Verify data was written correctly
        let bytes = Bytes::from(buffer);
        let reader = Reader::new(bytes);
        let read_count = reader.read_rows().unwrap().count();
        assert_eq!(read_count, 5000);
    }
}

#[test]
fn test_writer_memory_flushing_with_binary() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "data".to_string(),
                primitive_type: PrimitiveType::Binary,
                nullable: true,
                format: None,
            }],
        })
        .build()
        .unwrap();

    // Generate test data
    let rows: Vec<Vec<ParquetValue>> = (0..100)
        .map(|i| {
            let size = if i % 10 == 0 { 500 } else { 50 };
            vec![ParquetValue::Bytes(Bytes::from(vec![i as u8; size]))]
        })
        .collect();

    // Use a custom writer with memory threshold
    let mut buffer = Vec::new();
    {
        let mut writer = WriterBuilder::new()
            .with_memory_threshold(1024) // 1KB threshold
            .with_sample_size(5)
            .build(&mut buffer, schema.clone())
            .unwrap();

        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Verify data was written correctly
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows, rows);
}

// =============================================================================
// Advanced Configuration Tests
// =============================================================================

#[test]
fn test_writer_properties_direct() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "value".to_string(),
                primitive_type: PrimitiveType::String,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    // Test custom writer properties
    let props = WriterProperties::builder()
        .set_writer_version(parquet::file::properties::WriterVersion::PARQUET_2_0)
        .set_compression(Compression::ZSTD(
            parquet::basic::ZstdLevel::try_new(3).unwrap(),
        ))
        .set_data_page_size_limit(1024) // Small page size
        .set_dictionary_enabled(true)
        .set_statistics_enabled(parquet::file::properties::EnabledStatistics::Page)
        .build();

    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new_with_properties(&mut buffer, schema, props).unwrap();

        // Write data with repeated values to test dictionary encoding
        let rows: Vec<Vec<ParquetValue>> = (0..1000)
            .map(|i| {
                vec![ParquetValue::String(Arc::from(format!(
                    "Category_{}",
                    i % 10
                )))]
            })
            .collect();

        writer.write_rows(rows).unwrap();
        writer.close().unwrap();
    }

    // Read back and verify
    let bytes = Bytes::from(buffer);
    let mut reader = Reader::new(bytes);

    // Check metadata
    let metadata = reader.metadata().unwrap();
    assert!(metadata.num_rows() == 1000);

    // Verify data integrity
    let read_count = reader.read_rows().unwrap().count();
    assert_eq!(read_count, 1000);
}

#[test]
fn test_writer_version_compatibility() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "value".to_string(),
                primitive_type: PrimitiveType::Int32,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    let rows: Vec<Vec<ParquetValue>> = (0..100).map(|i| vec![ParquetValue::Int32(i)]).collect();

    // Test different writer versions
    let versions = vec![
        parquet::file::properties::WriterVersion::PARQUET_1_0,
        parquet::file::properties::WriterVersion::PARQUET_2_0,
    ];

    for version in versions {
        let mut buffer = Vec::new();
        {
            let props = WriterProperties::builder()
                .set_writer_version(version)
                .build();

            let mut writer =
                Writer::new_with_properties(&mut buffer, schema.clone(), props).unwrap();
            writer.write_rows(rows.clone()).unwrap();
            writer.close().unwrap();
        }

        // Verify we can read both versions
        let bytes = Bytes::from(buffer);
        let reader = Reader::new(bytes);

        let read_rows: Vec<_> = reader
            .read_rows()
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();

        assert_eq!(read_rows.len(), 100);
    }
}

// =============================================================================
// Large Data Handling Tests
// =============================================================================

#[test]
fn test_large_string_handling() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::Primitive {
                    name: "content".to_string(),
                    primitive_type: PrimitiveType::String,
                    nullable: true,
                    format: None,
                },
            ],
        })
        .build()
        .unwrap();

    // Create strings of various sizes
    let small = "a".repeat(100);
    let medium = "b".repeat(10_000);
    let large = "c".repeat(100_000);

    // Generate test data
    let rows: Vec<Vec<ParquetValue>> = (0..30)
        .map(|i| {
            let content = match i % 3 {
                0 => ParquetValue::String(small.clone().into()),
                1 => ParquetValue::String(medium.clone().into()),
                2 => ParquetValue::String(large.clone().into()),
                _ => unreachable!(),
            };

            vec![ParquetValue::Int32(i), content]
        })
        .collect();

    // Use custom writer with memory threshold
    let mut buffer = Vec::new();
    {
        let mut writer = WriterBuilder::new()
            .with_memory_threshold(1024 * 1024) // 1MB
            .build(&mut buffer, schema.clone())
            .unwrap();

        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Verify all data was written
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader.read_rows().unwrap().collect::<Result<_>>().unwrap();
    assert_eq!(read_rows, rows);
}

#[test]
fn test_complex_nested_data_memory() {
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                SchemaNode::Primitive {
                    name: "id".to_string(),
                    primitive_type: PrimitiveType::Int32,
                    nullable: false,
                    format: None,
                },
                SchemaNode::List {
                    name: "items".to_string(),
                    nullable: true,
                    item: Box::new(SchemaNode::Struct {
                        name: "item".to_string(),
                        nullable: false,
                        fields: vec![
                            SchemaNode::Primitive {
                                name: "key".to_string(),
                                primitive_type: PrimitiveType::String,
                                nullable: false,
                                format: None,
                            },
                            SchemaNode::Primitive {
                                name: "value".to_string(),
                                primitive_type: PrimitiveType::Float64,
                                nullable: true,
                                format: None,
                            },
                        ],
                    }),
                },
            ],
        })
        .build()
        .unwrap();

    // Generate test data
    let rows: Vec<Vec<ParquetValue>> = (0..100)
        .map(|i| {
            let num_items = (i % 10 + 1) as usize;
            let mut items = Vec::new();

            for j in 0..num_items {
                items.push(ParquetValue::Record(indexmap::indexmap! {
                    "key".into() => ParquetValue::String(format!("key_{}_{}", i, j).into()),
                    "value".into() => if j % 2 == 0 {
                        ParquetValue::Float64(OrderedFloat(j as f64 * 1.5))
                    } else {
                        ParquetValue::Null
                    },
                }));
            }

            vec![ParquetValue::Int32(i), ParquetValue::List(items)]
        })
        .collect();

    // Use custom writer with memory threshold
    let mut buffer = Vec::new();
    {
        let mut writer = WriterBuilder::new()
            .with_memory_threshold(500 * 1024) // 500KB
            .build(&mut buffer, schema.clone())
            .unwrap();

        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Read back and verify
    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows, rows);
}
```
data/ext/parquet-ruby-adapter/Cargo.toml (new file, @@ -0,0 +1,22 @@):

```toml
[package]
name = "parquet-ruby-adapter"
version = "0.1.0"
edition = "2021"

[build-dependencies]
rb-sys-env = "^0.2"

[dependencies]
arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
bytes = "1.5"
magnus = { version = "0.7", features = ["rb-sys"] }
num = "0.4.3"
ordered-float = "5.0.0"
parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["arrow"] }
parquet-core = { path = "../parquet-core" }
rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
tempfile = "^3.15"
thiserror = "2.0"
indexmap = "2.2"
```
data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs (new file, @@ -0,0 +1,98 @@):

```rust
use magnus::{IntoValue, Ruby, Value};
use parquet_ruby_adapter::{Result, RubyAdapterError, TryIntoValue};

/// Example struct that can fail during conversion to Ruby
struct ComplexData {
    name: String,
    values: Vec<i32>,
    metadata: std::collections::HashMap<String, String>,
}

impl TryIntoValue for ComplexData {
    fn try_into_value(self, handle: &Ruby) -> Result<Value> {
        let hash = handle.hash_new();

        // Set name
        hash.aset("name", self.name)
            .map_err(|e| RubyAdapterError::type_conversion(format!("Failed to set name: {}", e)))?;

        // Convert values array
        let values_array = handle.ary_new();
        for value in self.values {
            values_array.push(value).map_err(|e| {
                RubyAdapterError::type_conversion(format!("Failed to push value: {}", e))
            })?;
        }
        hash.aset("values", values_array).map_err(|e| {
            RubyAdapterError::type_conversion(format!("Failed to set values: {}", e))
        })?;

        // Convert metadata hash
        let metadata_hash = handle.hash_new();
        for (key, value) in self.metadata {
            metadata_hash.aset(key.clone(), value).map_err(|e| {
                RubyAdapterError::type_conversion(format!(
                    "Failed to set metadata key {}: {}",
                    key, e
                ))
            })?;
        }
        hash.aset("metadata", metadata_hash).map_err(|e| {
            RubyAdapterError::type_conversion(format!("Failed to set metadata: {}", e))
        })?;

        Ok(handle.into_value(hash))
    }
}

// Example of a type that might fail validation during conversion
struct ValidatedNumber {
    value: i32,
}

impl TryIntoValue for ValidatedNumber {
    fn try_into_value(self, handle: &Ruby) -> Result<Value> {
        // Validate the number is positive
        if self.value < 0 {
            return Err(RubyAdapterError::type_conversion(format!(
                "ValidatedNumber must be positive, got {}",
                self.value
            )));
        }

        // If valid, convert to Ruby
        Ok(self.value.into_value_with(handle))
    }
}

fn main() -> Result<()> {
    // Example usage:
    let ruby = Ruby::get().map_err(|_| RubyAdapterError::runtime("Failed to get Ruby runtime"))?;

    // Success case
    let data = ComplexData {
        name: "example".to_string(),
        values: vec![1, 2, 3],
        metadata: std::collections::HashMap::from([
            ("key1".to_string(), "value1".to_string()),
            ("key2".to_string(), "value2".to_string()),
        ]),
    };

    let _ruby_value = data.try_into_value(&ruby)?;
    println!("Successfully converted ComplexData to Ruby value");

    // Validation failure case
    let invalid_number = ValidatedNumber { value: -5 };
    match invalid_number.try_into_value(&ruby) {
        Ok(_) => println!("This shouldn't happen"),
        Err(e) => println!("Expected validation error: {}", e),
    }

    // Using the convenience method
    let valid_number = ValidatedNumber { value: 42 };
    let _ruby_value = valid_number.try_into_value_with_current_thread()?;
    println!("Successfully converted ValidatedNumber to Ruby value");

    Ok(())
}
```
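
The `TryIntoValue` trait this example exercises is defined in `data/ext/parquet-ruby-adapter/src/try_into_value.rs` (+91 lines, not reproduced in this section). Inferred purely from the two impls and the `try_into_value_with_current_thread` call above, a minimal sketch of its shape might look like the following; whether the convenience method is a default method as shown, and its exact error mapping, are assumptions.

```rust
// Hypothetical reconstruction of src/try_into_value.rs: a fallible
// counterpart to magnus's IntoValue that takes the Ruby handle explicitly,
// so conversions can return a RubyAdapterError instead of raising in the VM.
use magnus::{Ruby, Value};

use crate::{Result, RubyAdapterError}; // crate-local types seen in the example

pub trait TryIntoValue {
    fn try_into_value(self, handle: &Ruby) -> Result<Value>;

    // Convenience used at the end of main() above: fetch the Ruby handle
    // for the current thread, then delegate to try_into_value.
    fn try_into_value_with_current_thread(self) -> Result<Value>
    where
        Self: Sized,
    {
        let handle = Ruby::get()
            .map_err(|_| RubyAdapterError::runtime("Failed to get Ruby runtime"))?;
        self.try_into_value(&handle)
    }
}
```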