parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,424 +0,0 @@
1
- use std::sync::Arc;
2
-
3
- use arrow_schema::{
4
- DataType as ArrowDataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
5
- };
6
- use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
7
-
8
- use crate::logger::RubyLogger;
9
- use crate::types::{PrimitiveType, SchemaNode};
10
- use crate::utils::parse_string_or_symbol;
11
-
12
/// No-op placeholder with an empty body; it takes no arguments and returns nothing.
///
/// The function that actually builds an Arrow schema from a `SchemaNode` tree
/// is `build_arrow_schema`, defined further down in this file.
fn _build_arrow_schema_placeholder() {
    // Intentionally empty.
}
15
-
16
- /// Helper to extract common fields from a schema node hash
17
- fn extract_common_fields(
18
- ruby: &Ruby,
19
- node_hash: &RHash,
20
- ) -> Result<(String, bool, Option<String>), MagnusError> {
21
- // extract `name:` if present, else default
22
- let name_val = node_hash.get(Symbol::new("name"));
23
- let name: String = if let Some(v) = name_val {
24
- let name_option = parse_string_or_symbol(ruby, v)?;
25
- name_option.unwrap_or_else(|| "".to_string())
26
- } else {
27
- "".to_string() // top-level might omit name
28
- };
29
-
30
- // extract `nullable:`
31
- let nullable_val = node_hash.get(Symbol::new("nullable"));
32
- let nullable: bool = if let Some(v) = nullable_val {
33
- bool::try_convert(v).unwrap_or(true)
34
- } else {
35
- true // default to nullable
36
- };
37
-
38
- // optional `format:`
39
- let format_val = node_hash.get(Symbol::new("format"));
40
- let format: Option<String> = if let Some(v) = format_val {
41
- parse_string_or_symbol(ruby, v)?
42
- } else {
43
- None
44
- };
45
-
46
- Ok((name, nullable, format))
47
- }
48
-
49
- /// Parse a struct schema node
50
- fn parse_struct_node(
51
- ruby: &Ruby,
52
- node_hash: &RHash,
53
- name: String,
54
- nullable: bool,
55
- ) -> Result<SchemaNode, MagnusError> {
56
- // parse subfields array from `fields`
57
- let fields_val = node_hash.get(Symbol::new("fields")).ok_or_else(|| {
58
- MagnusError::new(
59
- ruby.exception_arg_error(),
60
- "Struct must have :fields array defined",
61
- )
62
- })?;
63
- let fields_arr: RArray = RArray::try_convert(fields_val).map_err(|_| {
64
- MagnusError::new(
65
- ruby.exception_type_error(),
66
- "The :fields value must be an array",
67
- )
68
- })?;
69
-
70
- // Check for empty struct immediately
71
- if fields_arr.is_empty() {
72
- return Err(MagnusError::new(
73
- ruby.exception_arg_error(),
74
- format!("Cannot create a struct with zero fields. Struct name: '{}'. Parquet doesn't support empty structs", name)
75
- ));
76
- }
77
-
78
- let mut fields = Vec::with_capacity(fields_arr.len());
79
- for item in fields_arr.into_iter() {
80
- fields.push(parse_schema_node(ruby, item)?);
81
- }
82
-
83
- Ok(SchemaNode::Struct {
84
- name,
85
- nullable,
86
- fields,
87
- })
88
- }
89
-
90
- /// Parse a list schema node
91
- fn parse_list_node(
92
- ruby: &Ruby,
93
- node_hash: &RHash,
94
- name: String,
95
- nullable: bool,
96
- ) -> Result<SchemaNode, MagnusError> {
97
- // parse `item`
98
- let item_val = node_hash.get(Symbol::new("item")).ok_or_else(|| {
99
- MagnusError::new(
100
- ruby.exception_arg_error(),
101
- "List type must have :item field defined",
102
- )
103
- })?;
104
- let item_node = parse_schema_node(ruby, item_val)?;
105
-
106
- Ok(SchemaNode::List {
107
- name,
108
- nullable,
109
- item: Box::new(item_node),
110
- })
111
- }
112
-
113
- /// Parse a map schema node
114
- fn parse_map_node(
115
- ruby: &Ruby,
116
- node_hash: &RHash,
117
- name: String,
118
- nullable: bool,
119
- ) -> Result<SchemaNode, MagnusError> {
120
- // parse `key` and `value`
121
- let key_val = node_hash.get(Symbol::new("key")).ok_or_else(|| {
122
- MagnusError::new(
123
- ruby.exception_arg_error(),
124
- "Map type must have :key field defined",
125
- )
126
- })?;
127
- let value_val = node_hash.get(Symbol::new("value")).ok_or_else(|| {
128
- MagnusError::new(
129
- ruby.exception_arg_error(),
130
- "Map type must have :value field defined",
131
- )
132
- })?;
133
-
134
- let key_node = parse_schema_node(ruby, key_val)?;
135
- let value_node = parse_schema_node(ruby, value_val)?;
136
-
137
- Ok(SchemaNode::Map {
138
- name,
139
- nullable,
140
- key: Box::new(key_node),
141
- value: Box::new(value_node),
142
- })
143
- }
144
-
145
- /// Parse a Ruby schema hash into a SchemaNode tree
146
- pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, MagnusError> {
147
- // The node_value should be a Ruby Hash with keys: :name, :type, :nullable, etc.
148
- let node_hash = RHash::from_value(node_value).ok_or_else(|| {
149
- MagnusError::new(
150
- ruby.exception_type_error(),
151
- "Schema node must be a Hash with :type and other fields",
152
- )
153
- })?;
154
-
155
- // extract `type:` which is a symbol/string
156
- let type_val = node_hash.get(Symbol::new("type")).ok_or_else(|| {
157
- MagnusError::new(
158
- ruby.exception_arg_error(),
159
- "Missing required :type field in schema node",
160
- )
161
- })?;
162
- let type_str_option = parse_string_or_symbol(ruby, type_val)?;
163
- let type_str = type_str_option.ok_or_else(|| {
164
- MagnusError::new(
165
- ruby.exception_arg_error(),
166
- "Type cannot be nil - please specify a valid type string or symbol",
167
- )
168
- })?;
169
-
170
- // Extract common fields (name, nullable, format)
171
- let (name, nullable, format) = extract_common_fields(ruby, &node_hash)?;
172
-
173
- // Delegate to type-specific parsers with clear error handling
174
- match type_str.as_str() {
175
- "struct" => parse_struct_node(ruby, &node_hash, name, nullable),
176
- "list" => parse_list_node(ruby, &node_hash, name, nullable),
177
- "map" => parse_map_node(ruby, &node_hash, name, nullable),
178
- "decimal" => {
179
- // Check for precision and scale
180
- let precision_val = node_hash.get(Symbol::new("precision"));
181
- let scale_val = node_hash.get(Symbol::new("scale"));
182
-
183
- // Handle different precision/scale combinations:
184
- // 1. When no precision or scale - use max precision (38)
185
- // 2. When precision only - use scale 0
186
- // 3. When scale only - use max precision (38)
187
- let (precision, scale) = match (precision_val, scale_val) {
188
- (None, None) => (38, 0), // Maximum accuracy, scale 0
189
- (Some(p), None) => {
190
- // Precision provided, scale defaults to 0
191
- let prec = u8::try_convert(p).map_err(|_| {
192
- MagnusError::new(
193
- ruby.exception_type_error(),
194
- "Invalid precision value for decimal type, expected a positive integer"
195
- .to_string(),
196
- )
197
- })?;
198
- (prec, 0)
199
- }
200
- (None, Some(s)) => {
201
- // Scale provided, precision set to maximum (38)
202
- let scl = i8::try_convert(s).map_err(|_| {
203
- MagnusError::new(
204
- ruby.exception_type_error(),
205
- "Invalid scale value for decimal type, expected an integer".to_string(),
206
- )
207
- })?;
208
- (38, scl)
209
- }
210
- (Some(p), Some(s)) => {
211
- // Both provided
212
- let prec = u8::try_convert(p).map_err(|_| {
213
- MagnusError::new(
214
- ruby.exception_type_error(),
215
- "Invalid precision value for decimal type, expected a positive integer"
216
- .to_string(),
217
- )
218
- })?;
219
- let scl = i8::try_convert(s).map_err(|_| {
220
- MagnusError::new(
221
- ruby.exception_type_error(),
222
- "Invalid scale value for decimal type, expected an integer".to_string(),
223
- )
224
- })?;
225
- (prec, scl)
226
- }
227
- };
228
-
229
- // Validate precision is in a valid range
230
- if precision < 1 {
231
- return Err(MagnusError::new(
232
- ruby.exception_arg_error(),
233
- format!(
234
- "Precision for decimal type must be at least 1, got {}",
235
- precision
236
- ),
237
- ));
238
- }
239
-
240
- if precision > 38 {
241
- return Err(MagnusError::new(
242
- ruby.exception_arg_error(),
243
- format!(
244
- "Precision for decimal type cannot exceed 38, got {}",
245
- precision
246
- ),
247
- ));
248
- }
249
-
250
- Ok(SchemaNode::Primitive {
251
- name,
252
- parquet_type: PrimitiveType::Decimal128(precision, scale),
253
- nullable,
254
- format,
255
- })
256
- }
257
- // For primitives, provide better error messages when type isn't recognized
258
- other => {
259
- if let Some(parquet_type) = parse_primitive_type(other) {
260
- Ok(SchemaNode::Primitive {
261
- name,
262
- parquet_type,
263
- nullable,
264
- format,
265
- })
266
- } else {
267
- Err(MagnusError::new(
268
- magnus::exception::arg_error(),
269
- format!(
270
- "Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros, decimal",
271
- other
272
- )
273
- ))
274
- }
275
- }
276
- }
277
- }
278
-
279
- /// Convert a type string like "int32" to a PrimitiveType
280
- fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
281
- match s.to_lowercase().as_str() {
282
- "int8" | "i8" => Some(PrimitiveType::Int8),
283
- "int16" | "i16" => Some(PrimitiveType::Int16),
284
- "int32" | "i32" | "int" => Some(PrimitiveType::Int32),
285
- "int64" | "i64" | "long" | "bigint" => Some(PrimitiveType::Int64),
286
- "uint8" | "u8" | "byte" => Some(PrimitiveType::UInt8),
287
- "uint16" | "u16" => Some(PrimitiveType::UInt16),
288
- "uint32" | "u32" | "uint" => Some(PrimitiveType::UInt32),
289
- "uint64" | "u64" | "ulong" => Some(PrimitiveType::UInt64),
290
- "float" | "float32" | "f32" => Some(PrimitiveType::Float32),
291
- "double" | "float64" | "f64" => Some(PrimitiveType::Float64),
292
- "bool" | "boolean" => Some(PrimitiveType::Boolean),
293
- "string" | "utf8" | "str" | "text" => Some(PrimitiveType::String),
294
- "binary" | "bytes" | "blob" => Some(PrimitiveType::Binary),
295
- "date" | "date32" => Some(PrimitiveType::Date32),
296
- "timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
297
- "timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
298
- "time_millis" | "time_ms" => Some(PrimitiveType::TimeMillis),
299
- "time_micros" | "time_us" => Some(PrimitiveType::TimeMicros),
300
- "decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
301
- "decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
302
- _ => None,
303
- }
304
- }
305
-
306
- /// Convert a SchemaNode to an Arrow field
307
- pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
308
- match node {
309
- SchemaNode::Primitive {
310
- name,
311
- parquet_type,
312
- nullable,
313
- format: _,
314
- } => {
315
- let dt = match parquet_type {
316
- PrimitiveType::Int8 => ArrowDataType::Int8,
317
- PrimitiveType::Int16 => ArrowDataType::Int16,
318
- PrimitiveType::Int32 => ArrowDataType::Int32,
319
- PrimitiveType::Int64 => ArrowDataType::Int64,
320
- PrimitiveType::UInt8 => ArrowDataType::UInt8,
321
- PrimitiveType::UInt16 => ArrowDataType::UInt16,
322
- PrimitiveType::UInt32 => ArrowDataType::UInt32,
323
- PrimitiveType::UInt64 => ArrowDataType::UInt64,
324
- PrimitiveType::Float32 => ArrowDataType::Float32,
325
- PrimitiveType::Float64 => ArrowDataType::Float64,
326
- PrimitiveType::Decimal128(precision, scale) => {
327
- ArrowDataType::Decimal128(*precision, *scale)
328
- }
329
- PrimitiveType::Decimal256(precision, scale) => {
330
- ArrowDataType::Decimal256(*precision, *scale)
331
- }
332
- PrimitiveType::Boolean => ArrowDataType::Boolean,
333
- PrimitiveType::String => ArrowDataType::Utf8,
334
- PrimitiveType::Binary => ArrowDataType::Binary,
335
- PrimitiveType::Date32 => ArrowDataType::Date32,
336
- PrimitiveType::TimestampMillis => {
337
- ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
338
- }
339
- PrimitiveType::TimestampMicros => {
340
- ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
341
- }
342
- PrimitiveType::TimeMillis => {
343
- ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond)
344
- }
345
- PrimitiveType::TimeMicros => {
346
- ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond)
347
- }
348
- };
349
- ArrowField::new(name, dt, *nullable)
350
- }
351
-
352
- SchemaNode::List {
353
- name,
354
- nullable,
355
- item,
356
- } => {
357
- let child_field = schema_node_to_arrow_field(item);
358
- let list_type = ArrowDataType::List(Arc::new(child_field));
359
- ArrowField::new(name, list_type, *nullable)
360
- }
361
-
362
- SchemaNode::Map {
363
- name,
364
- nullable,
365
- key,
366
- value,
367
- } => {
368
- // A Map is basically: Map( Struct([key, value]), keysSorted=false )
369
- let key_field = schema_node_to_arrow_field(key);
370
- let value_field = schema_node_to_arrow_field(value);
371
-
372
- let entries_struct = ArrowDataType::Struct(ArrowFields::from(vec![
373
- ArrowField::new("key", key_field.data_type().clone(), false),
374
- ArrowField::new(
375
- "value",
376
- value_field.data_type().clone(),
377
- value_field.is_nullable(),
378
- ),
379
- ]));
380
-
381
- let map_data_type = ArrowDataType::Map(
382
- Arc::new(ArrowField::new("entries", entries_struct, false)),
383
- false, // not sorted
384
- );
385
- ArrowField::new(name, map_data_type, *nullable)
386
- }
387
-
388
- SchemaNode::Struct {
389
- name,
390
- nullable,
391
- fields,
392
- } => {
393
- // Field validation happens earlier - no empty structs allowed
394
- let mut arrow_subfields = Vec::with_capacity(fields.len());
395
- for f in fields {
396
- arrow_subfields.push(schema_node_to_arrow_field(f));
397
- }
398
- let struct_type = ArrowDataType::Struct(ArrowFields::from(arrow_subfields));
399
- ArrowField::new(name, struct_type, *nullable)
400
- }
401
- }
402
- }
403
-
404
- /// Build an Arrow schema from the top-level Node, which must be a Struct
405
- pub fn build_arrow_schema(
406
- root: &SchemaNode,
407
- logger: &RubyLogger,
408
- ) -> Result<Arc<ArrowSchema>, MagnusError> {
409
- match root {
410
- SchemaNode::Struct { fields, .. } => {
411
- // Fields debug output removed - we've fixed the empty struct issue
412
-
413
- let arrow_fields: Vec<ArrowField> =
414
- fields.iter().map(schema_node_to_arrow_field).collect();
415
- let arrow_schema = ArrowSchema::new(arrow_fields);
416
- logger.debug(|| format!("Constructed Arrow schema: {:?}", arrow_schema))?;
417
- Ok(Arc::new(arrow_schema))
418
- }
419
- _ => Err(MagnusError::new(
420
- magnus::exception::arg_error(),
421
- "Top-level schema must be a Struct".to_owned(),
422
- )),
423
- }
424
- }