parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,290 +0,0 @@
1
- use magnus::value::ReprValue; // Add ReprValue trait to scope
2
- use magnus::{Error as MagnusError, IntoValue, RArray, Ruby, TryConvert, Value};
3
-
4
- use crate::types::{ParquetSchemaType as PST, PrimitiveType, SchemaField, SchemaNode};
5
- use crate::utils::parse_string_or_symbol;
6
-
7
- /// Recursively converts a SchemaField to a SchemaNode for any level of nesting
8
- fn convert_schema_field_to_node(field: &SchemaField) -> SchemaNode {
9
- match &field.type_ {
10
- PST::Primitive(primative) => SchemaNode::Primitive {
11
- name: field.name.clone(),
12
- nullable: field.nullable,
13
- parquet_type: *primative,
14
- format: field.format.clone(),
15
- },
16
- PST::List(list_field) => {
17
- // Create item node by recursively converting the list item type to a node
18
- let item_node = match &list_field.item_type {
19
- // For primitive types, create a primitive node with name "item"
20
- PST::Primitive(_) => {
21
- // Use a temporary SchemaField to convert item type
22
- let item_field = SchemaField {
23
- name: "item".to_string(),
24
- type_: list_field.item_type.clone(),
25
- format: list_field.format.map(String::from),
26
- nullable: list_field.nullable,
27
- };
28
- convert_schema_field_to_node(&item_field)
29
- }
30
- // For nested types (List, Map, Struct), recursively convert them
31
- PST::List(_) | PST::Map(_) | PST::Struct(_) => {
32
- // Use a temporary SchemaField to convert item type
33
- let item_field = SchemaField {
34
- name: "item".to_string(),
35
- type_: list_field.item_type.clone(),
36
- format: list_field.format.map(String::from),
37
- nullable: list_field.nullable,
38
- };
39
- convert_schema_field_to_node(&item_field)
40
- }
41
- };
42
-
43
- SchemaNode::List {
44
- name: field.name.clone(),
45
- nullable: field.nullable,
46
- item: Box::new(item_node),
47
- }
48
- }
49
- PST::Map(map_field) => {
50
- let key_field = SchemaField {
51
- name: "key".to_string(),
52
- type_: map_field.key_type.clone(),
53
- format: map_field.key_format.map(String::from),
54
- nullable: false, // Map keys can never be null in Parquet
55
- };
56
- let value_field = SchemaField {
57
- name: "value".to_string(),
58
- type_: map_field.value_type.clone(),
59
- format: map_field.value_format.map(String::from),
60
- nullable: map_field.value_nullable,
61
- };
62
-
63
- let key_node = convert_schema_field_to_node(&key_field);
64
- let value_node = convert_schema_field_to_node(&value_field);
65
-
66
- SchemaNode::Map {
67
- name: field.name.clone(),
68
- nullable: field.nullable,
69
- key: Box::new(key_node),
70
- value: Box::new(value_node),
71
- }
72
- }
73
- PST::Struct(struct_field) => {
74
- // Convert each subfield recursively
75
- let mut field_nodes = Vec::with_capacity(struct_field.fields.len());
76
-
77
- for subfield in struct_field.fields.iter() {
78
- // Recursively convert each subfield, supporting any level of nesting
79
- field_nodes.push(convert_schema_field_to_node(subfield));
80
- }
81
-
82
- SchemaNode::Struct {
83
- name: field.name.clone(),
84
- nullable: field.nullable,
85
- fields: field_nodes,
86
- }
87
- }
88
- }
89
- }
90
-
91
- /// Converts the legacy schema format (array of field hashes) to the new DSL format (SchemaNode)
92
- pub fn legacy_schema_to_dsl(
93
- _ruby: &Ruby,
94
- schema_fields: Vec<SchemaField>,
95
- ) -> Result<SchemaNode, MagnusError> {
96
- // Create a top-level struct node with fields for each schema field
97
- let mut field_nodes = Vec::with_capacity(schema_fields.len());
98
-
99
- for field in schema_fields {
100
- // Use our recursive converter to handle any level of nesting
101
- field_nodes.push(convert_schema_field_to_node(&field));
102
- }
103
-
104
- Ok(SchemaNode::Struct {
105
- name: "".to_string(), // Top level has no name
106
- nullable: false, // Top level is not nullable
107
- fields: field_nodes,
108
- })
109
- }
110
-
111
- /// Parses the legacy format schema (array of field hashes)
112
- pub fn parse_legacy_schema(
113
- ruby: &Ruby,
114
- schema_value: Value,
115
- ) -> Result<Vec<SchemaField>, MagnusError> {
116
- if schema_value.is_nil()
117
- || (schema_value.is_kind_of(ruby.class_array())
118
- && RArray::from_value(schema_value)
119
- .ok_or_else(|| {
120
- MagnusError::new(
121
- ruby.exception_type_error(),
122
- "Schema must be an array of field definitions or nil",
123
- )
124
- })?
125
- .is_empty())
126
- {
127
- // If schema is nil or an empty array, we'll handle this in the caller
128
- return Ok(Vec::new());
129
- }
130
-
131
- if schema_value.is_kind_of(ruby.class_array()) {
132
- let schema_array = RArray::from_value(schema_value).ok_or_else(|| {
133
- MagnusError::new(
134
- ruby.exception_type_error(),
135
- "Schema must be an array of field definitions or nil",
136
- )
137
- })?;
138
- let mut schema = Vec::with_capacity(schema_array.len());
139
-
140
- for (idx, field_hash) in schema_array.into_iter().enumerate() {
141
- if !field_hash.is_kind_of(ruby.class_hash()) {
142
- return Err(MagnusError::new(
143
- ruby.exception_type_error(),
144
- format!("schema[{}] must be a hash", idx),
145
- ));
146
- }
147
-
148
- let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
149
- if entries.len() != 1 {
150
- return Err(MagnusError::new(
151
- ruby.exception_type_error(),
152
- format!("schema[{}] must contain exactly one key-value pair", idx),
153
- ));
154
- }
155
-
156
- let (name, type_value) = &entries[0];
157
- let name_option = parse_string_or_symbol(ruby, *name)?;
158
- let name = name_option.ok_or_else(|| {
159
- MagnusError::new(ruby.exception_runtime_error(), "Field name cannot be nil")
160
- })?;
161
-
162
- let (type_, format, nullable) = if type_value.is_kind_of(ruby.class_hash()) {
163
- let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
164
- let mut type_str = None;
165
- let mut format_str = None;
166
- let mut nullable = true; // Default to true if not specified
167
-
168
- let mut precision: Option<Value> = None;
169
- let mut scale: Option<Value> = None;
170
-
171
- for (key, value) in type_hash {
172
- let key_option = parse_string_or_symbol(ruby, key)?;
173
- let key = key_option.ok_or_else(|| {
174
- MagnusError::new(ruby.exception_runtime_error(), "Type key cannot be nil")
175
- })?;
176
- match key.as_str() {
177
- "type" => type_str = Some(value),
178
- "format" => {
179
- let format_option = parse_string_or_symbol(ruby, value)?;
180
- format_str = format_option;
181
- }
182
- "nullable" => {
183
- // Extract nullable if present - convert to boolean
184
- nullable = bool::try_convert(value).unwrap_or(true);
185
- }
186
- "precision" => {
187
- precision = Some(value);
188
- }
189
- "scale" => {
190
- scale = Some(value);
191
- }
192
- _ => {
193
- return Err(MagnusError::new(
194
- ruby.exception_type_error(),
195
- format!("Unknown key '{}' in type definition", key),
196
- ))
197
- }
198
- }
199
- }
200
-
201
- let type_str = type_str.ok_or_else(|| {
202
- MagnusError::new(
203
- ruby.exception_type_error(),
204
- "Missing 'type' in type definition",
205
- )
206
- })?;
207
-
208
- // Handle decimal type with precision and scale
209
- let mut type_result = PST::try_convert(type_str)?;
210
-
211
- // If it's a decimal type and we have precision and scale, override the type
212
- if let PST::Primitive(PrimitiveType::Decimal128(_, _)) = type_result {
213
- // Do nothing
214
- } else if let Some(type_name) = parse_string_or_symbol(ruby, type_str)? {
215
- if type_name == "decimal" {
216
- let precision_value = precision.unwrap_or_else(|| {
217
- let val: u8 = 38;
218
- val.into_value_with(ruby)
219
- });
220
-
221
- let scale_value = scale.unwrap_or_else(|| {
222
- let val: i8 = 0;
223
- val.into_value_with(ruby)
224
- });
225
-
226
- let precision_u8 = u8::try_convert(precision_value).map_err(|_| {
227
- MagnusError::new(
228
- ruby.exception_type_error(),
229
- "Invalid precision value for decimal type, expected a positive integer".to_string(),
230
- )
231
- })?;
232
-
233
- let scale_i8 = i8::try_convert(scale_value).map_err(|_| {
234
- MagnusError::new(
235
- ruby.exception_type_error(),
236
- "Invalid scale value for decimal type, expected an integer"
237
- .to_string(),
238
- )
239
- })?;
240
-
241
- type_result =
242
- PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
243
- }
244
- }
245
-
246
- (type_result, format_str, nullable)
247
- } else {
248
- (PST::try_convert(*type_value)?, None, true)
249
- };
250
-
251
- schema.push(SchemaField {
252
- name,
253
- type_,
254
- format,
255
- nullable,
256
- });
257
- }
258
-
259
- Ok(schema)
260
- } else {
261
- Err(MagnusError::new(
262
- ruby.exception_type_error(),
263
- "Schema must be an array of field definitions or nil",
264
- ))
265
- }
266
- }
267
-
268
- /// Generates schema fields by inferring from the first row
269
- pub fn infer_schema_from_first_row(
270
- ruby: &Ruby,
271
- first_value: Value,
272
- nullable: bool,
273
- ) -> Result<Vec<SchemaField>, MagnusError> {
274
- let array = RArray::from_value(first_value).ok_or_else(|| {
275
- MagnusError::new(
276
- ruby.exception_type_error(),
277
- "First value must be an array when schema is not provided",
278
- )
279
- })?;
280
-
281
- // Generate field names f0, f1, f2, etc.
282
- Ok((0..array.len())
283
- .map(|i| SchemaField {
284
- name: format!("f{}", i),
285
- type_: PST::Primitive(PrimitiveType::String), // Default to String type when inferring
286
- format: None,
287
- nullable,
288
- })
289
- .collect())
290
- }