parquet 0.5.13 → 0.6.0

This diff shows the changes between two publicly released versions of this package, as published to its public registry. It is provided for informational purposes only.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet/src/writer/write_columns.rs (deleted)
@@ -1,238 +0,0 @@
-use super::{
-    arrow_data_type_to_parquet_schema_type, copy_temp_file_to_io_like, create_writer,
-    parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD,
-};
-use crate::{
-    convert_ruby_array_to_arrow,
-    logger::RubyLogger,
-    types::{schema_node::build_arrow_schema, ParquetGemError, WriterOutput},
-    IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs,
-};
-use crate::{types::PrimitiveType, SchemaNode};
-use arrow_array::{Array, RecordBatch};
-use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
-use std::{rc::Rc, sync::Arc};
-
-#[inline]
-pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-    write_columns_impl(Rc::new(ruby), args).map_err(|e| {
-        let z: MagnusError = e.into();
-        z
-    })?;
-    Ok(())
-}
-
-#[inline]
-fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
-    let ParquetWriteArgs {
-        read_from,
-        write_to,
-        schema,
-        batch_size: _,
-        compression,
-        flush_threshold,
-        sample_size: _,
-        logger,
-    } = parse_parquet_write_args(&ruby, args)?;
-
-    let logger = RubyLogger::new(&ruby, logger)?;
-    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
-
-    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
-    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
-        MagnusError::new(
-            magnus::exception::runtime_error(),
-            format!("Failed to build Arrow schema from DSL schema: {}", e),
-        )
-    })?;
-
-    // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
-
-    if read_from.is_kind_of(ruby.class_enumerator()) {
-        loop {
-            match read_from.funcall::<_, _, Value>("next", ()) {
-                Ok(batch) => {
-                    let batch_array = RArray::from_value(batch).ok_or_else(|| {
-                        MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
-                    })?;
-
-                    // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
-                    batch_array.entry::<RArray>(0).map_err(|_| {
-                        MagnusError::new(
-                            ruby.exception_type_error(),
-                            "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
-                        )
-                    })?;
-
-                    // Validate batch length matches schema
-                    // Get schema length and field names - we only have DSL schema now
-                    let (schema_len, field_names): (usize, Vec<&str>) = {
-                        let fields = match &schema {
-                            SchemaNode::Struct { fields, .. } => fields,
-                            _ => {
-                                return Err(MagnusError::new(
-                                    magnus::exception::type_error(),
-                                    "Root schema node must be a struct type",
-                                ))?
-                            }
-                        };
-                        (
-                            fields.len(),
-                            fields
-                                .iter()
-                                .map(|f| match f {
-                                    SchemaNode::Primitive { name, .. } => name.as_str(),
-                                    SchemaNode::List { name, .. } => name.as_str(),
-                                    SchemaNode::Map { name, .. } => name.as_str(),
-                                    SchemaNode::Struct { name, .. } => name.as_str(),
-                                })
-                                .to_owned()
-                                .collect(),
-                        )
-                    };
-
-                    if batch_array.len() != schema_len {
-                        Err(MagnusError::new(
-                            magnus::exception::type_error(),
-                            format!(
-                                "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                                batch_array.len(),
-                                schema_len,
-                                field_names
-                            ),
-                        ))?;
-                    }
-
-                    // Convert each column in the batch to Arrow arrays
-                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
-                        // Process each field in the DSL schema
-                        let fields = arrow_schema.fields();
-                        let top_fields =
-                            match &schema {
-                                SchemaNode::Struct { fields, .. } => fields,
-                                _ => return Err(MagnusError::new(
-                                    magnus::exception::runtime_error(),
-                                    "Top-level DSL schema must be a struct for columns approach",
-                                ))?,
-                            };
-                        if top_fields.len() != fields.len() {
-                            Err(MagnusError::new(
-                                magnus::exception::runtime_error(),
-                                "Mismatch top-level DSL fields vs Arrow fields",
-                            ))?;
-                        }
-
-                        let mut out = vec![];
-                        for ((arrow_f, dsl_f), col_val) in
-                            fields.iter().zip(top_fields.iter()).zip(batch_array)
-                        {
-                            let col_arr = RArray::from_value(col_val).ok_or_else(|| {
-                                MagnusError::new(
-                                    magnus::exception::type_error(),
-                                    format!("Column '{}' must be an array", arrow_f.name()),
-                                )
-                            })?;
-                            // Get appropriate parquet_type
-                            let ptype = match dsl_f {
-                                SchemaNode::Primitive {
-                                    parquet_type,
-                                    // Format is handled internally now
-                                    ..
-                                } => match *parquet_type {
-                                    PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
-                                    PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
-                                    PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
-                                    PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
-                                    PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
-                                    PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
-                                    PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
-                                    PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
-                                    PrimitiveType::Float32 => {
-                                        PST::Primitive(PrimitiveType::Float32)
-                                    }
-                                    PrimitiveType::Float64 => {
-                                        PST::Primitive(PrimitiveType::Float64)
-                                    }
-                                    PrimitiveType::Decimal128(precision, scale) => {
-                                        PST::Primitive(PrimitiveType::Decimal128(precision, scale))
-                                    }
-                                    PrimitiveType::String => PST::Primitive(PrimitiveType::String),
-                                    PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
-                                    PrimitiveType::Boolean => {
-                                        PST::Primitive(PrimitiveType::Boolean)
-                                    }
-                                    PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
-                                    PrimitiveType::TimestampMillis => {
-                                        PST::Primitive(PrimitiveType::TimestampMillis)
-                                    }
-                                    PrimitiveType::TimestampMicros => {
-                                        PST::Primitive(PrimitiveType::TimestampMicros)
-                                    }
-                                    PrimitiveType::TimeMillis => {
-                                        PST::Primitive(PrimitiveType::TimeMillis)
-                                    }
-                                    PrimitiveType::TimeMicros => {
-                                        PST::Primitive(PrimitiveType::TimeMicros)
-                                    }
-                                    PrimitiveType::Decimal256(precision, scale) => {
-                                        PST::Primitive(PrimitiveType::Decimal256(precision, scale))
-                                    }
-                                },
-                                SchemaNode::List { .. }
-                                | SchemaNode::Map { .. }
-                                | SchemaNode::Struct { .. } => {
-                                    // For nested, we just do a single "column" as well
-                                    arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
-                                }
-                            };
-                            out.push((
-                                arrow_f.name().clone(),
-                                convert_ruby_array_to_arrow(&ruby, col_arr, &ptype)?,
-                            ));
-                        }
-                        out
-                    };
-
-                    // Create and write record batch
-                    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
-                        MagnusError::new(
-                            magnus::exception::runtime_error(),
-                            format!("Failed to create record batch: {}", e),
-                        )
-                    })?;
-
-                    writer.write(&record_batch)?;
-
-                    match &mut writer {
-                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-                            if w.in_progress_size() >= flush_threshold {
-                                w.flush()?;
-                            }
-                        }
-                    }
-                }
-                Err(e) => {
-                    if e.is_kind_of(ruby.exception_stop_iteration()) {
-                        break;
-                    }
-                    Err(e)?;
-                }
-            }
-        }
-    } else {
-        Err(MagnusError::new(
-            magnus::exception::type_error(),
-            "read_from must be an Enumerator".to_string(),
-        ))?;
-    }
-
-    // Ensure everything is written and get the temp file if it exists
-    if let Some(temp_file) = writer.close()? {
-        // If we got a temp file back, we need to copy its contents to the IO-like object
-        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
-    }
-
-    Ok(())
-}
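
Note: the removed implementation assembles each batch with arrow-rs's RecordBatch::try_from_iter, which infers the Arrow schema from (name, array) pairs. Below is a minimal, self-contained sketch of that pattern; the column names ("id", "name") and values are illustrative, not taken from the gem.

// Sketch only: mirrors the RecordBatch::try_from_iter call in the deleted
// write_columns path. Each (name, ArrayRef) pair becomes one schema field.
use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray};

fn main() {
    // Two hypothetical columns of equal length.
    let id: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    let name: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));

    // try_from_iter returns an Err if the arrays have different lengths;
    // the deleted code separately validated the column *count* against
    // the schema before reaching this point.
    let batch = RecordBatch::try_from_iter(vec![("id", id), ("name", name)])
        .expect("equal-length columns");

    assert_eq!(batch.num_columns(), 2);
    assert_eq!(batch.num_rows(), 3);
}

On the Ruby side this corresponds to the batch-of-columns format the deleted code enforces: each element yielded by the read_from enumerator is one batch, shaped as [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]], per the error message in the diff above.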