parquet 0.5.0 → 0.5.1

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,226 @@
+ use super::{
+     arrow_data_type_to_parquet_schema_type, copy_temp_file_to_io_like, create_writer,
+     parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD,
+ };
+ use crate::{
+     convert_ruby_array_to_arrow,
+     logger::RubyLogger,
+     types::{schema_node::build_arrow_schema, ParquetGemError, WriterOutput},
+     IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs,
+ };
+ use crate::{types::PrimitiveType, SchemaNode};
+ use arrow_array::{Array, RecordBatch};
+ use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
+ use std::sync::Arc;
+
+ #[inline]
+ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+     write_columns_impl(Arc::new(ruby), args).map_err(|e| {
+         let z: MagnusError = e.into();
+         z
+     })?;
+     Ok(())
+ }
+
+ #[inline]
+ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size: _,
+         compression,
+         flush_threshold,
+         sample_size: _,
+         logger,
+     } = parse_parquet_write_args(&ruby, args)?;
+
+     let logger = RubyLogger::new(&ruby, logger)?;
+     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
+     // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+     let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to build Arrow schema from DSL schema: {}", e),
+         )
+     })?;
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(batch) => {
+                     let batch_array = RArray::from_value(batch).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
+                     })?;
+
+                     // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
+                     batch_array.entry::<RArray>(0).map_err(|_| {
+                         MagnusError::new(
+                             ruby.exception_type_error(),
+                             "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
+                         )
+                     })?;
+
+                     // Validate batch length matches schema
+                     // Get schema length and field names - we only have DSL schema now
+                     let (schema_len, field_names): (usize, Vec<&str>) = {
+                         let fields = match &schema {
+                             SchemaNode::Struct { fields, .. } => fields,
+                             _ => {
+                                 return Err(MagnusError::new(
+                                     magnus::exception::type_error(),
+                                     "Root schema node must be a struct type",
+                                 ))?
+                             }
+                         };
+                         (
+                             fields.len(),
+                             fields
+                                 .iter()
+                                 .map(|f| match f {
+                                     SchemaNode::Primitive { name, .. } => name.as_str(),
+                                     SchemaNode::List { name, .. } => name.as_str(),
+                                     SchemaNode::Map { name, .. } => name.as_str(),
+                                     SchemaNode::Struct { name, .. } => name.as_str(),
+                                 })
+                                 .to_owned()
+                                 .collect(),
+                         )
+                     };
+
+                     if batch_array.len() != schema_len {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 batch_array.len(),
+                                 schema_len,
+                                 field_names
+                             ),
+                         ))?;
+                     }
+
+                     // Convert each column in the batch to Arrow arrays
+                     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
+                         // Process each field in the DSL schema
+                         let fields = arrow_schema.fields();
+                         let top_fields =
+                             match &schema {
+                                 SchemaNode::Struct { fields, .. } => fields,
+                                 _ => return Err(MagnusError::new(
+                                     magnus::exception::runtime_error(),
+                                     "Top-level DSL schema must be a struct for columns approach",
+                                 ))?,
+                             };
+                         if top_fields.len() != fields.len() {
+                             return Err(MagnusError::new(
+                                 magnus::exception::runtime_error(),
+                                 "Mismatch top-level DSL fields vs Arrow fields",
+                             ))?;
+                         }
+
+                         let mut out = vec![];
+                         for ((arrow_f, dsl_f), col_val) in
+                             fields.iter().zip(top_fields.iter()).zip(batch_array)
+                         {
+                             let col_arr = RArray::from_value(col_val).ok_or_else(|| {
+                                 MagnusError::new(
+                                     magnus::exception::type_error(),
+                                     format!("Column '{}' must be an array", arrow_f.name()),
+                                 )
+                             })?;
+                             // Get appropriate parquet_type
+                             let ptype = match dsl_f {
+                                 SchemaNode::Primitive {
+                                     parquet_type,
+                                     // Format is handled internally now
+                                     ..
+                                 } => match parquet_type {
+                                     &PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
+                                     &PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
+                                     &PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
+                                     &PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
+                                     &PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
+                                     &PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
+                                     &PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
+                                     &PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
+                                     &PrimitiveType::Float32 => {
+                                         PST::Primitive(PrimitiveType::Float32)
+                                     }
+                                     &PrimitiveType::Float64 => {
+                                         PST::Primitive(PrimitiveType::Float64)
+                                     }
+                                     &PrimitiveType::String => PST::Primitive(PrimitiveType::String),
+                                     &PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
+                                     &PrimitiveType::Boolean => {
+                                         PST::Primitive(PrimitiveType::Boolean)
+                                     }
+                                     &PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
+                                     &PrimitiveType::TimestampMillis => {
+                                         PST::Primitive(PrimitiveType::TimestampMillis)
+                                     }
+                                     &PrimitiveType::TimestampMicros => {
+                                         PST::Primitive(PrimitiveType::TimestampMicros)
+                                     }
+                                 },
+                                 SchemaNode::List { .. }
+                                 | SchemaNode::Map { .. }
+                                 | SchemaNode::Struct { .. } => {
+                                     // For nested, we just do a single "column" as well
+                                     arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
+                                 }
+                             };
+                             out.push((
+                                 arrow_f.name().clone(),
+                                 convert_ruby_array_to_arrow(&ruby, col_arr, &ptype)?,
+                             ));
+                         }
+                         out
+                     };
+
+                     // Create and write record batch
+                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+                         MagnusError::new(
+                             magnus::exception::runtime_error(),
+                             format!("Failed to create record batch: {}", e),
+                         )
+                     })?;
+
+                     writer.write(&record_batch)?;
+
+                     match &mut writer {
+                         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                             if w.in_progress_size() >= flush_threshold {
+                                 w.flush()?;
+                             }
+                         }
+                     }
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         break;
+                     }
+                     return Err(e)?;
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator".to_string(),
+         ))?;
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close()? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
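
For context, each iteration of the loop above converts one Ruby batch into a Vec<(String, Arc<dyn Array>)> and hands it to RecordBatch::try_from_iter, which infers the Arrow schema from the arrays and rejects columns of unequal length. A minimal standalone sketch of that constructor in plain arrow_array (the "id"/"name" columns and their values are illustrative, not taken from the gem):

    use std::sync::Arc;
    use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray};

    fn main() {
        // One "batch of columns": (column name, Arrow array) pairs, mirroring the
        // Vec<(String, Arc<dyn Array>)> assembled per batch in write_columns_impl.
        let id: ArrayRef = Arc::new(Int64Array::from(vec![1_i64, 2, 3]));
        let name: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));

        // try_from_iter infers the schema from the arrays and errors if the
        // column lengths disagree -- the same constructor the diff calls per batch.
        let batch = RecordBatch::try_from_iter(vec![("id", id), ("name", name)])
            .expect("columns have equal lengths");
        assert_eq!(batch.num_rows(), 3);
    }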
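The flush check keyed on in_progress_size() is what keeps memory bounded while streaming: once the buffered row group grows past flush_threshold (defaulting to DEFAULT_MEMORY_THRESHOLD, whose value is not shown in this diff), the current row group is cut. The WriterOutput variants presumably wrap parquet's ArrowWriter, which exposes the same two calls; below is a sketch of the same policy against ArrowWriter directly, with a made-up threshold and file path:

    use std::{fs::File, sync::Arc};
    use arrow_array::{ArrayRef, Int64Array, RecordBatch};
    use parquet::arrow::ArrowWriter;

    // Illustrative threshold only; the gem's DEFAULT_MEMORY_THRESHOLD is defined elsewhere.
    const FLUSH_THRESHOLD: usize = 64 * 1024 * 1024;

    fn main() {
        let batch = RecordBatch::try_from_iter(vec![(
            "id",
            Arc::new(Int64Array::from(vec![1_i64, 2, 3])) as ArrayRef,
        )])
        .expect("valid batch");

        let file = File::create("/tmp/flush_demo.parquet").expect("create file");
        let mut writer = ArrowWriter::try_new(file, batch.schema(), None).expect("open writer");

        writer.write(&batch).expect("write batch");
        // Same policy as the diff: close out a row group once the in-progress
        // buffer crosses the threshold, keeping memory bounded across many batches.
        if writer.in_progress_size() >= FLUSH_THRESHOLD {
            writer.flush().expect("flush row group");
        }
        writer.close().expect("finalize footer");
    }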