parquet 0.5.0 → 0.5.1
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/parquet/src/header_cache.rs +4 -9
- data/ext/parquet/src/logger.rs +2 -2
- data/ext/parquet/src/reader/common.rs +12 -15
- data/ext/parquet/src/reader/mod.rs +0 -56
- data/ext/parquet/src/reader/parquet_column_reader.rs +20 -16
- data/ext/parquet/src/reader/parquet_row_reader.rs +21 -14
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +2 -17
- data/ext/parquet/src/types/mod.rs +56 -0
- data/ext/parquet/src/types/parquet_value.rs +101 -95
- data/ext/parquet/src/types/record_types.rs +12 -14
- data/ext/parquet/src/types/schema_converter.rs +4 -109
- data/ext/parquet/src/types/timestamp.rs +3 -5
- data/ext/parquet/src/types/type_conversion.rs +116 -81
- data/ext/parquet/src/types/writer_types.rs +26 -54
- data/ext/parquet/src/writer/mod.rs +176 -839
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/version.rb +1 -1
- metadata +3 -1
data/ext/parquet/src/writer/write_columns.rs (new file)

```diff
@@ -0,0 +1,226 @@
+use super::{
+    arrow_data_type_to_parquet_schema_type, copy_temp_file_to_io_like, create_writer,
+    parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD,
+};
+use crate::{
+    convert_ruby_array_to_arrow,
+    logger::RubyLogger,
+    types::{schema_node::build_arrow_schema, ParquetGemError, WriterOutput},
+    IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs,
+};
+use crate::{types::PrimitiveType, SchemaNode};
+use arrow_array::{Array, RecordBatch};
+use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
+use std::sync::Arc;
+
+#[inline]
+pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+    write_columns_impl(Arc::new(ruby), args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })?;
+    Ok(())
+}
+
+#[inline]
+fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+    let ParquetWriteArgs {
+        read_from,
+        write_to,
+        schema,
+        batch_size: _,
+        compression,
+        flush_threshold,
+        sample_size: _,
+        logger,
+    } = parse_parquet_write_args(&ruby, args)?;
+
+    let logger = RubyLogger::new(&ruby, logger)?;
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
+    // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+    let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+        MagnusError::new(
+            magnus::exception::runtime_error(),
+            format!("Failed to build Arrow schema from DSL schema: {}", e),
+        )
+    })?;
+
+    // Create the writer
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
+
+    if read_from.is_kind_of(ruby.class_enumerator()) {
+        loop {
+            match read_from.funcall::<_, _, Value>("next", ()) {
+                Ok(batch) => {
+                    let batch_array = RArray::from_value(batch).ok_or_else(|| {
+                        MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
+                    })?;
+
+                    // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
+                    batch_array.entry::<RArray>(0).map_err(|_| {
+                        MagnusError::new(
+                            ruby.exception_type_error(),
+                            "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
+                        )
+                    })?;
+
+                    // Validate batch length matches schema
+                    // Get schema length and field names - we only have DSL schema now
+                    let (schema_len, field_names): (usize, Vec<&str>) = {
+                        let fields = match &schema {
+                            SchemaNode::Struct { fields, .. } => fields,
+                            _ => {
+                                return Err(MagnusError::new(
+                                    magnus::exception::type_error(),
+                                    "Root schema node must be a struct type",
+                                ))?
+                            }
+                        };
+                        (
+                            fields.len(),
+                            fields
+                                .iter()
+                                .map(|f| match f {
+                                    SchemaNode::Primitive { name, .. } => name.as_str(),
+                                    SchemaNode::List { name, .. } => name.as_str(),
+                                    SchemaNode::Map { name, .. } => name.as_str(),
+                                    SchemaNode::Struct { name, .. } => name.as_str(),
+                                })
+                                .to_owned()
+                                .collect(),
+                        )
+                    };
+
+                    if batch_array.len() != schema_len {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                batch_array.len(),
+                                schema_len,
+                                field_names
+                            ),
+                        ))?;
+                    }
+
+                    // Convert each column in the batch to Arrow arrays
+                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = {
+                        // Process each field in the DSL schema
+                        let fields = arrow_schema.fields();
+                        let top_fields =
+                            match &schema {
+                                SchemaNode::Struct { fields, .. } => fields,
+                                _ => return Err(MagnusError::new(
+                                    magnus::exception::runtime_error(),
+                                    "Top-level DSL schema must be a struct for columns approach",
+                                ))?,
+                            };
+                        if top_fields.len() != fields.len() {
+                            return Err(MagnusError::new(
+                                magnus::exception::runtime_error(),
+                                "Mismatch top-level DSL fields vs Arrow fields",
+                            ))?;
+                        }
+
+                        let mut out = vec![];
+                        for ((arrow_f, dsl_f), col_val) in
+                            fields.iter().zip(top_fields.iter()).zip(batch_array)
+                        {
+                            let col_arr = RArray::from_value(col_val).ok_or_else(|| {
+                                MagnusError::new(
+                                    magnus::exception::type_error(),
+                                    format!("Column '{}' must be an array", arrow_f.name()),
+                                )
+                            })?;
+                            // Get appropriate parquet_type
+                            let ptype = match dsl_f {
+                                SchemaNode::Primitive {
+                                    parquet_type,
+                                    // Format is handled internally now
+                                    ..
+                                } => match parquet_type {
+                                    &PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
+                                    &PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
+                                    &PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
+                                    &PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
+                                    &PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
+                                    &PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
+                                    &PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
+                                    &PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
+                                    &PrimitiveType::Float32 => {
+                                        PST::Primitive(PrimitiveType::Float32)
+                                    }
+                                    &PrimitiveType::Float64 => {
+                                        PST::Primitive(PrimitiveType::Float64)
+                                    }
+                                    &PrimitiveType::String => PST::Primitive(PrimitiveType::String),
+                                    &PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
+                                    &PrimitiveType::Boolean => {
+                                        PST::Primitive(PrimitiveType::Boolean)
+                                    }
+                                    &PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
+                                    &PrimitiveType::TimestampMillis => {
+                                        PST::Primitive(PrimitiveType::TimestampMillis)
+                                    }
+                                    &PrimitiveType::TimestampMicros => {
+                                        PST::Primitive(PrimitiveType::TimestampMicros)
+                                    }
+                                },
+                                SchemaNode::List { .. }
+                                | SchemaNode::Map { .. }
+                                | SchemaNode::Struct { .. } => {
+                                    // For nested, we just do a single "column" as well
+                                    arrow_data_type_to_parquet_schema_type(arrow_f.data_type())?
+                                }
+                            };
+                            out.push((
+                                arrow_f.name().clone(),
+                                convert_ruby_array_to_arrow(&ruby, col_arr, &ptype)?,
+                            ));
+                        }
+                        out
+                    };
+
+                    // Create and write record batch
+                    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+                        MagnusError::new(
+                            magnus::exception::runtime_error(),
+                            format!("Failed to create record batch: {}", e),
+                        )
+                    })?;
+
+                    writer.write(&record_batch)?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush()?;
+                            }
+                        }
+                    }
+                }
+                Err(e) => {
+                    if e.is_kind_of(ruby.exception_stop_iteration()) {
+                        break;
+                    }
+                    return Err(e)?;
+                }
+            }
+        }
+    } else {
+        return Err(MagnusError::new(
+            magnus::exception::type_error(),
+            "read_from must be an Enumerator".to_string(),
+        ))?;
+    }
+
+    // Ensure everything is written and get the temp file if it exists
+    if let Some(temp_file) = writer.close()? {
+        // If we got a temp file back, we need to copy its contents to the IO-like object
+        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+    }
+
+    Ok(())
+}
```
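For context, the new `write_columns` entry point requires `read_from` to be an Enumerator yielding batches of columns, where each batch is an array of column arrays: `[[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]]`. A minimal sketch of what calling this from Ruby could look like, assuming the gem exposes it as `Parquet.write_columns` with `schema:` and `write_to:` keyword arguments and accepts a hash-list schema (the Ruby-level signature and schema format are not part of this diff):

```ruby
require "parquet"

# Hypothetical two-column schema; the exact schema format the gem accepts
# is an assumption, not shown in this diff.
schema = [
  { "id" => "int64" },
  { "name" => "string" },
]

# Each batch is an array of columns, one array per schema field:
# [[id values], [name values]]. A batch whose column count differs from
# the schema raises the "Batch column count ... does not match" TypeError above.
batches = [
  [[1, 2, 3], %w[a b c]],
  [[4, 5], %w[d e]],
]

# read_from must be an Enumerator (here via Array#each without a block),
# otherwise the writer raises a TypeError.
Parquet.write_columns(batches.each, schema: schema, write_to: "out.parquet")
```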