parquet 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +7 -3
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +113 -0
- data/ext/parquet/src/reader/mod.rs +27 -13
- data/ext/parquet/src/reader/parquet_column_reader.rs +38 -78
- data/ext/parquet/src/reader/parquet_row_reader.rs +42 -19
- data/ext/parquet/src/types/core_types.rs +57 -1
- data/ext/parquet/src/types/mod.rs +8 -1
- data/ext/parquet/src/types/parquet_value.rs +211 -35
- data/ext/parquet/src/types/record_types.rs +18 -15
- data/ext/parquet/src/types/schema_converter.rs +349 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +18 -8
- data/ext/parquet/src/types/type_conversion.rs +1106 -511
- data/ext/parquet/src/types/writer_types.rs +78 -107
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +828 -280
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +7 -2
@@ -1,32 +1,36 @@
|
|
1
|
+
use super::core_types::SchemaNode;
|
2
|
+
use crate::{
|
3
|
+
reader::ReaderError,
|
4
|
+
types::{ListField, MapField, ParquetSchemaType},
|
5
|
+
};
|
6
|
+
use arrow_array::{Array, RecordBatch};
|
7
|
+
use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
|
8
|
+
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
1
9
|
use std::{
|
2
10
|
io::{self, Write},
|
3
11
|
str::FromStr,
|
4
12
|
sync::Arc,
|
5
13
|
};
|
6
|
-
|
7
|
-
use arrow_array::{Array, RecordBatch};
|
8
|
-
use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
|
9
|
-
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
10
14
|
use tempfile::NamedTempFile;
|
11
15
|
|
12
|
-
|
13
|
-
|
14
|
-
#[derive(Debug)]
|
16
|
+
#[derive(Debug, Clone)]
|
15
17
|
pub struct SchemaField<'a> {
|
16
18
|
pub name: String,
|
17
19
|
pub type_: ParquetSchemaType<'a>,
|
18
20
|
pub format: Option<String>,
|
21
|
+
pub nullable: bool,
|
19
22
|
}
|
20
23
|
|
21
24
|
#[derive(Debug)]
|
22
|
-
pub struct ParquetWriteArgs
|
25
|
+
pub struct ParquetWriteArgs {
|
23
26
|
pub read_from: Value,
|
24
27
|
pub write_to: Value,
|
25
|
-
pub schema:
|
28
|
+
pub schema: SchemaNode,
|
26
29
|
pub batch_size: Option<usize>,
|
27
30
|
pub flush_threshold: Option<usize>,
|
28
31
|
pub compression: Option<String>,
|
29
32
|
pub sample_size: Option<usize>,
|
33
|
+
pub logger: Option<Value>,
|
30
34
|
}
|
31
35
|
|
32
36
|
pub trait SendableWrite: Send + Write {}
|
@@ -59,6 +63,42 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
59
63
|
type Err = MagnusError;
|
60
64
|
|
61
65
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
66
|
+
// Check if it's a list type
|
67
|
+
if let Some(inner_type_str) = s.strip_prefix("list<").and_then(|s| s.strip_suffix(">")) {
|
68
|
+
let inner_type = inner_type_str.parse::<ParquetSchemaType>()?;
|
69
|
+
return Ok(ParquetSchemaType::List(Box::new(ListField {
|
70
|
+
item_type: inner_type,
|
71
|
+
format: None,
|
72
|
+
nullable: true,
|
73
|
+
})));
|
74
|
+
}
|
75
|
+
|
76
|
+
// Check if it's a map type
|
77
|
+
if let Some(kv_types_str) = s.strip_prefix("map<").and_then(|s| s.strip_suffix(">")) {
|
78
|
+
let parts: Vec<&str> = kv_types_str.splitn(2, ',').collect();
|
79
|
+
if parts.len() != 2 {
|
80
|
+
return Err(MagnusError::new(
|
81
|
+
magnus::exception::runtime_error(),
|
82
|
+
format!(
|
83
|
+
"Invalid map format. Expected 'map<keyType,valueType>', got '{}'",
|
84
|
+
s
|
85
|
+
),
|
86
|
+
));
|
87
|
+
}
|
88
|
+
|
89
|
+
let key_type = parts[0].trim().parse::<ParquetSchemaType>()?;
|
90
|
+
let value_type = parts[1].trim().parse::<ParquetSchemaType>()?;
|
91
|
+
|
92
|
+
return Ok(ParquetSchemaType::Map(Box::new(MapField {
|
93
|
+
key_type,
|
94
|
+
value_type,
|
95
|
+
key_format: None,
|
96
|
+
value_format: None,
|
97
|
+
value_nullable: true,
|
98
|
+
})));
|
99
|
+
}
|
100
|
+
|
101
|
+
// Handle primitive types
|
62
102
|
match s {
|
63
103
|
"int8" => Ok(ParquetSchemaType::Int8),
|
64
104
|
"int16" => Ok(ParquetSchemaType::Int16),
|
@@ -77,13 +117,16 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
77
117
|
"timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
|
78
118
|
"timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
|
79
119
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
80
|
-
item_type: ParquetSchemaType::
|
120
|
+
item_type: ParquetSchemaType::String,
|
81
121
|
format: None,
|
122
|
+
nullable: true,
|
82
123
|
}))),
|
83
124
|
"map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
|
84
125
|
key_type: ParquetSchemaType::String,
|
85
|
-
value_type: ParquetSchemaType::
|
86
|
-
|
126
|
+
value_type: ParquetSchemaType::String,
|
127
|
+
key_format: None,
|
128
|
+
value_format: None,
|
129
|
+
value_nullable: true,
|
87
130
|
}))),
|
88
131
|
_ => Err(MagnusError::new(
|
89
132
|
magnus::exception::runtime_error(),
|
@@ -98,7 +141,11 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
|
|
98
141
|
let ruby = unsafe { Ruby::get_unchecked() };
|
99
142
|
let schema_type = parse_string_or_symbol(&ruby, value)?;
|
100
143
|
|
101
|
-
schema_type
|
144
|
+
schema_type
|
145
|
+
.ok_or_else(|| {
|
146
|
+
MagnusError::new(magnus::exception::type_error(), "Invalid schema type")
|
147
|
+
})?
|
148
|
+
.parse()
|
102
149
|
}
|
103
150
|
}
|
104
151
|
|
@@ -157,126 +204,50 @@ impl WriterOutput {
|
|
157
204
|
}
|
158
205
|
}
|
159
206
|
|
160
|
-
pub struct ParquetErrorWrapper(pub ParquetError);
|
161
|
-
|
162
|
-
impl From<ParquetErrorWrapper> for MagnusError {
|
163
|
-
fn from(err: ParquetErrorWrapper) -> Self {
|
164
|
-
MagnusError::new(
|
165
|
-
magnus::exception::runtime_error(),
|
166
|
-
format!("Parquet error: {}", err.0),
|
167
|
-
)
|
168
|
-
}
|
169
|
-
}
|
170
|
-
|
171
207
|
pub struct ColumnCollector<'a> {
|
172
208
|
pub name: String,
|
173
209
|
pub type_: ParquetSchemaType<'a>,
|
174
210
|
pub format: Option<String>,
|
211
|
+
pub nullable: bool,
|
175
212
|
pub values: Vec<crate::types::ParquetValue>,
|
176
213
|
}
|
177
214
|
|
178
215
|
impl<'a> ColumnCollector<'a> {
|
179
|
-
pub fn new(
|
216
|
+
pub fn new(
|
217
|
+
name: String,
|
218
|
+
type_: ParquetSchemaType<'a>,
|
219
|
+
format: Option<String>,
|
220
|
+
nullable: bool,
|
221
|
+
) -> Self {
|
180
222
|
Self {
|
181
223
|
name,
|
182
224
|
type_,
|
183
225
|
format,
|
226
|
+
nullable,
|
184
227
|
values: Vec::new(),
|
185
228
|
}
|
186
229
|
}
|
187
230
|
|
188
231
|
pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
|
189
232
|
use crate::types::ParquetValue;
|
190
|
-
use crate::{
|
191
|
-
convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
|
192
|
-
convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
|
193
|
-
NumericConverter,
|
194
|
-
};
|
195
233
|
|
196
234
|
if value.is_nil() {
|
197
|
-
self.
|
198
|
-
|
235
|
+
if !self.nullable {
|
236
|
+
// For non-nullable fields, raise an error
|
237
|
+
return Err(MagnusError::new(
|
238
|
+
magnus::exception::runtime_error(),
|
239
|
+
"Cannot write nil value for non-nullable field",
|
240
|
+
));
|
241
|
+
}
|
199
242
|
}
|
200
243
|
|
201
|
-
|
202
|
-
|
203
|
-
let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
|
204
|
-
ParquetValue::Int8(v)
|
205
|
-
}
|
206
|
-
ParquetSchemaType::Int16 => {
|
207
|
-
let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
|
208
|
-
ParquetValue::Int16(v)
|
209
|
-
}
|
210
|
-
ParquetSchemaType::Int32 => {
|
211
|
-
let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
|
212
|
-
ParquetValue::Int32(v)
|
213
|
-
}
|
214
|
-
ParquetSchemaType::Int64 => {
|
215
|
-
let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
|
216
|
-
ParquetValue::Int64(v)
|
217
|
-
}
|
218
|
-
ParquetSchemaType::UInt8 => {
|
219
|
-
let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
|
220
|
-
ParquetValue::UInt8(v)
|
221
|
-
}
|
222
|
-
ParquetSchemaType::UInt16 => {
|
223
|
-
let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
|
224
|
-
ParquetValue::UInt16(v)
|
225
|
-
}
|
226
|
-
ParquetSchemaType::UInt32 => {
|
227
|
-
let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
|
228
|
-
ParquetValue::UInt32(v)
|
229
|
-
}
|
230
|
-
ParquetSchemaType::UInt64 => {
|
231
|
-
let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
|
232
|
-
ParquetValue::UInt64(v)
|
233
|
-
}
|
234
|
-
ParquetSchemaType::Float => {
|
235
|
-
let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
|
236
|
-
ParquetValue::Float32(v)
|
237
|
-
}
|
238
|
-
ParquetSchemaType::Double => {
|
239
|
-
let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
|
240
|
-
ParquetValue::Float64(v)
|
241
|
-
}
|
242
|
-
ParquetSchemaType::String => {
|
243
|
-
let v = convert_to_string(value)?;
|
244
|
-
ParquetValue::String(v)
|
245
|
-
}
|
246
|
-
ParquetSchemaType::Binary => {
|
247
|
-
let v = convert_to_binary(value)?;
|
248
|
-
ParquetValue::Bytes(v)
|
249
|
-
}
|
250
|
-
ParquetSchemaType::Boolean => {
|
251
|
-
let v = convert_to_boolean(value)?;
|
252
|
-
ParquetValue::Boolean(v)
|
253
|
-
}
|
254
|
-
ParquetSchemaType::Date32 => {
|
255
|
-
let v = convert_to_date32(value, self.format.as_deref())?;
|
256
|
-
ParquetValue::Date32(v)
|
257
|
-
}
|
258
|
-
ParquetSchemaType::TimestampMillis => {
|
259
|
-
let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
|
260
|
-
ParquetValue::TimestampMillis(v, None)
|
261
|
-
}
|
262
|
-
ParquetSchemaType::TimestampMicros => {
|
263
|
-
let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
|
264
|
-
ParquetValue::TimestampMicros(v, None)
|
265
|
-
}
|
266
|
-
ParquetSchemaType::List(list_field) => {
|
267
|
-
let values = convert_to_list(value, list_field)?;
|
268
|
-
ParquetValue::List(values)
|
269
|
-
}
|
270
|
-
ParquetSchemaType::Map(map_field) => {
|
271
|
-
let map = convert_to_map(value, map_field)?;
|
272
|
-
ParquetValue::Map(map)
|
273
|
-
}
|
274
|
-
};
|
244
|
+
// For all other types, proceed as normal
|
245
|
+
let parquet_value = ParquetValue::from_value(value, &self.type_, self.format.as_deref())?;
|
275
246
|
self.values.push(parquet_value);
|
276
247
|
Ok(())
|
277
248
|
}
|
278
249
|
|
279
|
-
pub fn take_array(&mut self) -> Result<Arc<dyn Array>,
|
250
|
+
pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ReaderError> {
|
280
251
|
let values = std::mem::take(&mut self.values);
|
281
252
|
crate::convert_parquet_values_to_arrow(values, &self.type_)
|
282
253
|
}
|
data/ext/parquet/src/utils.rs
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
+
use crate::ParserResultType;
|
1
2
|
use magnus::{
|
2
3
|
scan_args::{get_kwargs, scan_args},
|
3
4
|
value::ReprValue,
|
4
5
|
Error, RString, Ruby, Symbol, Value,
|
5
6
|
};
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
|
8
|
+
/// Convert a Ruby Value to a String, handling both String and Symbol types
|
9
|
+
pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
|
10
10
|
if value.is_nil() {
|
11
11
|
Ok(None)
|
12
12
|
} else if value.is_kind_of(ruby.class_string()) {
|
@@ -33,9 +33,10 @@ pub struct ParquetRowsArgs {
|
|
33
33
|
pub result_type: ParserResultType,
|
34
34
|
pub columns: Option<Vec<String>>,
|
35
35
|
pub strict: bool,
|
36
|
+
pub logger: Option<Value>,
|
36
37
|
}
|
37
38
|
|
38
|
-
/// Parse common arguments for
|
39
|
+
/// Parse common arguments for parquet row iteration
|
39
40
|
pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
|
40
41
|
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
41
42
|
let (to_read,) = parsed_args.required;
|
@@ -47,12 +48,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
|
|
47
48
|
Option<Option<Value>>,
|
48
49
|
Option<Option<Vec<String>>>,
|
49
50
|
Option<Option<bool>>,
|
51
|
+
Option<Option<Value>>,
|
50
52
|
),
|
51
53
|
(),
|
52
54
|
>(
|
53
55
|
parsed_args.keywords,
|
54
56
|
&[],
|
55
|
-
&["result_type", "columns", "strict"],
|
57
|
+
&["result_type", "columns", "strict", "logger"],
|
56
58
|
)?;
|
57
59
|
|
58
60
|
let result_type: ParserResultType = match kwargs
|
@@ -84,12 +86,14 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
|
|
84
86
|
};
|
85
87
|
|
86
88
|
let strict = kwargs.optional.2.flatten().unwrap_or(true);
|
89
|
+
let logger = kwargs.optional.3.flatten();
|
87
90
|
|
88
91
|
Ok(ParquetRowsArgs {
|
89
92
|
to_read,
|
90
93
|
result_type,
|
91
94
|
columns: kwargs.optional.1.flatten(),
|
92
95
|
strict,
|
96
|
+
logger,
|
93
97
|
})
|
94
98
|
}
|
95
99
|
|
@@ -100,9 +104,10 @@ pub struct ParquetColumnsArgs {
|
|
100
104
|
pub columns: Option<Vec<String>>,
|
101
105
|
pub batch_size: Option<usize>,
|
102
106
|
pub strict: bool,
|
107
|
+
pub logger: Option<Value>,
|
103
108
|
}
|
104
109
|
|
105
|
-
/// Parse common arguments for
|
110
|
+
/// Parse common arguments for parquet column iteration
|
106
111
|
pub fn parse_parquet_columns_args(
|
107
112
|
ruby: &Ruby,
|
108
113
|
args: &[Value],
|
@@ -118,12 +123,13 @@ pub fn parse_parquet_columns_args(
|
|
118
123
|
Option<Option<Vec<String>>>,
|
119
124
|
Option<Option<usize>>,
|
120
125
|
Option<Option<bool>>,
|
126
|
+
Option<Option<Value>>,
|
121
127
|
),
|
122
128
|
(),
|
123
129
|
>(
|
124
130
|
parsed_args.keywords,
|
125
131
|
&[],
|
126
|
-
&["result_type", "columns", "batch_size", "strict"],
|
132
|
+
&["result_type", "columns", "batch_size", "strict", "logger"],
|
127
133
|
)?;
|
128
134
|
|
129
135
|
let result_type: ParserResultType = match kwargs
|
@@ -154,11 +160,25 @@ pub fn parse_parquet_columns_args(
|
|
154
160
|
None => ParserResultType::Hash,
|
155
161
|
};
|
156
162
|
|
163
|
+
let batch_size = kwargs.optional.2.flatten();
|
164
|
+
if let Some(sz) = batch_size {
|
165
|
+
if sz <= 0 {
|
166
|
+
return Err(Error::new(
|
167
|
+
ruby.exception_arg_error(),
|
168
|
+
format!("batch_size must be > 0, got {}", sz),
|
169
|
+
));
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
let strict = kwargs.optional.3.flatten().unwrap_or(true);
|
174
|
+
let logger = kwargs.optional.4.flatten();
|
175
|
+
|
157
176
|
Ok(ParquetColumnsArgs {
|
158
177
|
to_read,
|
159
178
|
result_type,
|
160
179
|
columns: kwargs.optional.1.flatten(),
|
161
|
-
batch_size
|
162
|
-
strict
|
180
|
+
batch_size,
|
181
|
+
strict,
|
182
|
+
logger,
|
163
183
|
})
|
164
184
|
}
|