parquet 0.2.5 → 0.2.7
- checksums.yaml +4 -4
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +7 -3
- data/ext/parquet/src/types/type_conversion.rs +86 -30
- data/ext/parquet/src/types/writer_types.rs +24 -14
- data/ext/parquet/src/utils.rs +16 -5
- data/ext/parquet/src/writer/mod.rs +40 -6
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +37 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
+  data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
+  data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
data/ext/parquet/src/types/core_types.rs
CHANGED
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
 }
 
 #[derive(Debug, Clone)]
-pub struct ListField {
-    pub item_type: ParquetSchemaType,
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub struct MapField {
-    pub key_type: ParquetSchemaType,
-    pub value_type: ParquetSchemaType,
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub enum ParquetSchemaType {
+pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
     Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
     Date32,
     TimestampMillis,
     TimestampMicros,
-    List(Box<ListField>),
-    Map(Box<MapField>),
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
 }
data/ext/parquet/src/types/parquet_value.rs
CHANGED
@@ -157,6 +157,10 @@ impl IntoValue for ParquetValue {
 
 impl ParquetValue {
     pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
         match type_ {
             ParquetSchemaType::Int8 => {
                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -211,15 +215,15 @@ impl ParquetValue {
                 Ok(ParquetValue::Boolean(v))
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, None)?;
                 Ok(ParquetValue::Date32(v))
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, None)?;
                 Ok(ParquetValue::TimestampMillis(v, None))
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, None)?;
                 Ok(ParquetValue::TimestampMicros(v, None))
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
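
With the nil guard above, `from_value` short-circuits a Ruby nil to `ParquetValue::Null` before any per-type conversion runs (the same guard is added to `ColumnCollector` in writer_types.rs below). A minimal Ruby sketch of the behavior this enables; the column names and output path are illustrative, not from the diff:

require "parquet"

# A nil cell is stored as a Parquet null instead of being handed to the
# int32 converter, which has no sensible conversion for nil.
rows = [
  ["alice", 30],
  ["bob", nil]
].each

Parquet.write_rows(
  rows,
  schema: [{ "name" => "string" }, { "age" => "int32" }],
  write_to: "users.parquet"
)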
data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -30,17 +30,35 @@ where
     }
 }
 
-pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
-        // Parse string into Date
-        let date: jiff::civil::Date = s.parse().map_err(|e| {
-            MagnusError::new(
-                magnus::exception::type_error(),
-                format!("Failed to parse '{}' as date32: {}", s, e),
-            )
-        })?;
+        // Parse string into Date using jiff
+        let date = if let Some(fmt) = format {
+            jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Date with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.date())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as date32: {}", s, e),
+                )
+            })?
+        };
 
         let timestamp = date.at(0, 0, 0, 0);
 
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-            MagnusError::new(
-                magnus::exception::type_error(),
-                format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
-            )
-        })?;
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s)
+                .or_else(|e1| {
+                    // Try parsing as DateTime and convert to Timestamp with zero offset
+                    jiff::civil::DateTime::strptime(&fmt, &s)
+                        .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                        .map(|dt| dt.timestamp())
+                        .map_err(|e2| {
+                            MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!(
+                                    "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                    s, fmt, e1, e2
+                                ),
+                            )
+                        })
+                })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                )
+            })?
+        };
         // Convert to milliseconds
         Ok(timestamp.as_millisecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-            MagnusError::new(
-                magnus::exception::type_error(),
-                format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
-            )
-        })?;
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Timestamp with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
+                    dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
+                })
+                .map(|dt| dt.timestamp())
+                .map_err(|e2| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!(
+                            "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                            s, fmt, e1, e2
+                        ),
+                    )
+                })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                )
+            })?
+        };
         // Convert to microseconds
         Ok(timestamp.as_microsecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(item_value)?;
+                let v = convert_to_date32(item_value, list_field.format)?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(item_value)?;
+                let v = convert_to_timestamp_millis(item_value, list_field.format)?;
                 ParquetValue::TimestampMillis(v, None)
            }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(item_value)?;
+                let v = convert_to_timestamp_micros(item_value, list_field.format)?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, map_field.format)?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, map_field.format)?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, map_field.format)?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -592,12 +648,12 @@ pub fn convert_parquet_values_to_arrow(
         };
 
         let mut list_builder = ListBuilder::new(value_builder);
+
         for value in values {
             match value {
                 ParquetValue::List(items) => {
-                    list_builder.append(true);
                     for item in items {
-                        match list_field.item_type {
+                        match &list_field.item_type {
                             ParquetSchemaType::Int8 => append_list_value_copy!(
                                 list_builder,
                                 ParquetSchemaType::Int8,
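
Each converter now threads an optional strptime-style format through to jiff, first trying the target type and then falling back to `jiff::civil::DateTime` pinned to a zero UTC offset, so datetime-shaped strings can still land in a date or timestamp column. A hedged Ruby sketch of the user-facing effect (column name and path are illustrative):

require "parquet"

# The "format" key is forwarded to the jiff strptime calls shown above.
rows = [["03/01/2024"], ["12/25/2024"]].each

Parquet.write_rows(
  rows,
  schema: [{ "shipped_on" => { "type" => "date32", "format" => "%m/%d/%Y" } }],
  write_to: "dates.parquet"
)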
data/ext/parquet/src/types/writer_types.rs
CHANGED
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
 use crate::types::{ListField, MapField, ParquetSchemaType};
 
 #[derive(Debug)]
-pub struct SchemaField {
+pub struct SchemaField<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
 }
 
 #[derive(Debug)]
-pub struct ParquetWriteArgs {
+pub struct ParquetWriteArgs<'a> {
     pub read_from: Value,
     pub write_to: Value,
-    pub schema: Vec<SchemaField>,
+    pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
 }
 
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
     }
 }
 
-impl FromStr for ParquetSchemaType {
+impl<'a> FromStr for ParquetSchemaType<'a> {
     type Err = MagnusError;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                 item_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
                 key_type: ParquetSchemaType::String,
                 value_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             _ => Err(MagnusError::new(
                 magnus::exception::runtime_error(),
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
     }
 }
 
-impl TryConvert for ParquetSchemaType {
+impl<'a> TryConvert for ParquetSchemaType<'a> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
 
 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl Send for ParquetSchemaType {}
+unsafe impl<'a> Send for ParquetSchemaType<'a> {}
 
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
     if value.is_nil() {
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
     }
 }
 
-pub struct ColumnCollector {
+pub struct ColumnCollector<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
     pub values: Vec<crate::types::ParquetValue>,
 }
 
-impl ColumnCollector {
-    pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+impl<'a> ColumnCollector<'a> {
+    pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
         Self {
             name,
             type_,
+            format,
             values: Vec::new(),
         }
     }
@@ -185,6 +190,11 @@ impl ColumnCollector {
             NumericConverter,
         };
 
+        if value.is_nil() {
+            self.values.push(ParquetValue::Null);
+            return Ok(());
+        }
+
         let parquet_value = match &self.type_ {
             ParquetSchemaType::Int8 => {
                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -239,15 +249,15 @@ impl ColumnCollector {
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, self.format.as_deref())?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(list_field) => {
data/ext/parquet/src/utils.rs
CHANGED
@@ -39,7 +39,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, MagnusError> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
+    let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
         parsed_args.keywords,
         &[],
         &["result_type", "columns"],
@@ -48,6 +48,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, MagnusError> {
     let result_type: ParserResultType = match kwargs
         .optional
         .0
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -75,7 +76,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, MagnusError> {
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
-        columns: kwargs.optional.1,
+        columns: kwargs.optional.1.flatten(),
     })
 }
 
@@ -95,7 +96,16 @@ pub fn parse_parquet_columns_args(
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<usize>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
         &["result_type", "columns", "batch_size"],
@@ -104,6 +114,7 @@ pub fn parse_parquet_columns_args(
     let result_type: ParserResultType = match kwargs
         .optional
         .0
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -131,7 +142,7 @@ pub fn parse_parquet_columns_args(
     Ok(ParquetColumnsArgs {
         to_read,
         result_type,
-        columns: kwargs.optional.1,
-        batch_size: kwargs.optional.2,
+        columns: kwargs.optional.1.flatten(),
+        batch_size: kwargs.optional.2.flatten(),
     })
 }
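
Wrapping each optional kwarg as `Option<Option<T>>` and then calling `.flatten()` matches magnus's convention for keyword arguments: the outer `Option` records whether the keyword was passed at all, the inner one whether it was nil. The practical effect, sketched in Ruby with an illustrative file name, is that an explicit nil now behaves like omitting the keyword:

require "parquet"

# These two calls should now be equivalent; previously an explicit nil
# had to convert directly to Vec<String> / usize and could fail.
Parquet.each_row("data.parquet") { |row| p row }
Parquet.each_row("data.parquet", result_type: nil, columns: nil) { |row| p row }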
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -28,7 +28,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
+    let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
         parsed_args.keywords,
         &["schema", "write_to"],
         &["batch_size"],
@@ -59,18 +59,52 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
            ));
         }
 
-        let (name, type_str) = &entries[0];
+        let (name, type_value) = &entries[0];
         let name = String::try_convert(name.clone())?;
-        let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
 
-        schema.push(SchemaField { name, type_ });
+        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+            let mut type_str = None;
+            let mut format_str = None;
+
+            for (key, value) in type_hash {
+                let key = String::try_convert(key)?;
+                match key.as_str() {
+                    "type" => type_str = Some(value),
+                    "format" => format_str = Some(String::try_convert(value)?),
+                    _ => {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!("Unknown key '{}' in type definition", key),
+                        ))
+                    }
+                }
+            }
+
+            let type_str = type_str.ok_or_else(|| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Missing 'type' in type definition",
+                )
+            })?;
+
+            (ParquetSchemaType::try_convert(type_str)?, format_str)
+        } else {
+            (ParquetSchemaType::try_convert(type_value.clone())?, None)
+        };
+
+        schema.push(SchemaField {
+            name,
+            type_,
+            format,
+        });
     }
 
     Ok(ParquetWriteArgs {
         read_from,
         write_to: kwargs.required.1,
         schema,
-        batch_size: kwargs.optional.0,
+        batch_size: kwargs.optional.0.flatten(),
     })
 }
 
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     // Create collectors for each column
     let mut column_collectors: Vec<ColumnCollector> = schema
         .into_iter()
-        .map(|field| ColumnCollector::new(field.name, field.type_))
+        .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
         .collect();
 
     let mut rows_in_batch = 0;
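
`parse_parquet_write_args` now accepts each schema entry's type as either a bare string or a hash carrying a required `type` key and an optional `format` key; any other key raises a TypeError. A sketch showing both spellings side by side (names, values, and path are illustrative):

require "parquet"

rows = [
  ["2024-01-15 10:30:00", 1],
  ["2024-01-16 11:45:00", 2]
].each

Parquet.write_rows(
  rows,
  schema: [
    # Hash form: parsed for "type" and "format"; anything else is rejected.
    { "created_at" => { "type" => "timestamp_millis", "format" => "%Y-%m-%d %H:%M:%S" } },
    # String form: unchanged, format stays unset on the Rust side.
    { "count" => "int8" }
  ],
  write_to: "events.parquet"
)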
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,4 +1,4 @@
-# typed:
+# typed: true
 
 module Parquet
   # Options:
@@ -7,13 +7,20 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   #   This is useful for reducing how much data is read and improving performance.
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String])
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
@@ -24,6 +31,14 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer)
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
@@ -32,14 +47,22 @@ module Parquet
       batch_size: T.nilable(Integer),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of values representing each row
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
   sig do
@@ -55,7 +78,16 @@ module Parquet
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of column batches
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
+  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
   sig do
     params(