parquet 0.2.5 → 0.2.7
This diff covers the published contents of two publicly available package versions from a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +7 -3
- data/ext/parquet/src/types/type_conversion.rs +86 -30
- data/ext/parquet/src/types/writer_types.rs +24 -14
- data/ext/parquet/src/utils.rs +16 -5
- data/ext/parquet/src/writer/mod.rs +40 -6
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +37 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
+  data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
+  data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
data/ext/parquet/src/types/core_types.rs
CHANGED
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
 }
 
 #[derive(Debug, Clone)]
-pub struct ListField {
-    pub item_type: ParquetSchemaType,
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub struct MapField {
-    pub key_type: ParquetSchemaType,
-    pub value_type: ParquetSchemaType,
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub enum ParquetSchemaType {
+pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
     Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
     Date32,
     TimestampMillis,
     TimestampMicros,
-    List(Box<ListField>),
-    Map(Box<MapField>),
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
 }
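
Note on this hunk: the new `'a` lifetime lets list and map fields borrow an optional strptime-style `format` string instead of owning it. A minimal self-contained sketch of the same shape (simplified names, not the gem's full enum):

// Sketch of a lifetime-parameterized schema enum whose nested fields
// borrow an optional format string rather than owning it.
#[derive(Debug, Clone)]
enum SchemaType<'a> {
    Date32,
    List(Box<ListField<'a>>),
}

#[derive(Debug, Clone)]
struct ListField<'a> {
    item_type: SchemaType<'a>,
    format: Option<&'a str>, // borrowed, hence the lifetime on the enum
}

fn main() {
    let fmt = String::from("%Y-%m-%d");
    // The schema borrows `fmt` for as long as the schema value lives.
    let schema = SchemaType::List(Box::new(ListField {
        item_type: SchemaType::Date32,
        format: Some(&fmt),
    }));
    println!("{:?}", schema);
}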
data/ext/parquet/src/types/parquet_value.rs
CHANGED
@@ -157,6 +157,10 @@ impl IntoValue for ParquetValue {
 
 impl ParquetValue {
     pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
         match type_ {
             ParquetSchemaType::Int8 => {
                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -211,15 +215,15 @@ impl ParquetValue {
                 Ok(ParquetValue::Boolean(v))
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, None)?;
                 Ok(ParquetValue::Date32(v))
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, None)?;
                 Ok(ParquetValue::TimestampMillis(v, None))
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, None)?;
                 Ok(ParquetValue::TimestampMicros(v, None))
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
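
Note on the `is_nil()` hunk: nil now short-circuits to `ParquetValue::Null` before any type-specific conversion runs, so every column type accepts nil. A standalone sketch of the pattern, with `Option` standing in for a possibly-nil Ruby value:

#[derive(Debug, PartialEq)]
enum ParquetValue {
    Null,
    Int32(i32),
}

// Stand-in for `from_value`: `None` plays the role of Ruby's nil.
fn from_value(value: Option<i32>) -> ParquetValue {
    if value.is_none() {
        return ParquetValue::Null; // mirrors the new early return
    }
    ParquetValue::Int32(value.unwrap())
}

fn main() {
    assert_eq!(from_value(None), ParquetValue::Null);
    assert_eq!(from_value(Some(42)), ParquetValue::Int32(42));
}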
data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -30,17 +30,35 @@ where
     }
 }
 
-pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
-        // Parse string into
-        let date
-
-
-
-
-
+        // Parse string into Date using jiff
+        let date = if let Some(fmt) = format {
+            jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Date with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.date())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as date32: {}", s, e),
+                )
+            })?
+        };
 
         let timestamp = date.at(0, 0, 0, 0);
 
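
Note on this hunk: with a format, parsing tries `Date::strptime` first, then falls back to `DateTime::strptime` pinned to UTC; without one it falls back to ISO 8601 via `parse`. A minimal sketch of the same fallback chain against jiff directly (error handling simplified; the real code wraps errors in `MagnusError`), reducing the result to days since the Unix epoch as Date32 requires. The two timestamp converters below follow the same chain, ending in `as_millisecond()`/`as_microsecond()` instead:

use jiff::civil::{Date, DateTime};
use jiff::tz::{Offset, TimeZone};

fn parse_date32(s: &str, format: Option<&str>) -> Result<i32, jiff::Error> {
    let date: Date = match format {
        // Try the format as a plain date, then as a datetime pinned to UTC.
        Some(fmt) => Date::strptime(fmt, s).or_else(|_| {
            DateTime::strptime(fmt, s)
                .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
                .map(|zdt| zdt.date())
        })?,
        // No format: fall back to ISO 8601.
        None => s.parse()?,
    };
    // Date32 counts days since 1970-01-01; midnight UTC avoids zone skew.
    let zdt = date.at(0, 0, 0, 0).to_zoned(TimeZone::fixed(Offset::constant(0)))?;
    Ok((zdt.timestamp().as_second() / 86400) as i32)
}

fn main() -> Result<(), jiff::Error> {
    assert_eq!(parse_date32("1970-01-02", None)?, 1);
    assert_eq!(parse_date32("02/01/1970", Some("%d/%m/%Y"))?, 1);
    Ok(())
}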
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s)
+                .or_else(|e1| {
+                    // Try parsing as DateTime and convert to Timestamp with zero offset
+                    jiff::civil::DateTime::strptime(&fmt, &s)
+                        .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                        .map(|dt| dt.timestamp())
+                        .map_err(|e2| {
+                            MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!(
+                                    "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                    s, fmt, e1, e2
+                                ),
+                            )
+                        })
+                })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                )
+            })?
+        };
         // Convert to milliseconds
         Ok(timestamp.as_millisecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Timestamp with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.timestamp())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                )
+            })?
+        };
         // Convert to microseconds
         Ok(timestamp.as_microsecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(item_value)?;
+                let v = convert_to_date32(item_value, list_field.format)?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(item_value)?;
+                let v = convert_to_timestamp_millis(item_value, list_field.format)?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(item_value)?;
+                let v = convert_to_timestamp_micros(item_value, list_field.format)?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, map_field.format)?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, map_field.format)?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, map_field.format)?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -592,12 +648,12 @@ pub fn convert_parquet_values_to_arrow(
     };
 
     let mut list_builder = ListBuilder::new(value_builder);
+
     for value in values {
         match value {
             ParquetValue::List(items) => {
-                list_builder.append(true);
                 for item in items {
-                    match list_field.item_type {
+                    match &list_field.item_type {
                         ParquetSchemaType::Int8 => append_list_value_copy!(
                             list_builder,
                             ParquetSchemaType::Int8,
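
Note on the last hunk: the match switches to `&list_field.item_type`, presumably because the schema enum now carries boxed, lifetime-bound fields and cannot simply be copied out of the struct; borrowing in the match avoids the move. Illustrated in isolation:

#[derive(Debug, Clone)]
enum SchemaType {
    Int8,
    List(Box<SchemaType>), // non-Copy payload: matching by value would move it
}

struct ListField {
    item_type: SchemaType,
}

fn type_name(field: &ListField) -> &'static str {
    // Borrow with `&` so `field.item_type` is not moved out of the struct.
    match &field.item_type {
        SchemaType::Int8 => "int8",
        SchemaType::List(_) => "list",
    }
}

fn main() {
    let field = ListField { item_type: SchemaType::List(Box::new(SchemaType::Int8)) };
    assert_eq!(type_name(&field), "list");
}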
data/ext/parquet/src/types/writer_types.rs
CHANGED
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
 use crate::types::{ListField, MapField, ParquetSchemaType};
 
 #[derive(Debug)]
-pub struct SchemaField {
+pub struct SchemaField<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
 }
 
 #[derive(Debug)]
-pub struct ParquetWriteArgs {
+pub struct ParquetWriteArgs<'a> {
     pub read_from: Value,
     pub write_to: Value,
-    pub schema: Vec<SchemaField>,
+    pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
 }
 
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
     }
 }
 
-impl FromStr for ParquetSchemaType {
+impl<'a> FromStr for ParquetSchemaType<'a> {
     type Err = MagnusError;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                 item_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
                 key_type: ParquetSchemaType::String,
                 value_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             _ => Err(MagnusError::new(
                 magnus::exception::runtime_error(),
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
     }
 }
 
-impl TryConvert for ParquetSchemaType {
+impl<'a> TryConvert for ParquetSchemaType<'a> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
 
 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl Send for ParquetSchemaType {}
+unsafe impl<'a> Send for ParquetSchemaType<'a> {}
 
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
     if value.is_nil() {
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
     }
 }
 
-pub struct ColumnCollector {
+pub struct ColumnCollector<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
     pub values: Vec<crate::types::ParquetValue>,
 }
 
-impl ColumnCollector {
-    pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+impl<'a> ColumnCollector<'a> {
+    pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
         Self {
             name,
             type_,
+            format,
             values: Vec::new(),
         }
     }
@@ -185,6 +190,11 @@ impl ColumnCollector {
             NumericConverter,
         };
 
+        if value.is_nil() {
+            self.values.push(ParquetValue::Null);
+            return Ok(());
+        }
+
         let parquet_value = match &self.type_ {
             ParquetSchemaType::Int8 => {
                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -239,15 +249,15 @@ impl ColumnCollector {
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, self.format.as_deref())?;
                 ParquetValue::Date32(v)
            }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(list_field) => {
data/ext/parquet/src/utils.rs
CHANGED
@@ -39,7 +39,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
+    let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
         parsed_args.keywords,
         &[],
         &["result_type", "columns"],
@@ -48,6 +48,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let result_type: ParserResultType = match kwargs
         .optional
         .0
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -75,7 +76,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
-        columns: kwargs.optional.1,
+        columns: kwargs.optional.1.flatten(),
     })
 }
 
@@ -95,7 +96,16 @@ pub fn parse_parquet_columns_args(
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<usize>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
         &["result_type", "columns", "batch_size"],
@@ -104,6 +114,7 @@ pub fn parse_parquet_columns_args(
     let result_type: ParserResultType = match kwargs
         .optional
         .0
+        .flatten()
         .map(|value| parse_string_or_symbol(ruby, value))
     {
         Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -131,7 +142,7 @@ pub fn parse_parquet_columns_args(
     Ok(ParquetColumnsArgs {
         to_read,
         result_type,
-        columns: kwargs.optional.1,
-        batch_size: kwargs.optional.2,
+        columns: kwargs.optional.1.flatten(),
+        batch_size: kwargs.optional.2.flatten(),
     })
 }
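
Note on the kwargs changes in this file: the optional tuple types change from `Option<T>` to `Option<Option<T>>` plus `.flatten()`. With magnus's `get_kwargs` this appears to distinguish an omitted keyword (outer `None`) from an explicit Ruby `nil` (inner `None`), and flattening treats both the same, so a call like `each_row(path, result_type: nil)` behaves like omitting the keyword. The `Option::flatten` semantics in isolation:

fn main() {
    let omitted: Option<Option<u32>> = None;             // keyword not passed
    let passed_nil: Option<Option<u32>> = Some(None);    // keyword passed as nil
    let passed_value: Option<Option<u32>> = Some(Some(3));

    // `.flatten()` folds Option<Option<T>> into Option<T>.
    assert_eq!(omitted.flatten(), None);
    assert_eq!(passed_nil.flatten(), None);
    assert_eq!(passed_value.flatten(), Some(3));
}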
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -28,7 +28,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
+    let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
         parsed_args.keywords,
         &["schema", "write_to"],
         &["batch_size"],
@@ -59,18 +59,52 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         ));
     }
 
-        let (name, type_str) = &entries[0];
+        let (name, type_value) = &entries[0];
         let name = String::try_convert(name.clone())?;
-        let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
 
-        schema.push(SchemaField { name, type_ });
+        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+            let mut type_str = None;
+            let mut format_str = None;
+
+            for (key, value) in type_hash {
+                let key = String::try_convert(key)?;
+                match key.as_str() {
+                    "type" => type_str = Some(value),
+                    "format" => format_str = Some(String::try_convert(value)?),
+                    _ => {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!("Unknown key '{}' in type definition", key),
+                        ))
+                    }
+                }
+            }
+
+            let type_str = type_str.ok_or_else(|| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Missing 'type' in type definition",
+                )
+            })?;
+
+            (ParquetSchemaType::try_convert(type_str)?, format_str)
+        } else {
+            (ParquetSchemaType::try_convert(type_value.clone())?, None)
+        };
+
+        schema.push(SchemaField {
+            name,
+            type_,
+            format,
+        });
     }
 
     Ok(ParquetWriteArgs {
         read_from,
         write_to: kwargs.required.1,
         schema,
-        batch_size: kwargs.optional.0,
+        batch_size: kwargs.optional.0.flatten(),
     })
 }
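
Note on the schema hunk: a column's type may now be either a bare type name or a hash with a required `type` and an optional `format`, e.g. `{"type" => "date32", "format" => "%Y-%m-%d"}` (the `.rbi` below documents this shape). A magnus-free sketch of the same branching, using a hypothetical `TypeSpec` stand-in for the Ruby value:

// Stand-in for a Ruby value that is either a type-name string or a
// {"type" => ..., "format" => ...} hash.
enum TypeSpec<'a> {
    Name(&'a str),
    Hash { type_: Option<&'a str>, format: Option<&'a str> },
}

fn parse_entry(spec: TypeSpec<'_>) -> Result<(String, Option<String>), String> {
    match spec {
        TypeSpec::Name(t) => Ok((t.to_owned(), None)),
        TypeSpec::Hash { type_, format } => {
            // `type` is mandatory inside the hash form, `format` is optional.
            let t = type_.ok_or("Missing 'type' in type definition")?;
            Ok((t.to_owned(), format.map(str::to_owned)))
        }
    }
}

fn main() {
    assert_eq!(parse_entry(TypeSpec::Name("int8")).unwrap(), ("int8".into(), None));
    let (t, f) = parse_entry(TypeSpec::Hash { type_: Some("date32"), format: Some("%Y-%m-%d") }).unwrap();
    assert_eq!((t.as_str(), f.as_deref()), ("date32", Some("%Y-%m-%d")));
}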
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     // Create collectors for each column
     let mut column_collectors: Vec<ColumnCollector> = schema
         .into_iter()
-        .map(|field| ColumnCollector::new(field.name, field.type_))
+        .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
         .collect();
 
     let mut rows_in_batch = 0;
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,4 +1,4 @@
-# typed:
+# typed: true
 
 module Parquet
   # Options:
@@ -7,13 +7,20 @@ module Parquet
   # ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # This is useful for reducing how much data is read and improving performance.
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String])
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
@@ -24,6 +31,14 @@ module Parquet
   # ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer)
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
@@ -32,14 +47,22 @@ module Parquet
       batch_size: T.nilable(Integer),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of values representing each row
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
   sig do
@@ -55,7 +78,16 @@ module Parquet
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of column batches
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
+  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
   sig do
     params(