parquet 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +3 -3
- data/ext/parquet/src/types/type_conversion.rs +84 -28
- data/ext/parquet/src/types/writer_types.rs +19 -14
- data/ext/parquet/src/writer/mod.rs +38 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +37 -5
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
|
|
4
|
+
data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
|
|
7
|
+
data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
|
|
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
|
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
#[derive(Debug, Clone)]
|
|
43
|
-
pub struct ListField {
|
|
44
|
-
pub item_type: ParquetSchemaType,
|
|
43
|
+
pub struct ListField<'a> {
|
|
44
|
+
pub item_type: ParquetSchemaType<'a>,
|
|
45
|
+
pub format: Option<&'a str>,
|
|
45
46
|
}
|
|
46
47
|
|
|
47
48
|
#[derive(Debug, Clone)]
|
|
48
|
-
pub struct MapField {
|
|
49
|
-
pub key_type: ParquetSchemaType,
|
|
50
|
-
pub value_type: ParquetSchemaType,
|
|
49
|
+
pub struct MapField<'a> {
|
|
50
|
+
pub key_type: ParquetSchemaType<'a>,
|
|
51
|
+
pub value_type: ParquetSchemaType<'a>,
|
|
52
|
+
pub format: Option<&'a str>,
|
|
51
53
|
}
|
|
52
54
|
|
|
53
55
|
#[derive(Debug, Clone)]
|
|
54
|
-
pub enum ParquetSchemaType {
|
|
56
|
+
pub enum ParquetSchemaType<'a> {
|
|
55
57
|
Int8,
|
|
56
58
|
Int16,
|
|
57
59
|
Int32,
|
|
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
|
|
|
68
70
|
Date32,
|
|
69
71
|
TimestampMillis,
|
|
70
72
|
TimestampMicros,
|
|
71
|
-
List(Box<ListField>),
|
|
72
|
-
Map(Box<MapField>),
|
|
73
|
+
List(Box<ListField<'a>>),
|
|
74
|
+
Map(Box<MapField<'a>>),
|
|
73
75
|
}
|
|
@@ -215,15 +215,15 @@ impl ParquetValue {
|
|
|
215
215
|
Ok(ParquetValue::Boolean(v))
|
|
216
216
|
}
|
|
217
217
|
ParquetSchemaType::Date32 => {
|
|
218
|
-
let v = convert_to_date32(value)?;
|
|
218
|
+
let v = convert_to_date32(value, None)?;
|
|
219
219
|
Ok(ParquetValue::Date32(v))
|
|
220
220
|
}
|
|
221
221
|
ParquetSchemaType::TimestampMillis => {
|
|
222
|
-
let v = convert_to_timestamp_millis(value)?;
|
|
222
|
+
let v = convert_to_timestamp_millis(value, None)?;
|
|
223
223
|
Ok(ParquetValue::TimestampMillis(v, None))
|
|
224
224
|
}
|
|
225
225
|
ParquetSchemaType::TimestampMicros => {
|
|
226
|
-
let v = convert_to_timestamp_micros(value)?;
|
|
226
|
+
let v = convert_to_timestamp_micros(value, None)?;
|
|
227
227
|
Ok(ParquetValue::TimestampMicros(v, None))
|
|
228
228
|
}
|
|
229
229
|
ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
|
|
@@ -30,17 +30,35 @@ where
|
|
|
30
30
|
}
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
|
|
33
|
+
pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
|
|
34
34
|
let ruby = unsafe { Ruby::get_unchecked() };
|
|
35
35
|
if value.is_kind_of(ruby.class_string()) {
|
|
36
36
|
let s = String::try_convert(value)?;
|
|
37
|
-
// Parse string into
|
|
38
|
-
let date
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
37
|
+
// Parse string into Date using jiff
|
|
38
|
+
let date = if let Some(fmt) = format {
|
|
39
|
+
jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
|
|
40
|
+
// Try parsing as DateTime and convert to Date with zero offset
|
|
41
|
+
jiff::civil::DateTime::strptime(&fmt, &s)
|
|
42
|
+
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
|
43
|
+
.map(|dt| dt.date())
|
|
44
|
+
.map_err(|e2| {
|
|
45
|
+
MagnusError::new(
|
|
46
|
+
magnus::exception::type_error(),
|
|
47
|
+
format!(
|
|
48
|
+
"Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
|
|
49
|
+
s, fmt, e1, e2
|
|
50
|
+
),
|
|
51
|
+
)
|
|
52
|
+
})
|
|
53
|
+
})?
|
|
54
|
+
} else {
|
|
55
|
+
s.parse().map_err(|e| {
|
|
56
|
+
MagnusError::new(
|
|
57
|
+
magnus::exception::type_error(),
|
|
58
|
+
format!("Failed to parse '{}' as date32: {}", s, e),
|
|
59
|
+
)
|
|
60
|
+
})?
|
|
61
|
+
};
|
|
44
62
|
|
|
45
63
|
let timestamp = date.at(0, 0, 0, 0);
|
|
46
64
|
|
|
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
|
|
|
63
81
|
}
|
|
64
82
|
}
|
|
65
83
|
|
|
66
|
-
pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
|
|
84
|
+
pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
|
|
67
85
|
let ruby = unsafe { Ruby::get_unchecked() };
|
|
68
86
|
if value.is_kind_of(ruby.class_string()) {
|
|
69
87
|
let s = String::try_convert(value)?;
|
|
70
88
|
// Parse string into Timestamp using jiff
|
|
71
|
-
let timestamp
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
89
|
+
let timestamp = if let Some(fmt) = format {
|
|
90
|
+
jiff::Timestamp::strptime(&fmt, &s)
|
|
91
|
+
.or_else(|e1| {
|
|
92
|
+
// Try parsing as DateTime and convert to Timestamp with zero offset
|
|
93
|
+
jiff::civil::DateTime::strptime(&fmt, &s)
|
|
94
|
+
.and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
|
|
95
|
+
.map(|dt| dt.timestamp())
|
|
96
|
+
.map_err(|e2| {
|
|
97
|
+
MagnusError::new(
|
|
98
|
+
magnus::exception::type_error(),
|
|
99
|
+
format!(
|
|
100
|
+
"Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
|
|
101
|
+
s, fmt, e1, e2
|
|
102
|
+
),
|
|
103
|
+
)
|
|
104
|
+
})
|
|
105
|
+
})?
|
|
106
|
+
} else {
|
|
107
|
+
s.parse().map_err(|e| {
|
|
108
|
+
MagnusError::new(
|
|
109
|
+
magnus::exception::type_error(),
|
|
110
|
+
format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
|
|
111
|
+
)
|
|
112
|
+
})?
|
|
113
|
+
};
|
|
77
114
|
// Convert to milliseconds
|
|
78
115
|
Ok(timestamp.as_millisecond())
|
|
79
116
|
} else if value.is_kind_of(ruby.class_time()) {
|
|
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
|
|
|
91
128
|
}
|
|
92
129
|
}
|
|
93
130
|
|
|
94
|
-
pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
|
|
131
|
+
pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
|
|
95
132
|
let ruby = unsafe { Ruby::get_unchecked() };
|
|
96
133
|
if value.is_kind_of(ruby.class_string()) {
|
|
97
134
|
let s = String::try_convert(value)?;
|
|
98
135
|
// Parse string into Timestamp using jiff
|
|
99
|
-
let timestamp
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
136
|
+
let timestamp = if let Some(fmt) = format {
|
|
137
|
+
jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
|
|
138
|
+
// Try parsing as DateTime and convert to Timestamp with zero offset
|
|
139
|
+
jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
|
|
140
|
+
dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
|
|
141
|
+
})
|
|
142
|
+
.map(|dt| dt.timestamp())
|
|
143
|
+
.map_err(|e2| {
|
|
144
|
+
MagnusError::new(
|
|
145
|
+
magnus::exception::type_error(),
|
|
146
|
+
format!(
|
|
147
|
+
"Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
|
|
148
|
+
s, fmt, e1, e2
|
|
149
|
+
),
|
|
150
|
+
)
|
|
151
|
+
})
|
|
152
|
+
})?
|
|
153
|
+
} else {
|
|
154
|
+
s.parse().map_err(|e| {
|
|
155
|
+
MagnusError::new(
|
|
156
|
+
magnus::exception::type_error(),
|
|
157
|
+
format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
|
|
158
|
+
)
|
|
159
|
+
})?
|
|
160
|
+
};
|
|
105
161
|
// Convert to microseconds
|
|
106
162
|
Ok(timestamp.as_microsecond())
|
|
107
163
|
} else if value.is_kind_of(ruby.class_time()) {
|
|
@@ -204,15 +260,15 @@ pub fn convert_to_list(
|
|
|
204
260
|
ParquetValue::Boolean(v)
|
|
205
261
|
}
|
|
206
262
|
ParquetSchemaType::Date32 => {
|
|
207
|
-
let v = convert_to_date32(item_value)?;
|
|
263
|
+
let v = convert_to_date32(item_value, list_field.format)?;
|
|
208
264
|
ParquetValue::Date32(v)
|
|
209
265
|
}
|
|
210
266
|
ParquetSchemaType::TimestampMillis => {
|
|
211
|
-
let v = convert_to_timestamp_millis(item_value)?;
|
|
267
|
+
let v = convert_to_timestamp_millis(item_value, list_field.format)?;
|
|
212
268
|
ParquetValue::TimestampMillis(v, None)
|
|
213
269
|
}
|
|
214
270
|
ParquetSchemaType::TimestampMicros => {
|
|
215
|
-
let v = convert_to_timestamp_micros(item_value)?;
|
|
271
|
+
let v = convert_to_timestamp_micros(item_value, list_field.format)?;
|
|
216
272
|
ParquetValue::TimestampMicros(v, None)
|
|
217
273
|
}
|
|
218
274
|
ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
|
|
@@ -310,15 +366,15 @@ pub fn convert_to_map(
|
|
|
310
366
|
ParquetValue::Boolean(v)
|
|
311
367
|
}
|
|
312
368
|
ParquetSchemaType::Date32 => {
|
|
313
|
-
let v = convert_to_date32(value)?;
|
|
369
|
+
let v = convert_to_date32(value, map_field.format)?;
|
|
314
370
|
ParquetValue::Date32(v)
|
|
315
371
|
}
|
|
316
372
|
ParquetSchemaType::TimestampMillis => {
|
|
317
|
-
let v = convert_to_timestamp_millis(value)?;
|
|
373
|
+
let v = convert_to_timestamp_millis(value, map_field.format)?;
|
|
318
374
|
ParquetValue::TimestampMillis(v, None)
|
|
319
375
|
}
|
|
320
376
|
ParquetSchemaType::TimestampMicros => {
|
|
321
|
-
let v = convert_to_timestamp_micros(value)?;
|
|
377
|
+
let v = convert_to_timestamp_micros(value, map_field.format)?;
|
|
322
378
|
ParquetValue::TimestampMicros(v, None)
|
|
323
379
|
}
|
|
324
380
|
ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
|
|
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
|
|
|
12
12
|
use crate::types::{ListField, MapField, ParquetSchemaType};
|
|
13
13
|
|
|
14
14
|
#[derive(Debug)]
|
|
15
|
-
pub struct SchemaField {
|
|
15
|
+
pub struct SchemaField<'a> {
|
|
16
16
|
pub name: String,
|
|
17
|
-
pub type_: ParquetSchemaType,
|
|
17
|
+
pub type_: ParquetSchemaType<'a>,
|
|
18
|
+
pub format: Option<String>,
|
|
18
19
|
}
|
|
19
20
|
|
|
20
21
|
#[derive(Debug)]
|
|
21
|
-
pub struct ParquetWriteArgs {
|
|
22
|
+
pub struct ParquetWriteArgs<'a> {
|
|
22
23
|
pub read_from: Value,
|
|
23
24
|
pub write_to: Value,
|
|
24
|
-
pub schema: Vec<SchemaField>,
|
|
25
|
+
pub schema: Vec<SchemaField<'a>>,
|
|
25
26
|
pub batch_size: Option<usize>,
|
|
26
27
|
}
|
|
27
28
|
|
|
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
|
|
|
51
52
|
}
|
|
52
53
|
}
|
|
53
54
|
|
|
54
|
-
impl FromStr for ParquetSchemaType {
|
|
55
|
+
impl<'a> FromStr for ParquetSchemaType<'a> {
|
|
55
56
|
type Err = MagnusError;
|
|
56
57
|
|
|
57
58
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
|
|
|
74
75
|
"timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
|
|
75
76
|
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
|
76
77
|
item_type: ParquetSchemaType::Int8,
|
|
78
|
+
format: None,
|
|
77
79
|
}))),
|
|
78
80
|
"map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
|
|
79
81
|
key_type: ParquetSchemaType::String,
|
|
80
82
|
value_type: ParquetSchemaType::Int8,
|
|
83
|
+
format: None,
|
|
81
84
|
}))),
|
|
82
85
|
_ => Err(MagnusError::new(
|
|
83
86
|
magnus::exception::runtime_error(),
|
|
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
|
|
|
87
90
|
}
|
|
88
91
|
}
|
|
89
92
|
|
|
90
|
-
impl TryConvert for ParquetSchemaType {
|
|
93
|
+
impl<'a> TryConvert for ParquetSchemaType<'a> {
|
|
91
94
|
fn try_convert(value: Value) -> Result<Self, MagnusError> {
|
|
92
95
|
let ruby = unsafe { Ruby::get_unchecked() };
|
|
93
96
|
let schema_type = parse_string_or_symbol(&ruby, value)?;
|
|
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
|
|
|
98
101
|
|
|
99
102
|
// We know this type is safe to move between threads because it's just an enum
|
|
100
103
|
// with simple primitive types and strings
|
|
101
|
-
unsafe impl Send for ParquetSchemaType {}
|
|
104
|
+
unsafe impl<'a> Send for ParquetSchemaType<'a> {}
|
|
102
105
|
|
|
103
106
|
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
|
|
104
107
|
if value.is_nil() {
|
|
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
|
|
|
162
165
|
}
|
|
163
166
|
}
|
|
164
167
|
|
|
165
|
-
pub struct ColumnCollector {
|
|
168
|
+
pub struct ColumnCollector<'a> {
|
|
166
169
|
pub name: String,
|
|
167
|
-
pub type_: ParquetSchemaType,
|
|
170
|
+
pub type_: ParquetSchemaType<'a>,
|
|
171
|
+
pub format: Option<String>,
|
|
168
172
|
pub values: Vec<crate::types::ParquetValue>,
|
|
169
173
|
}
|
|
170
174
|
|
|
171
|
-
impl ColumnCollector {
|
|
172
|
-
pub fn new(name: String, type_: ParquetSchemaType) -> Self {
|
|
175
|
+
impl<'a> ColumnCollector<'a> {
|
|
176
|
+
pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
|
|
173
177
|
Self {
|
|
174
178
|
name,
|
|
175
179
|
type_,
|
|
180
|
+
format,
|
|
176
181
|
values: Vec::new(),
|
|
177
182
|
}
|
|
178
183
|
}
|
|
@@ -244,15 +249,15 @@ impl ColumnCollector {
|
|
|
244
249
|
ParquetValue::Boolean(v)
|
|
245
250
|
}
|
|
246
251
|
ParquetSchemaType::Date32 => {
|
|
247
|
-
let v = convert_to_date32(value)?;
|
|
252
|
+
let v = convert_to_date32(value, self.format.as_deref())?;
|
|
248
253
|
ParquetValue::Date32(v)
|
|
249
254
|
}
|
|
250
255
|
ParquetSchemaType::TimestampMillis => {
|
|
251
|
-
let v = convert_to_timestamp_millis(value)?;
|
|
256
|
+
let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
|
|
252
257
|
ParquetValue::TimestampMillis(v, None)
|
|
253
258
|
}
|
|
254
259
|
ParquetSchemaType::TimestampMicros => {
|
|
255
|
-
let v = convert_to_timestamp_micros(value)?;
|
|
260
|
+
let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
|
|
256
261
|
ParquetValue::TimestampMicros(v, None)
|
|
257
262
|
}
|
|
258
263
|
ParquetSchemaType::List(list_field) => {
|
|
@@ -59,11 +59,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
|
|
|
59
59
|
));
|
|
60
60
|
}
|
|
61
61
|
|
|
62
|
-
let (name, type_str) = &entries[0];
|
|
62
|
+
let (name, type_value) = &entries[0];
|
|
63
63
|
let name = String::try_convert(name.clone())?;
|
|
64
|
-
let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
|
|
65
64
|
|
|
66
|
-
|
|
65
|
+
let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
|
|
66
|
+
let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
|
|
67
|
+
let mut type_str = None;
|
|
68
|
+
let mut format_str = None;
|
|
69
|
+
|
|
70
|
+
for (key, value) in type_hash {
|
|
71
|
+
let key = String::try_convert(key)?;
|
|
72
|
+
match key.as_str() {
|
|
73
|
+
"type" => type_str = Some(value),
|
|
74
|
+
"format" => format_str = Some(String::try_convert(value)?),
|
|
75
|
+
_ => {
|
|
76
|
+
return Err(MagnusError::new(
|
|
77
|
+
magnus::exception::type_error(),
|
|
78
|
+
format!("Unknown key '{}' in type definition", key),
|
|
79
|
+
))
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
let type_str = type_str.ok_or_else(|| {
|
|
85
|
+
MagnusError::new(
|
|
86
|
+
magnus::exception::type_error(),
|
|
87
|
+
"Missing 'type' in type definition",
|
|
88
|
+
)
|
|
89
|
+
})?;
|
|
90
|
+
|
|
91
|
+
(ParquetSchemaType::try_convert(type_str)?, format_str)
|
|
92
|
+
} else {
|
|
93
|
+
(ParquetSchemaType::try_convert(type_value.clone())?, None)
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
schema.push(SchemaField {
|
|
97
|
+
name,
|
|
98
|
+
type_,
|
|
99
|
+
format,
|
|
100
|
+
});
|
|
67
101
|
}
|
|
68
102
|
|
|
69
103
|
Ok(ParquetWriteArgs {
|
|
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
|
|
|
130
164
|
// Create collectors for each column
|
|
131
165
|
let mut column_collectors: Vec<ColumnCollector> = schema
|
|
132
166
|
.into_iter()
|
|
133
|
-
.map(|field| ColumnCollector::new(field.name, field.type_))
|
|
167
|
+
.map(|field| ColumnCollector::new(field.name, field.type_, field.format))
|
|
134
168
|
.collect();
|
|
135
169
|
|
|
136
170
|
let mut rows_in_batch = 0;
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# typed:
|
|
1
|
+
# typed: true
|
|
2
2
|
|
|
3
3
|
module Parquet
|
|
4
4
|
# Options:
|
|
@@ -7,13 +7,20 @@ module Parquet
|
|
|
7
7
|
# ("hash" or "array" or :hash or :array)
|
|
8
8
|
# - `columns`: When present, only the specified columns will be included in the output.
|
|
9
9
|
# This is useful for reducing how much data is read and improving performance.
|
|
10
|
+
sig do
|
|
11
|
+
params(
|
|
12
|
+
input: T.any(String, File, StringIO, IO),
|
|
13
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
14
|
+
columns: T.nilable(T::Array[String])
|
|
15
|
+
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
16
|
+
end
|
|
10
17
|
sig do
|
|
11
18
|
params(
|
|
12
19
|
input: T.any(String, File, StringIO, IO),
|
|
13
20
|
result_type: T.nilable(T.any(String, Symbol)),
|
|
14
21
|
columns: T.nilable(T::Array[String]),
|
|
15
22
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
|
|
16
|
-
).returns(
|
|
23
|
+
).returns(NilClass)
|
|
17
24
|
end
|
|
18
25
|
def self.each_row(input, result_type: nil, columns: nil, &blk)
|
|
19
26
|
end
|
|
@@ -24,6 +31,14 @@ module Parquet
|
|
|
24
31
|
# ("hash" or "array" or :hash or :array)
|
|
25
32
|
# - `columns`: When present, only the specified columns will be included in the output.
|
|
26
33
|
# - `batch_size`: When present, specifies the number of rows per batch
|
|
34
|
+
sig do
|
|
35
|
+
params(
|
|
36
|
+
input: T.any(String, File, StringIO, IO),
|
|
37
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
38
|
+
columns: T.nilable(T::Array[String]),
|
|
39
|
+
batch_size: T.nilable(Integer)
|
|
40
|
+
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
41
|
+
end
|
|
27
42
|
sig do
|
|
28
43
|
params(
|
|
29
44
|
input: T.any(String, File, StringIO, IO),
|
|
@@ -32,14 +47,22 @@ module Parquet
|
|
|
32
47
|
batch_size: T.nilable(Integer),
|
|
33
48
|
blk:
|
|
34
49
|
T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
|
|
35
|
-
).returns(
|
|
50
|
+
).returns(NilClass)
|
|
36
51
|
end
|
|
37
52
|
def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
|
|
38
53
|
end
|
|
39
54
|
|
|
40
55
|
# Options:
|
|
41
56
|
# - `read_from`: An Enumerator yielding arrays of values representing each row
|
|
42
|
-
# - `schema`: Array of hashes specifying column names and types
|
|
57
|
+
# - `schema`: Array of hashes specifying column names and types. Supported types:
|
|
58
|
+
# - `int8`, `int16`, `int32`, `int64`
|
|
59
|
+
# - `uint8`, `uint16`, `uint32`, `uint64`
|
|
60
|
+
# - `float`, `double`
|
|
61
|
+
# - `string`
|
|
62
|
+
# - `binary`
|
|
63
|
+
# - `boolean`
|
|
64
|
+
# - `date32`
|
|
65
|
+
# - `timestamp_millis`, `timestamp_micros`
|
|
43
66
|
# - `write_to`: String path or IO object to write the parquet file to
|
|
44
67
|
# - `batch_size`: Optional batch size for writing (defaults to 1000)
|
|
45
68
|
sig do
|
|
@@ -55,7 +78,16 @@ module Parquet
|
|
|
55
78
|
|
|
56
79
|
# Options:
|
|
57
80
|
# - `read_from`: An Enumerator yielding arrays of column batches
|
|
58
|
-
# - `schema`: Array of hashes specifying column names and types
|
|
81
|
+
# - `schema`: Array of hashes specifying column names and types. Supported types:
|
|
82
|
+
# - `int8`, `int16`, `int32`, `int64`
|
|
83
|
+
# - `uint8`, `uint16`, `uint32`, `uint64`
|
|
84
|
+
# - `float`, `double`
|
|
85
|
+
# - `string`
|
|
86
|
+
# - `binary`
|
|
87
|
+
# - `boolean`
|
|
88
|
+
# - `date32`
|
|
89
|
+
# - `timestamp_millis`, `timestamp_micros`
|
|
90
|
+
# - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
|
|
59
91
|
# - `write_to`: String path or IO object to write the parquet file to
|
|
60
92
|
sig do
|
|
61
93
|
params(
|