parquet 0.2.6 → 0.2.8
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +3 -3
- data/ext/parquet/src/types/type_conversion.rs +84 -28
- data/ext/parquet/src/types/writer_types.rs +20 -14
- data/ext/parquet/src/writer/mod.rs +70 -15
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +47 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md
CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each

-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")

+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
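For context, a minimal Ruby sketch of the row-oriented counterpart, using the write_rows keyword arguments declared in parquet.rbi further below; the schema, data, and file name here are illustrative, not taken from the README:

rows = [
  [1, "alice"],
  [2, "bob"]
].each

schema = [
  { "id" => "int64" },
  { "name" => "string" }
]

# compression accepts "none", "uncompressed", "snappy", "gzip", "lz4" or "zstd"
Parquet.write_rows(rows, schema: schema, write_to: "users.parquet", compression: "gzip")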
data/ext/parquet/src/types/core_types.rs
CHANGED
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
 }

 #[derive(Debug, Clone)]
-pub struct ListField {
-    pub item_type: ParquetSchemaType
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }

 #[derive(Debug, Clone)]
-pub struct MapField {
-    pub key_type: ParquetSchemaType,
-    pub value_type: ParquetSchemaType
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }

 #[derive(Debug, Clone)]
-pub enum ParquetSchemaType {
+pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
     Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
     Date32,
     TimestampMillis,
     TimestampMicros,
-    List(Box<ListField>),
-    Map(Box<MapField>),
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
 }
data/ext/parquet/src/types/parquet_value.rs
CHANGED
@@ -215,15 +215,15 @@ impl ParquetValue {
                 Ok(ParquetValue::Boolean(v))
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, None)?;
                 Ok(ParquetValue::Date32(v))
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, None)?;
                 Ok(ParquetValue::TimestampMillis(v, None))
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, None)?;
                 Ok(ParquetValue::TimestampMicros(v, None))
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -30,17 +30,35 @@ where
     }
 }

-pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
-        // Parse string into
-        let date
-
-
-
-
-
+        // Parse string into Date using jiff
+        let date = if let Some(fmt) = format {
+            jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Date with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.date())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as date32: {}", s, e),
+                )
+            })?
+        };

         let timestamp = date.at(0, 0, 0, 0);

@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
     }
 }

-pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s)
+                .or_else(|e1| {
+                    // Try parsing as DateTime and convert to Timestamp with zero offset
+                    jiff::civil::DateTime::strptime(&fmt, &s)
+                        .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                        .map(|dt| dt.timestamp())
+                        .map_err(|e2| {
+                            MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!(
+                                    "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                    s, fmt, e1, e2
+                                ),
+                            )
+                        })
+                })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                )
+            })?
+        };
         // Convert to milliseconds
         Ok(timestamp.as_millisecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
     }
 }

-pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Timestamp with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
+                    dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
+                })
+                .map(|dt| dt.timestamp())
+                .map_err(|e2| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!(
+                            "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                            s, fmt, e1, e2
+                        ),
+                    )
+                })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                )
+            })?
+        };
         // Convert to microseconds
         Ok(timestamp.as_microsecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(item_value)?;
+            let v = convert_to_date32(item_value, list_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(item_value)?;
+            let v = convert_to_timestamp_millis(item_value, list_field.format)?;
             ParquetValue::TimestampMillis(v, None)
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(item_value)?;
+            let v = convert_to_timestamp_micros(item_value, list_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(value)?;
+            let v = convert_to_date32(value, map_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(value)?;
+            let v = convert_to_timestamp_millis(value, map_field.format)?;
             ParquetValue::TimestampMillis(v, None)
        }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(value)?;
+            let v = convert_to_timestamp_micros(value, map_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
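A hedged Ruby sketch of what this change enables at the gem level: date and timestamp columns can be fed strings, and an optional per-column format (forwarded to the strptime calls above) can be supplied via the hash schema form documented in parquet.rbi below. Column names, the format string, and the data are illustrative:

rows = [
  ["15/01/2024", "2024-01-15T10:30:00Z"],
  ["01/02/2024", "2024-02-01T08:00:00Z"]
].each

schema = [
  { "purchased_on" => { "type" => "date32", "format" => "%d/%m/%Y" } },
  { "logged_at" => "timestamp_millis" } # no format: falls back to the plain string parser
]

Parquet.write_rows(rows, schema: schema, write_to: "events.parquet")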
data/ext/parquet/src/types/writer_types.rs
CHANGED
@@ -12,17 +12,19 @@ use tempfile::NamedTempFile;
 use crate::types::{ListField, MapField, ParquetSchemaType};

 #[derive(Debug)]
-pub struct SchemaField {
+pub struct SchemaField<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
 }

 #[derive(Debug)]
-pub struct ParquetWriteArgs {
+pub struct ParquetWriteArgs<'a> {
     pub read_from: Value,
     pub write_to: Value,
-    pub schema: Vec<SchemaField>,
+    pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub compression: Option<String>,
 }

 pub trait SendableWrite: Send + Write {}
@@ -51,7 +53,7 @@ impl Write for IoLikeValue {
     }
 }

-impl FromStr for ParquetSchemaType {
+impl<'a> FromStr for ParquetSchemaType<'a> {
     type Err = MagnusError;

     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +76,12 @@ impl FromStr for ParquetSchemaType {
             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                 item_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
                 key_type: ParquetSchemaType::String,
                 value_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             _ => Err(MagnusError::new(
                 magnus::exception::runtime_error(),
@@ -87,7 +91,7 @@ impl FromStr for ParquetSchemaType {
     }
 }

-impl TryConvert for ParquetSchemaType {
+impl<'a> TryConvert for ParquetSchemaType<'a> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +102,7 @@ impl TryConvert for ParquetSchemaType {

 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl Send for ParquetSchemaType {}
+unsafe impl<'a> Send for ParquetSchemaType<'a> {}

 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
     if value.is_nil() {
@@ -162,17 +166,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
     }
 }

-pub struct ColumnCollector {
+pub struct ColumnCollector<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
     pub values: Vec<crate::types::ParquetValue>,
 }

-impl ColumnCollector {
-    pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+impl<'a> ColumnCollector<'a> {
+    pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
         Self {
             name,
             type_,
+            format,
             values: Vec::new(),
         }
     }
@@ -244,15 +250,15 @@ impl ColumnCollector {
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, self.format.as_deref())?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(list_field) => {
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;

 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;

-    let kwargs =
-
-
-
-
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;

     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -59,11 +64,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             ));
         }

-        let (name, type_str) = &entries[0];
+        let (name, type_value) = &entries[0];
         let name = String::try_convert(name.clone())?;
-        let type_ = ParquetSchemaType::try_convert(type_str.clone())?;

-
+        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+            let mut type_str = None;
+            let mut format_str = None;
+
+            for (key, value) in type_hash {
+                let key = String::try_convert(key)?;
+                match key.as_str() {
+                    "type" => type_str = Some(value),
+                    "format" => format_str = Some(String::try_convert(value)?),
+                    _ => {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!("Unknown key '{}' in type definition", key),
+                        ))
+                    }
+                }
+            }
+
+            let type_str = type_str.ok_or_else(|| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Missing 'type' in type definition",
+                )
+            })?;
+
+            (ParquetSchemaType::try_convert(type_str)?, format_str)
+        } else {
+            (ParquetSchemaType::try_convert(type_value.clone())?, None)
+        };
+
+        schema.push(SchemaField {
+            name,
+            type_,
+            format,
+        });
     }

     Ok(ParquetWriteArgs {
@@ -71,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }

@@ -83,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;

     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -124,13 +165,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
             .into_iter()
-            .map(|field| ColumnCollector::new(field.name, field.type_))
+            .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
             .collect();

         let mut rows_in_batch = 0;
@@ -204,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;

     // Convert schema to Arrow schema
@@ -244,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -326,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -348,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
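A short Ruby sketch of how parse_parquet_write_args above treats schema entries: a plain type string and a {"type" => ..., "format" => ...} hash are both accepted, while any other key in the hash is rejected. The error message is taken from the diff; raising it as a Ruby TypeError is an assumption based on magnus::exception::type_error, and the data is illustrative:

schema = [
  { "id" => "int32" },                                        # plain string form
  { "day" => { "type" => "date32", "format" => "%Y-%m-%d" } } # hash form with optional format
]

bad_schema = [
  { "day" => { "type" => "date32", "fmt" => "%Y-%m-%d" } }    # "fmt" is not a recognized key
]

begin
  Parquet.write_rows([["2024-01-01"]].each, schema: bad_schema, write_to: "bad.parquet")
rescue TypeError => e
  puts e.message # expected to mention: Unknown key 'fmt' in type definition
end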
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,4 +1,4 @@
-# typed:
+# typed: true

 module Parquet
   # Options:
@@ -7,13 +7,20 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   #   This is useful for reducing how much data is read and improving performance.
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String])
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
@@ -24,6 +31,14 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer)
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
@@ -32,38 +47,61 @@ module Parquet
       batch_size: T.nilable(Integer),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end

   # Options:
   # - `read_from`: An Enumerator yielding arrays of values representing each row
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end

   # Options:
   # - `read_from`: An Enumerator yielding arrays of column batches
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
+  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
   end
 end
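A brief usage sketch matching the signatures above: each_row and each_column yield to a block, or return an Enumerator when no block is given (the newly added sig overloads). The file name, column names, and batch size are illustrative:

# Block form
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
  puts row["id"]
end

# Enumerator form (no block)
rows = Parquet.each_row("data.parquet", result_type: :array)
first_two = rows.take(2)

# Column batches, 512 rows at a time
Parquet.each_column("data.parquet", batch_size: 512) do |batch|
  # batch maps column name => Array of values (or an Array of Arrays with result_type: :array)
end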