parquet 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +3 -3
- data/ext/parquet/src/types/type_conversion.rs +84 -28
- data/ext/parquet/src/types/writer_types.rs +20 -14
- data/ext/parquet/src/writer/mod.rs +70 -15
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +47 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md
CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
data/ext/parquet/src/types/core_types.rs
CHANGED

@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
 }
 
 #[derive(Debug, Clone)]
-pub struct ListField {
-    pub item_type: ParquetSchemaType
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub struct MapField {
-    pub key_type: ParquetSchemaType,
-    pub value_type: ParquetSchemaType
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }
 
 #[derive(Debug, Clone)]
-pub enum ParquetSchemaType {
+pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
     Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
     Date32,
     TimestampMillis,
     TimestampMicros,
-    List(Box<ListField>),
-    Map(Box<MapField>),
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
 }
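The new lifetime parameter and `format` fields above are what surface, on the Ruby side, as the new hash form of a schema entry. A minimal sketch of the two accepted shapes, based on the schema documentation added to parquet.rbi later in this diff (column names are illustrative):

# Plain string type, as before:
schema = [{ "id" => "int64" }]

# Hash type with an optional strptime-style format string:
schema = [{ "created_on" => { "type" => "date32", "format" => "%Y-%m-%d" } }]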
data/ext/parquet/src/types/parquet_value.rs
CHANGED

@@ -215,15 +215,15 @@ impl ParquetValue {
             Ok(ParquetValue::Boolean(v))
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(value)?;
+            let v = convert_to_date32(value, None)?;
             Ok(ParquetValue::Date32(v))
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(value)?;
+            let v = convert_to_timestamp_millis(value, None)?;
            Ok(ParquetValue::TimestampMillis(v, None))
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(value)?;
+            let v = convert_to_timestamp_micros(value, None)?;
             Ok(ParquetValue::TimestampMicros(v, None))
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
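These call sites pass `None` for the new format argument, so values converted through `ParquetValue` keep the old behavior: string input falls through to jiff's default `s.parse()`, which expects ISO 8601-style text. A sketch of what continues to work unchanged (data and file name are illustrative):

rows = [["2024-01-15"], ["2024-02-01"]].each
Parquet.write_rows(rows, schema: [{ "day" => "date32" }], write_to: "days.parquet")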
data/ext/parquet/src/types/type_conversion.rs
CHANGED

@@ -30,17 +30,35 @@ where
     }
 }
 
-pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
-        // Parse string into …
-        let date = …
-        …
+        // Parse string into Date using jiff
+        let date = if let Some(fmt) = format {
+            jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Date with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.date())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as date32: {}", s, e),
+                )
+            })?
+        };
 
         let timestamp = date.at(0, 0, 0, 0);
 
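With a format string supplied, strptime parsing is tried first as a plain date; failing that, the same input is re-tried as a full DateTime pinned to a zero offset before the errors from both attempts are reported together. A usage sketch against this code path (data and file name are illustrative):

rows = [["15/01/2024"], ["01/02/2024"]].each
Parquet.write_rows(
  rows,
  schema: [{ "day" => { "type" => "date32", "format" => "%d/%m/%Y" } }],
  write_to: "days.parquet"
)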
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp = …
-        …
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s)
+                .or_else(|e1| {
+                    // Try parsing as DateTime and convert to Timestamp with zero offset
+                    jiff::civil::DateTime::strptime(&fmt, &s)
+                        .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                        .map(|dt| dt.timestamp())
+                        .map_err(|e2| {
+                            MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!(
+                                    "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                    s, fmt, e1, e2
+                                ),
+                            )
+                        })
+                })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                )
+            })?
+        };
         // Convert to milliseconds
         Ok(timestamp.as_millisecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
     }
 }
 
-pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp = …
-        …
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Timestamp with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
+                    dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
+                })
+                .map(|dt| dt.timestamp())
+                .map_err(|e2| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!(
+                            "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                            s, fmt, e1, e2
+                        ),
+                    )
+                })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                )
+            })?
+        };
         // Convert to microseconds
         Ok(timestamp.as_microsecond())
     } else if value.is_kind_of(ruby.class_time()) {
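Both timestamp converters follow the same pattern: jiff::Timestamp::strptime first, then a civil::DateTime fallback pinned to a zero UTC offset. So a format string without a zone component can still be written, under the assumption of UTC. An illustrative sketch (names and data are hypothetical):

rows = [["2024-01-15 09:30:00"]].each
Parquet.write_rows(
  rows,
  schema: [{ "seen_at" => { "type" => "timestamp_millis", "format" => "%Y-%m-%d %H:%M:%S" } }],
  write_to: "events.parquet"
)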
@@ -204,15 +260,15 @@ pub fn convert_to_list(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(item_value)?;
+            let v = convert_to_date32(item_value, list_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(item_value)?;
+            let v = convert_to_timestamp_millis(item_value, list_field.format)?;
             ParquetValue::TimestampMillis(v, None)
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(item_value)?;
+            let v = convert_to_timestamp_micros(item_value, list_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(value)?;
+            let v = convert_to_date32(value, map_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(value)?;
+            let v = convert_to_timestamp_millis(value, map_field.format)?;
             ParquetValue::TimestampMillis(v, None)
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(value)?;
+            let v = convert_to_timestamp_micros(value, map_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
data/ext/parquet/src/types/writer_types.rs
CHANGED

@@ -12,17 +12,19 @@ use tempfile::NamedTempFile;
 use crate::types::{ListField, MapField, ParquetSchemaType};
 
 #[derive(Debug)]
-pub struct SchemaField {
+pub struct SchemaField<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
 }
 
 #[derive(Debug)]
-pub struct ParquetWriteArgs {
+pub struct ParquetWriteArgs<'a> {
     pub read_from: Value,
     pub write_to: Value,
-    pub schema: Vec<SchemaField>,
+    pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub compression: Option<String>,
 }
 
 pub trait SendableWrite: Send + Write {}
@@ -51,7 +53,7 @@ impl Write for IoLikeValue {
     }
 }
 
-impl FromStr for ParquetSchemaType {
+impl<'a> FromStr for ParquetSchemaType<'a> {
     type Err = MagnusError;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +76,12 @@ impl FromStr for ParquetSchemaType {
         "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
         "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
             item_type: ParquetSchemaType::Int8,
+            format: None,
         }))),
         "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
             key_type: ParquetSchemaType::String,
             value_type: ParquetSchemaType::Int8,
+            format: None,
         }))),
         _ => Err(MagnusError::new(
             magnus::exception::runtime_error(),
@@ -87,7 +91,7 @@ impl FromStr for ParquetSchemaType {
     }
 }
 
-impl TryConvert for ParquetSchemaType {
+impl<'a> TryConvert for ParquetSchemaType<'a> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +102,7 @@ impl TryConvert for ParquetSchemaType {
 
 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl Send for ParquetSchemaType {}
+unsafe impl<'a> Send for ParquetSchemaType<'a> {}
 
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
     if value.is_nil() {
@@ -162,17 +166,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
     }
 }
 
-pub struct ColumnCollector {
+pub struct ColumnCollector<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
     pub values: Vec<crate::types::ParquetValue>,
 }
 
-impl ColumnCollector {
-    pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+impl<'a> ColumnCollector<'a> {
+    pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
         Self {
             name,
             type_,
+            format,
             values: Vec::new(),
         }
     }
@@ -244,15 +250,15 @@ impl ColumnCollector {
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(value)?;
+            let v = convert_to_date32(value, self.format.as_deref())?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(value)?;
+            let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
             ParquetValue::TimestampMillis(v, None)
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(value)?;
+            let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(list_field) => {
data/ext/parquet/src/writer/mod.rs
CHANGED

@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::…
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = …
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -59,11 +64,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         ));
     }
 
-        let (name, type_str) = &entries[0];
+        let (name, type_value) = &entries[0];
         let name = String::try_convert(name.clone())?;
-        let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
 
-        …
+        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+            let mut type_str = None;
+            let mut format_str = None;
+
+            for (key, value) in type_hash {
+                let key = String::try_convert(key)?;
+                match key.as_str() {
+                    "type" => type_str = Some(value),
+                    "format" => format_str = Some(String::try_convert(value)?),
+                    _ => {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!("Unknown key '{}' in type definition", key),
+                        ))
+                    }
+                }
+            }
+
+            let type_str = type_str.ok_or_else(|| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Missing 'type' in type definition",
+                )
+            })?;
+
+            (ParquetSchemaType::try_convert(type_str)?, format_str)
+        } else {
+            (ParquetSchemaType::try_convert(type_value.clone())?, None)
+        };
+
+        schema.push(SchemaField {
+            name,
+            type_,
+            format,
+        });
     }
 
     Ok(ParquetWriteArgs {
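Only the keys "type" and "format" are accepted in the hash form; anything else raises a TypeError, as does omitting "type". A sketch of inputs against this parsing logic (rows, column names, and paths are illustrative):

# Fine; "format" is optional:
Parquet.write_rows(rows, schema: [{ "ts" => { "type" => "timestamp_millis" } }], write_to: "ok.parquet")

# Raises TypeError ("Missing 'type' in type definition"):
Parquet.write_rows(rows, schema: [{ "ts" => { "format" => "%H:%M" } }], write_to: "bad.parquet")

# Raises TypeError ("Unknown key 'fmt' in type definition"):
Parquet.write_rows(rows, schema: [{ "ts" => { "type" => "int64", "fmt" => "x" } }], write_to: "bad.parquet")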
@@ -71,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }
 
@@ -83,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -124,13 +165,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
             .into_iter()
-            .map(|field| ColumnCollector::new(field.name, field.type_))
+            .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
             .collect();
 
         let mut rows_in_batch = 0;
@@ -204,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     // Convert schema to Arrow schema
@@ -244,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -326,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema, …
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -348,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema, …
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
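One nuance worth noting: the README and parquet.rbi describe "zstd" as the default, but the match above appears to map a missing or unrecognized compression value to Compression::UNCOMPRESSED, so the effective default in this code path may be no compression. Passing a codec explicitly sidesteps the ambiguity (an illustrative sketch; schema and paths assumed defined):

Parquet.write_rows(rows, schema: schema, write_to: "out.parquet", compression: "zstd")
Parquet.write_columns(columns, schema: schema, write_to: "out.parquet", compression: "gzip")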
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,4 +1,4 @@
-# typed: …
+# typed: true
 
 module Parquet
   # Options:
@@ -7,13 +7,20 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   #   This is useful for reducing how much data is read and improving performance.
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String])
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(…
+    ).returns(NilClass)
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
@@ -24,6 +31,14 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer)
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
@@ -32,38 +47,61 @@ module Parquet
       batch_size: T.nilable(Integer),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-    ).returns(…
+    ).returns(NilClass)
   end
   def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of values representing each row
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end
 
   # Options:
   # - `read_from`: An Enumerator yielding arrays of column batches
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
+  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
    ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
  end
 end
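Putting the 0.2.8 additions together, a row-based write exercising both new options might look like this (a sketch assembled from the signatures above; data, column names, and paths are illustrative):

require "parquet"

rows = [
  [1, "2024-01-15", "15/01/2024 09:30:00"],
].each

Parquet.write_rows(
  rows,
  schema: [
    { "id" => "int64" },
    { "day" => { "type" => "date32", "format" => "%Y-%m-%d" } },
    { "seen_at" => { "type" => "timestamp_millis", "format" => "%d/%m/%Y %H:%M:%S" } }
  ],
  write_to: "example.parquet",
  batch_size: 500,
  compression: "snappy"
)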