parquet 0.2.6 → 0.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 794d11142b73d13b665ecdb4ffd46df6ab7d97e5f99336e2bc91b79dbb55a514
-   data.tar.gz: eb2843d724e7aad70445a8b992a527e3bee0a79055fdeab7f2ebd2cdfb6247d6
+   metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+   data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
  SHA512:
-   metadata.gz: 8b97550fb18f2ab4db0b5fbb170d12448237665d9372242d4027760f1c697be0d1e7a8bb47d43886f704e0923ddf57544961fe5af29c596b49aac188f714b9e6
-   data.tar.gz: 1ea56a23e39a084d40690d4e7bd108ec2a4cb20b61714bd564e68600d3f3edda3ffd5c3e646d49d4bb85632ad14f2c7d5735e645610e7a863d9e25d6f1d2b90d
+   metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+   data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md CHANGED
@@ -152,9 +152,16 @@ batches = [
  # Create an enumerator from the batches
  columns = batches.each

- # Write to a parquet file
+ # Write to a parquet file with default ZSTD compression
  Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")

+ # Write to a parquet file with specific compression
+ Parquet.write_columns(columns,
+   schema: schema,
+   write_to: "data.parquet",
+   compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+ )
+
  # Write to an IO object
  File.open("data.parquet", "wb") do |file|
    Parquet.write_columns(columns, schema: schema, write_to: file)
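The new compression option is not limited to write_columns; write_rows accepts it as well (see the updated RBI sigs later in this diff). A minimal sketch, with a made-up schema and data:

  require "parquet"

  # Illustrative schema; any supported type name works here
  schema = [
    { "id" => "int64" },
    { "name" => "string" }
  ]

  rows = [[1, "alice"], [2, "bob"]].each

  # Same accepted values as write_columns: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  Parquet.write_rows(rows, schema: schema, write_to: "rows.parquet", compression: "gzip")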
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
  }

  #[derive(Debug, Clone)]
- pub struct ListField {
-     pub item_type: ParquetSchemaType,
+ pub struct ListField<'a> {
+     pub item_type: ParquetSchemaType<'a>,
+     pub format: Option<&'a str>,
  }

  #[derive(Debug, Clone)]
- pub struct MapField {
-     pub key_type: ParquetSchemaType,
-     pub value_type: ParquetSchemaType,
+ pub struct MapField<'a> {
+     pub key_type: ParquetSchemaType<'a>,
+     pub value_type: ParquetSchemaType<'a>,
+     pub format: Option<&'a str>,
  }

  #[derive(Debug, Clone)]
- pub enum ParquetSchemaType {
+ pub enum ParquetSchemaType<'a> {
      Int8,
      Int16,
      Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
      Date32,
      TimestampMillis,
      TimestampMicros,
-     List(Box<ListField>),
-     Map(Box<MapField>),
+     List(Box<ListField<'a>>),
+     Map(Box<MapField<'a>>),
  }
@@ -215,15 +215,15 @@ impl ParquetValue {
          Ok(ParquetValue::Boolean(v))
      }
      ParquetSchemaType::Date32 => {
-         let v = convert_to_date32(value)?;
+         let v = convert_to_date32(value, None)?;
          Ok(ParquetValue::Date32(v))
      }
      ParquetSchemaType::TimestampMillis => {
-         let v = convert_to_timestamp_millis(value)?;
+         let v = convert_to_timestamp_millis(value, None)?;
          Ok(ParquetValue::TimestampMillis(v, None))
      }
      ParquetSchemaType::TimestampMicros => {
-         let v = convert_to_timestamp_micros(value)?;
+         let v = convert_to_timestamp_micros(value, None)?;
          Ok(ParquetValue::TimestampMicros(v, None))
      }
      ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
@@ -30,17 +30,35 @@ where
      }
  }

- pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
-         // Parse string into Timestamp using jiff
-         let date: jiff::civil::Date = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as date32: {}", s, e),
-             )
-         })?;
+         // Parse string into Date using jiff
+         let date = if let Some(fmt) = format {
+             jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                 // Try parsing as DateTime and convert to Date with zero offset
+                 jiff::civil::DateTime::strptime(&fmt, &s)
+                     .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                     .map(|dt| dt.date())
+                     .map_err(|e2| {
+                         MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                 s, fmt, e1, e2
+                             ),
+                         )
+                     })
+             })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as date32: {}", s, e),
+                 )
+             })?
+         };

          let timestamp = date.at(0, 0, 0, 0);

@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
      }
  }

- pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
          // Parse string into Timestamp using jiff
-         let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
-             )
-         })?;
+         let timestamp = if let Some(fmt) = format {
+             jiff::Timestamp::strptime(&fmt, &s)
+                 .or_else(|e1| {
+                     // Try parsing as DateTime and convert to Timestamp with zero offset
+                     jiff::civil::DateTime::strptime(&fmt, &s)
+                         .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                         .map(|dt| dt.timestamp())
+                         .map_err(|e2| {
+                             MagnusError::new(
+                                 magnus::exception::type_error(),
+                                 format!(
+                                     "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                     s, fmt, e1, e2
+                                 ),
+                             )
+                         })
+                 })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                 )
+             })?
+         };
          // Convert to milliseconds
          Ok(timestamp.as_millisecond())
      } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
      }
  }

- pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
          // Parse string into Timestamp using jiff
-         let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
-             )
-         })?;
+         let timestamp = if let Some(fmt) = format {
+             jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                 // Try parsing as DateTime and convert to Timestamp with zero offset
+                 jiff::civil::DateTime::strptime(&fmt, &s)
+                     .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                     .map(|dt| dt.timestamp())
+                     .map_err(|e2| {
+                         MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                                 s, fmt, e1, e2
+                             ),
+                         )
+                     })
+             })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                 )
+             })?
+         };
          // Convert to microseconds
          Ok(timestamp.as_microsecond())
      } else if value.is_kind_of(ruby.class_time()) {
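These converters back the new per-column "format" option: when a schema entry carries a format string, date32 and timestamp values supplied as Ruby strings are parsed with jiff's strptime, with the DateTime fallback shown above. A sketch of how that option is spelled on the Ruby side, using the hash schema syntax introduced later in this diff (column names and format strings are illustrative):

  schema = [
    { "event_date" => { "type" => "date32", "format" => "%Y-%m-%d" } },
    { "logged_at"  => { "type" => "timestamp_millis", "format" => "%Y-%m-%d %H:%M:%S" } }
  ]

  rows = [
    ["2025-01-02", "2025-01-02 10:30:00"],
    ["2025-01-03", "2025-01-03 11:45:10"]
  ].each

  Parquet.write_rows(rows, schema: schema, write_to: "events.parquet")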
@@ -204,15 +260,15 @@ pub fn convert_to_list(
          ParquetValue::Boolean(v)
      }
      ParquetSchemaType::Date32 => {
-         let v = convert_to_date32(item_value)?;
+         let v = convert_to_date32(item_value, list_field.format)?;
          ParquetValue::Date32(v)
      }
      ParquetSchemaType::TimestampMillis => {
-         let v = convert_to_timestamp_millis(item_value)?;
+         let v = convert_to_timestamp_millis(item_value, list_field.format)?;
          ParquetValue::TimestampMillis(v, None)
      }
      ParquetSchemaType::TimestampMicros => {
-         let v = convert_to_timestamp_micros(item_value)?;
+         let v = convert_to_timestamp_micros(item_value, list_field.format)?;
          ParquetValue::TimestampMicros(v, None)
      }
      ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
          ParquetValue::Boolean(v)
      }
      ParquetSchemaType::Date32 => {
-         let v = convert_to_date32(value)?;
+         let v = convert_to_date32(value, map_field.format)?;
          ParquetValue::Date32(v)
      }
      ParquetSchemaType::TimestampMillis => {
-         let v = convert_to_timestamp_millis(value)?;
+         let v = convert_to_timestamp_millis(value, map_field.format)?;
          ParquetValue::TimestampMillis(v, None)
      }
      ParquetSchemaType::TimestampMicros => {
-         let v = convert_to_timestamp_micros(value)?;
+         let v = convert_to_timestamp_micros(value, map_field.format)?;
          ParquetValue::TimestampMicros(v, None)
      }
      ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -12,17 +12,19 @@ use tempfile::NamedTempFile;
  use crate::types::{ListField, MapField, ParquetSchemaType};

  #[derive(Debug)]
- pub struct SchemaField {
+ pub struct SchemaField<'a> {
      pub name: String,
-     pub type_: ParquetSchemaType,
+     pub type_: ParquetSchemaType<'a>,
+     pub format: Option<String>,
  }

  #[derive(Debug)]
- pub struct ParquetWriteArgs {
+ pub struct ParquetWriteArgs<'a> {
      pub read_from: Value,
      pub write_to: Value,
-     pub schema: Vec<SchemaField>,
+     pub schema: Vec<SchemaField<'a>>,
      pub batch_size: Option<usize>,
+     pub compression: Option<String>,
  }

  pub trait SendableWrite: Send + Write {}
@@ -51,7 +53,7 @@ impl Write for IoLikeValue {
      }
  }

- impl FromStr for ParquetSchemaType {
+ impl<'a> FromStr for ParquetSchemaType<'a> {
      type Err = MagnusError;

      fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +76,12 @@ impl FromStr for ParquetSchemaType {
          "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
          "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
              item_type: ParquetSchemaType::Int8,
+             format: None,
          }))),
          "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
              key_type: ParquetSchemaType::String,
              value_type: ParquetSchemaType::Int8,
+             format: None,
          }))),
          _ => Err(MagnusError::new(
              magnus::exception::runtime_error(),
@@ -87,7 +91,7 @@ impl FromStr for ParquetSchemaType {
      }
  }

- impl TryConvert for ParquetSchemaType {
+ impl<'a> TryConvert for ParquetSchemaType<'a> {
      fn try_convert(value: Value) -> Result<Self, MagnusError> {
          let ruby = unsafe { Ruby::get_unchecked() };
          let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +102,7 @@ impl TryConvert for ParquetSchemaType {

  // We know this type is safe to move between threads because it's just an enum
  // with simple primitive types and strings
- unsafe impl Send for ParquetSchemaType {}
+ unsafe impl<'a> Send for ParquetSchemaType<'a> {}

  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
      if value.is_nil() {
@@ -162,17 +166,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
      }
  }

- pub struct ColumnCollector {
+ pub struct ColumnCollector<'a> {
      pub name: String,
-     pub type_: ParquetSchemaType,
+     pub type_: ParquetSchemaType<'a>,
+     pub format: Option<String>,
      pub values: Vec<crate::types::ParquetValue>,
  }

- impl ColumnCollector {
-     pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+ impl<'a> ColumnCollector<'a> {
+     pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
          Self {
              name,
              type_,
+             format,
              values: Vec::new(),
          }
      }
@@ -244,15 +250,15 @@ impl ColumnCollector {
          ParquetValue::Boolean(v)
      }
      ParquetSchemaType::Date32 => {
-         let v = convert_to_date32(value)?;
+         let v = convert_to_date32(value, self.format.as_deref())?;
          ParquetValue::Date32(v)
      }
      ParquetSchemaType::TimestampMillis => {
-         let v = convert_to_timestamp_millis(value)?;
+         let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
          ParquetValue::TimestampMillis(v, None)
      }
      ParquetSchemaType::TimestampMicros => {
-         let v = convert_to_timestamp_micros(value)?;
+         let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
          ParquetValue::TimestampMicros(v, None)
      }
      ParquetSchemaType::List(list_field) => {
@@ -11,7 +11,11 @@ use magnus::{
      value::ReprValue,
      Error as MagnusError, RArray, Ruby, TryConvert, Value,
  };
- use parquet::arrow::ArrowWriter;
+ use parquet::{
+     arrow::ArrowWriter,
+     basic::{Compression, GzipLevel, ZstdLevel},
+     file::properties::WriterProperties,
+ };
  use tempfile::NamedTempFile;

  use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
      let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
      let (read_from,) = parsed_args.required;

-     let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
-         parsed_args.keywords,
-         &["schema", "write_to"],
-         &["batch_size"],
-     )?;
+     let kwargs =
+         get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+             parsed_args.keywords,
+             &["schema", "write_to"],
+             &["batch_size", "compression"],
+         )?;

      let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
          MagnusError::new(
@@ -59,11 +64,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
          ));
      }

-     let (name, type_str) = &entries[0];
+     let (name, type_value) = &entries[0];
      let name = String::try_convert(name.clone())?;
-     let type_ = ParquetSchemaType::try_convert(type_str.clone())?;

-     schema.push(SchemaField { name, type_ });
+     let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+         let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+         let mut type_str = None;
+         let mut format_str = None;
+
+         for (key, value) in type_hash {
+             let key = String::try_convert(key)?;
+             match key.as_str() {
+                 "type" => type_str = Some(value),
+                 "format" => format_str = Some(String::try_convert(value)?),
+                 _ => {
+                     return Err(MagnusError::new(
+                         magnus::exception::type_error(),
+                         format!("Unknown key '{}' in type definition", key),
+                     ))
+                 }
+             }
+         }
+
+         let type_str = type_str.ok_or_else(|| {
+             MagnusError::new(
+                 magnus::exception::type_error(),
+                 "Missing 'type' in type definition",
+             )
+         })?;
+
+         (ParquetSchemaType::try_convert(type_str)?, format_str)
+     } else {
+         (ParquetSchemaType::try_convert(type_value.clone())?, None)
+     };
+
+     schema.push(SchemaField {
+         name,
+         type_,
+         format,
+     });
  }

  Ok(ParquetWriteArgs {
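In other words, a schema entry may map a column name either to a plain type string or to a hash with a "type" key and an optional "format" key; any other key in that hash is rejected with a TypeError. Illustrative Ruby entries (column names made up):

  { "id" => "int64" }                                                     # plain type string
  { "created_at" => { "type" => "timestamp_millis",
                      "format" => "%Y-%m-%d %H:%M:%S" } }                 # hash with type + optional format
  # { "created_at" => { "fmt" => "..." } } would raise ("Unknown key 'fmt' in type definition")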
@@ -71,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
          write_to: kwargs.required.1,
          schema,
          batch_size: kwargs.optional.0.flatten(),
+         compression: kwargs.optional.1.flatten(),
      })
  }

@@ -83,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
          write_to,
          schema,
          batch_size,
+         compression,
      } = parse_parquet_write_args(args)?;

      let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -124,13 +165,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
      let arrow_schema = Arc::new(Schema::new(arrow_fields));

      // Create the writer
-     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

      if read_from.is_kind_of(ruby.class_enumerator()) {
          // Create collectors for each column
          let mut column_collectors: Vec<ColumnCollector> = schema
              .into_iter()
-             .map(|field| ColumnCollector::new(field.name, field.type_))
+             .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
              .collect();

          let mut rows_in_batch = 0;
@@ -204,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
          read_from,
          write_to,
          schema,
-         batch_size: _, // Batch size is determined by the input
+         batch_size: _,
+         compression,
      } = parse_parquet_write_args(args)?;

      // Convert schema to Arrow schema
@@ -244,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
      let arrow_schema = Arc::new(Schema::new(arrow_fields));

      // Create the writer
-     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

      if read_from.is_kind_of(ruby.class_enumerator()) {
          loop {
@@ -326,12 +368,25 @@ fn create_writer(
      ruby: &Ruby,
      write_to: &Value,
      schema: Arc<Schema>,
+     compression: Option<String>,
  ) -> Result<WriterOutput, MagnusError> {
+     // Create writer properties with compression based on the option
+     let props = WriterProperties::builder()
+         .set_compression(match compression.as_deref() {
+             Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+             Some("snappy") => Compression::SNAPPY,
+             Some("gzip") => Compression::GZIP(GzipLevel::default()),
+             Some("lz4") => Compression::LZ4,
+             Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+             _ => Compression::UNCOMPRESSED,
+         })
+         .build();
+
      if write_to.is_kind_of(ruby.class_string()) {
          let path = write_to.to_r_string()?.to_string()?;
          let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
          let writer =
-             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+             ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
          Ok(WriterOutput::File(writer))
      } else {
          // Create a temporary file to write to instead of directly to the IoLikeValue
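The match above is the full mapping from Ruby-level compression strings to the parquet crate's Compression values; strings outside this set fall through to UNCOMPRESSED. A quick illustrative loop over the accepted values, reusing the schema and batches from the README example (output file names are arbitrary):

  %w[none uncompressed snappy gzip lz4 zstd].each do |codec|
    Parquet.write_columns(batches.each, schema: schema, write_to: "data.#{codec}.parquet", compression: codec)
  end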
@@ -348,7 +403,7 @@ fn create_writer(
              )
          })?);
          let writer =
-             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+             ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
          Ok(WriterOutput::TempFile(writer, temp_file))
      }
  }
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.2.6"
+   VERSION = "0.2.8"
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,4 @@
- # typed: strict
+ # typed: true

  module Parquet
    # Options:
@@ -7,13 +7,20 @@ module Parquet
    #   ("hash" or "array" or :hash or :array)
    # - `columns`: When present, only the specified columns will be included in the output.
    #   This is useful for reducing how much data is read and improving performance.
+   sig do
+     params(
+       input: T.any(String, File, StringIO, IO),
+       result_type: T.nilable(T.any(String, Symbol)),
+       columns: T.nilable(T::Array[String])
+     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+   end
    sig do
      params(
        input: T.any(String, File, StringIO, IO),
        result_type: T.nilable(T.any(String, Symbol)),
        columns: T.nilable(T::Array[String]),
        blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
+     ).returns(NilClass)
    end
    def self.each_row(input, result_type: nil, columns: nil, &blk)
    end
@@ -24,6 +31,14 @@ module Parquet
    #   ("hash" or "array" or :hash or :array)
    # - `columns`: When present, only the specified columns will be included in the output.
    # - `batch_size`: When present, specifies the number of rows per batch
+   sig do
+     params(
+       input: T.any(String, File, StringIO, IO),
+       result_type: T.nilable(T.any(String, Symbol)),
+       columns: T.nilable(T::Array[String]),
+       batch_size: T.nilable(Integer)
+     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+   end
    sig do
      params(
        input: T.any(String, File, StringIO, IO),
@@ -32,38 +47,61 @@ module Parquet
        batch_size: T.nilable(Integer),
        blk:
          T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
+     ).returns(NilClass)
    end
    def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
    end

    # Options:
    # - `read_from`: An Enumerator yielding arrays of values representing each row
-   # - `schema`: Array of hashes specifying column names and types
+   # - `schema`: Array of hashes specifying column names and types. Supported types:
+   #   - `int8`, `int16`, `int32`, `int64`
+   #   - `uint8`, `uint16`, `uint32`, `uint64`
+   #   - `float`, `double`
+   #   - `string`
+   #   - `binary`
+   #   - `boolean`
+   #   - `date32`
+   #   - `timestamp_millis`, `timestamp_micros`
    # - `write_to`: String path or IO object to write the parquet file to
    # - `batch_size`: Optional batch size for writing (defaults to 1000)
+   # - `compression`: Optional compression type to use (defaults to "zstd")
+   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
    sig do
      params(
        read_from: T::Enumerator[T::Array[T.untyped]],
        schema: T::Array[T::Hash[String, String]],
        write_to: T.any(String, IO),
-       batch_size: T.nilable(Integer)
+       batch_size: T.nilable(Integer),
+       compression: T.nilable(String)
      ).void
    end
-   def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+   def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
    end

    # Options:
    # - `read_from`: An Enumerator yielding arrays of column batches
-   # - `schema`: Array of hashes specifying column names and types
+   # - `schema`: Array of hashes specifying column names and types. Supported types:
+   #   - `int8`, `int16`, `int32`, `int64`
+   #   - `uint8`, `uint16`, `uint32`, `uint64`
+   #   - `float`, `double`
+   #   - `string`
+   #   - `binary`
+   #   - `boolean`
+   #   - `date32`
+   #   - `timestamp_millis`, `timestamp_micros`
+   #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
    # - `write_to`: String path or IO object to write the parquet file to
+   # - `compression`: Optional compression type to use (defaults to "zstd")
+   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
    sig do
      params(
        read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
        schema: T::Array[T::Hash[String, String]],
-       write_to: T.any(String, IO)
+       write_to: T.any(String, IO),
+       compression: T.nilable(String)
      ).void
    end
-   def self.write_columns(read_from, schema:, write_to:)
+   def self.write_columns(read_from, schema:, write_to:, compression: nil)
    end
  end
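Beyond the new compression parameter, the reworked sigs also pin down the read side: each_row and each_column return an Enumerator when called without a block and nil when a block is given. A short sketch (the path is illustrative):

  rows = Parquet.each_row("data.parquet")          # => Enumerator
  rows.take(5)

  Parquet.each_column("data.parquet", batch_size: 512) do |batch|
    # process one batch of columns; the block form returns nil
  end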
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.2.6
+   version: 0.2.8
  platform: ruby
  authors:
  - Nathan Jaremko