parquet 0.2.6 → 0.2.8

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 794d11142b73d13b665ecdb4ffd46df6ab7d97e5f99336e2bc91b79dbb55a514
-   data.tar.gz: eb2843d724e7aad70445a8b992a527e3bee0a79055fdeab7f2ebd2cdfb6247d6
+   metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+   data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
  SHA512:
-   metadata.gz: 8b97550fb18f2ab4db0b5fbb170d12448237665d9372242d4027760f1c697be0d1e7a8bb47d43886f704e0923ddf57544961fe5af29c596b49aac188f714b9e6
-   data.tar.gz: 1ea56a23e39a084d40690d4e7bd108ec2a4cb20b61714bd564e68600d3f3edda3ffd5c3e646d49d4bb85632ad14f2c7d5735e645610e7a863d9e25d6f1d2b90d
+   metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+   data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md CHANGED
@@ -152,9 +152,16 @@ batches = [
  # Create an enumerator from the batches
  columns = batches.each

- # Write to a parquet file
+ # Write to a parquet file with default ZSTD compression
  Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")

+ # Write to a parquet file with specific compression
+ Parquet.write_columns(columns,
+   schema: schema,
+   write_to: "data.parquet",
+   compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+ )
+
  # Write to an IO object
  File.open("data.parquet", "wb") do |file|
    Parquet.write_columns(columns, schema: schema, write_to: file)
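For reference, a minimal round-trip sketch of the new `compression` option, assuming the `write_columns`/`each_row` signatures documented in this release (the file name, two-column schema, and data below are illustrative):

```ruby
require "parquet"

schema  = [{ "id" => "int64" }, { "name" => "string" }]
batches = [[[1, 2], ["alice", "bob"]]] # one batch of two columns

# Write with gzip instead of the default codec...
Parquet.write_columns(batches.each, schema: schema,
                      write_to: "users.parquet", compression: "gzip")

# ...then read the rows back to confirm the file is intact.
Parquet.each_row("users.parquet") { |row| p row }
# {"id"=>1, "name"=>"alice"}
# {"id"=>2, "name"=>"bob"}
```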
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
  }

  #[derive(Debug, Clone)]
- pub struct ListField {
-     pub item_type: ParquetSchemaType,
+ pub struct ListField<'a> {
+     pub item_type: ParquetSchemaType<'a>,
+     pub format: Option<&'a str>,
  }

  #[derive(Debug, Clone)]
- pub struct MapField {
-     pub key_type: ParquetSchemaType,
-     pub value_type: ParquetSchemaType,
+ pub struct MapField<'a> {
+     pub key_type: ParquetSchemaType<'a>,
+     pub value_type: ParquetSchemaType<'a>,
+     pub format: Option<&'a str>,
  }

  #[derive(Debug, Clone)]
- pub enum ParquetSchemaType {
+ pub enum ParquetSchemaType<'a> {
      Int8,
      Int16,
      Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
      Date32,
      TimestampMillis,
      TimestampMicros,
-     List(Box<ListField>),
-     Map(Box<MapField>),
+     List(Box<ListField<'a>>),
+     Map(Box<MapField<'a>>),
  }
@@ -215,15 +215,15 @@ impl ParquetValue {
              Ok(ParquetValue::Boolean(v))
          }
          ParquetSchemaType::Date32 => {
-             let v = convert_to_date32(value)?;
+             let v = convert_to_date32(value, None)?;
              Ok(ParquetValue::Date32(v))
          }
          ParquetSchemaType::TimestampMillis => {
-             let v = convert_to_timestamp_millis(value)?;
+             let v = convert_to_timestamp_millis(value, None)?;
              Ok(ParquetValue::TimestampMillis(v, None))
          }
          ParquetSchemaType::TimestampMicros => {
-             let v = convert_to_timestamp_micros(value)?;
+             let v = convert_to_timestamp_micros(value, None)?;
              Ok(ParquetValue::TimestampMicros(v, None))
          }
          ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
@@ -30,17 +30,35 @@ where
      }
  }

- pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
-         // Parse string into Timestamp using jiff
-         let date: jiff::civil::Date = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as date32: {}", s, e),
-             )
-         })?;
+         // Parse string into Date using jiff
+         let date = if let Some(fmt) = format {
+             jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                 // Try parsing as DateTime and convert to Date with zero offset
+                 jiff::civil::DateTime::strptime(&fmt, &s)
+                     .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                     .map(|dt| dt.date())
+                     .map_err(|e2| {
+                         MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                 s, fmt, e1, e2
+                             ),
+                         )
+                     })
+             })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as date32: {}", s, e),
+                 )
+             })?
+         };

          let timestamp = date.at(0, 0, 0, 0);

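The `format` parameter threaded through above is what the Ruby-level `format` schema key feeds. A hypothetical schema exercising it via `write_rows` (the column names, file name, and `%d/%m/%Y` pattern are illustrative):

```ruby
schema = [
  { "name"     => "string" },
  { "birthday" => { "type" => "date32", "format" => "%d/%m/%Y" } }
]
rows = [["alice", "15/01/2024"], ["bob", "02/03/1999"]].each
Parquet.write_rows(rows, schema: schema, write_to: "people.parquet")
```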
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
      }
  }

- pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
          // Parse string into Timestamp using jiff
-         let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
-             )
-         })?;
+         let timestamp = if let Some(fmt) = format {
+             jiff::Timestamp::strptime(&fmt, &s)
+                 .or_else(|e1| {
+                     // Try parsing as DateTime and convert to Timestamp with zero offset
+                     jiff::civil::DateTime::strptime(&fmt, &s)
+                         .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                         .map(|dt| dt.timestamp())
+                         .map_err(|e2| {
+                             MagnusError::new(
+                                 magnus::exception::type_error(),
+                                 format!(
+                                     "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                     s, fmt, e1, e2
+                                 ),
+                             )
+                         })
+                 })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                 )
+             })?
+         };
          // Convert to milliseconds
          Ok(timestamp.as_millisecond())
      } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
      }
  }

- pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
      if value.is_kind_of(ruby.class_string()) {
          let s = String::try_convert(value)?;
          // Parse string into Timestamp using jiff
-         let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::type_error(),
-                 format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
-             )
-         })?;
+         let timestamp = if let Some(fmt) = format {
+             jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                 // Try parsing as DateTime and convert to Timestamp with zero offset
+                 jiff::civil::DateTime::strptime(&fmt, &s)
+                     .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                     .map(|dt| dt.timestamp())
+                     .map_err(|e2| {
+                         MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                                 s, fmt, e1, e2
+                             ),
+                         )
+                     })
+             })?
+         } else {
+             s.parse().map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                 )
+             })?
+         };
          // Convert to microseconds
          Ok(timestamp.as_microsecond())
      } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
              ParquetValue::Boolean(v)
          }
          ParquetSchemaType::Date32 => {
-             let v = convert_to_date32(item_value)?;
+             let v = convert_to_date32(item_value, list_field.format)?;
              ParquetValue::Date32(v)
          }
          ParquetSchemaType::TimestampMillis => {
-             let v = convert_to_timestamp_millis(item_value)?;
+             let v = convert_to_timestamp_millis(item_value, list_field.format)?;
              ParquetValue::TimestampMillis(v, None)
          }
          ParquetSchemaType::TimestampMicros => {
-             let v = convert_to_timestamp_micros(item_value)?;
+             let v = convert_to_timestamp_micros(item_value, list_field.format)?;
              ParquetValue::TimestampMicros(v, None)
          }
          ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
              ParquetValue::Boolean(v)
          }
          ParquetSchemaType::Date32 => {
-             let v = convert_to_date32(value)?;
+             let v = convert_to_date32(value, map_field.format)?;
              ParquetValue::Date32(v)
          }
          ParquetSchemaType::TimestampMillis => {
-             let v = convert_to_timestamp_millis(value)?;
+             let v = convert_to_timestamp_millis(value, map_field.format)?;
              ParquetValue::TimestampMillis(v, None)
          }
          ParquetSchemaType::TimestampMicros => {
-             let v = convert_to_timestamp_micros(value)?;
+             let v = convert_to_timestamp_micros(value, map_field.format)?;
              ParquetValue::TimestampMicros(v, None)
          }
          ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -12,17 +12,19 @@ use tempfile::NamedTempFile;
  use crate::types::{ListField, MapField, ParquetSchemaType};

  #[derive(Debug)]
- pub struct SchemaField {
+ pub struct SchemaField<'a> {
      pub name: String,
-     pub type_: ParquetSchemaType,
+     pub type_: ParquetSchemaType<'a>,
+     pub format: Option<String>,
  }

  #[derive(Debug)]
- pub struct ParquetWriteArgs {
+ pub struct ParquetWriteArgs<'a> {
      pub read_from: Value,
      pub write_to: Value,
-     pub schema: Vec<SchemaField>,
+     pub schema: Vec<SchemaField<'a>>,
      pub batch_size: Option<usize>,
+     pub compression: Option<String>,
  }

  pub trait SendableWrite: Send + Write {}
@@ -51,7 +53,7 @@ impl Write for IoLikeValue {
      }
  }

- impl FromStr for ParquetSchemaType {
+ impl<'a> FromStr for ParquetSchemaType<'a> {
      type Err = MagnusError;

      fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +76,12 @@ impl FromStr for ParquetSchemaType {
              "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
              "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                  item_type: ParquetSchemaType::Int8,
+                 format: None,
              }))),
              "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
                  key_type: ParquetSchemaType::String,
                  value_type: ParquetSchemaType::Int8,
+                 format: None,
              }))),
              _ => Err(MagnusError::new(
                  magnus::exception::runtime_error(),
@@ -87,7 +91,7 @@ impl FromStr for ParquetSchemaType {
      }
  }

- impl TryConvert for ParquetSchemaType {
+ impl<'a> TryConvert for ParquetSchemaType<'a> {
      fn try_convert(value: Value) -> Result<Self, MagnusError> {
          let ruby = unsafe { Ruby::get_unchecked() };
          let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +102,7 @@ impl TryConvert for ParquetSchemaType {

  // We know this type is safe to move between threads because it's just an enum
  // with simple primitive types and strings
- unsafe impl Send for ParquetSchemaType {}
+ unsafe impl<'a> Send for ParquetSchemaType<'a> {}

  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
      if value.is_nil() {
@@ -162,17 +166,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
      }
  }

- pub struct ColumnCollector {
+ pub struct ColumnCollector<'a> {
      pub name: String,
-     pub type_: ParquetSchemaType,
+     pub type_: ParquetSchemaType<'a>,
+     pub format: Option<String>,
      pub values: Vec<crate::types::ParquetValue>,
  }

- impl ColumnCollector {
-     pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+ impl<'a> ColumnCollector<'a> {
+     pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
          Self {
              name,
              type_,
+             format,
              values: Vec::new(),
          }
      }
@@ -244,15 +250,15 @@ impl ColumnCollector {
              ParquetValue::Boolean(v)
          }
          ParquetSchemaType::Date32 => {
-             let v = convert_to_date32(value)?;
+             let v = convert_to_date32(value, self.format.as_deref())?;
              ParquetValue::Date32(v)
          }
          ParquetSchemaType::TimestampMillis => {
-             let v = convert_to_timestamp_millis(value)?;
+             let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
              ParquetValue::TimestampMillis(v, None)
          }
          ParquetSchemaType::TimestampMicros => {
-             let v = convert_to_timestamp_micros(value)?;
+             let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
              ParquetValue::TimestampMicros(v, None)
          }
          ParquetSchemaType::List(list_field) => {
@@ -11,7 +11,11 @@ use magnus::{
      value::ReprValue,
      Error as MagnusError, RArray, Ruby, TryConvert, Value,
  };
- use parquet::arrow::ArrowWriter;
+ use parquet::{
+     arrow::ArrowWriter,
+     basic::{Compression, GzipLevel, ZstdLevel},
+     file::properties::WriterProperties,
+ };
  use tempfile::NamedTempFile;

  use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
      let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
      let (read_from,) = parsed_args.required;

-     let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
-         parsed_args.keywords,
-         &["schema", "write_to"],
-         &["batch_size"],
-     )?;
+     let kwargs =
+         get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+             parsed_args.keywords,
+             &["schema", "write_to"],
+             &["batch_size", "compression"],
+         )?;

      let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
          MagnusError::new(
@@ -59,11 +64,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
              ));
          }

-         let (name, type_str) = &entries[0];
+         let (name, type_value) = &entries[0];
          let name = String::try_convert(name.clone())?;
-         let type_ = ParquetSchemaType::try_convert(type_str.clone())?;

-         schema.push(SchemaField { name, type_ });
+         let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+             let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+             let mut type_str = None;
+             let mut format_str = None;
+
+             for (key, value) in type_hash {
+                 let key = String::try_convert(key)?;
+                 match key.as_str() {
+                     "type" => type_str = Some(value),
+                     "format" => format_str = Some(String::try_convert(value)?),
+                     _ => {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!("Unknown key '{}' in type definition", key),
+                         ))
+                     }
+                 }
+             }
+
+             let type_str = type_str.ok_or_else(|| {
+                 MagnusError::new(
+                     magnus::exception::type_error(),
+                     "Missing 'type' in type definition",
+                 )
+             })?;
+
+             (ParquetSchemaType::try_convert(type_str)?, format_str)
+         } else {
+             (ParquetSchemaType::try_convert(type_value.clone())?, None)
+         };
+
+         schema.push(SchemaField {
+             name,
+             type_,
+             format,
+         });
      }

      Ok(ParquetWriteArgs {
@@ -71,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
          write_to: kwargs.required.1,
          schema,
          batch_size: kwargs.optional.0.flatten(),
+         compression: kwargs.optional.1.flatten(),
      })
  }

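In Ruby terms, the parsing above means each schema entry's value may be either a bare type or a hash with a required "type" and an optional "format". A sketch of the accepted and rejected shapes (values are illustrative; the error messages are the ones raised in the code above):

```ruby
ok_bare = { "id" => "int64" }                      # bare type; format defaults to nil
ok_hash = { "ts" => { "type"   => "timestamp_millis",
                      "format" => "%Y-%m-%d %H:%M:%S" } }
# Rejected shapes raise TypeError:
#   {"type" => "int8", "foo" => 1}  => Unknown key 'foo' in type definition
#   {"format" => "%Y"}              => Missing 'type' in type definition
```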
@@ -83,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
          write_to,
          schema,
          batch_size,
+         compression,
      } = parse_parquet_write_args(args)?;

      let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -124,13 +165,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
      let arrow_schema = Arc::new(Schema::new(arrow_fields));

      // Create the writer
-     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

      if read_from.is_kind_of(ruby.class_enumerator()) {
          // Create collectors for each column
          let mut column_collectors: Vec<ColumnCollector> = schema
              .into_iter()
-             .map(|field| ColumnCollector::new(field.name, field.type_))
+             .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
              .collect();

          let mut rows_in_batch = 0;
@@ -204,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
          read_from,
          write_to,
          schema,
-         batch_size: _, // Batch size is determined by the input
+         batch_size: _,
+         compression,
      } = parse_parquet_write_args(args)?;

      // Convert schema to Arrow schema
@@ -244,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
      let arrow_schema = Arc::new(Schema::new(arrow_fields));

      // Create the writer
-     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

      if read_from.is_kind_of(ruby.class_enumerator()) {
          loop {
@@ -326,12 +368,25 @@ fn create_writer(
      ruby: &Ruby,
      write_to: &Value,
      schema: Arc<Schema>,
+     compression: Option<String>,
  ) -> Result<WriterOutput, MagnusError> {
+     // Create writer properties with compression based on the option
+     let props = WriterProperties::builder()
+         .set_compression(match compression.as_deref() {
+             Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+             Some("snappy") => Compression::SNAPPY,
+             Some("gzip") => Compression::GZIP(GzipLevel::default()),
+             Some("lz4") => Compression::LZ4,
+             Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+             _ => Compression::UNCOMPRESSED,
+         })
+         .build();
+
      if write_to.is_kind_of(ruby.class_string()) {
          let path = write_to.to_r_string()?.to_string()?;
          let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
          let writer =
-             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+             ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
          Ok(WriterOutput::File(writer))
      } else {
          // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -348,7 +403,7 @@ fn create_writer(
              )
          })?);
          let writer =
-             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+             ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
          Ok(WriterOutput::TempFile(writer, temp_file))
      }
  }
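Note that in this mapping, a missing `compression:` and any unrecognized string both fall through to `Compression::UNCOMPRESSED`. A quick way to compare the codecs from Ruby, reusing the illustrative `schema`/`batches` from the sketch after the README hunk above:

```ruby
%w[none snappy gzip lz4 zstd].each do |codec|
  Parquet.write_columns(batches.each, schema: schema,
                        write_to: "out-#{codec}.parquet", compression: codec)
  puts format("%-6s %d bytes", codec, File.size("out-#{codec}.parquet"))
end
```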
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.2.6"
+   VERSION = "0.2.8"
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,4 @@
- # typed: strict
+ # typed: true

  module Parquet
    # Options:
@@ -7,13 +7,20 @@ module Parquet
    #   ("hash" or "array" or :hash or :array)
    # - `columns`: When present, only the specified columns will be included in the output.
    #   This is useful for reducing how much data is read and improving performance.
+   sig do
+     params(
+       input: T.any(String, File, StringIO, IO),
+       result_type: T.nilable(T.any(String, Symbol)),
+       columns: T.nilable(T::Array[String])
+     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+   end
    sig do
      params(
        input: T.any(String, File, StringIO, IO),
        result_type: T.nilable(T.any(String, Symbol)),
        columns: T.nilable(T::Array[String]),
        blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
+     ).returns(NilClass)
    end
    def self.each_row(input, result_type: nil, columns: nil, &blk)
    end
@@ -24,6 +31,14 @@ module Parquet
    #   ("hash" or "array" or :hash or :array)
    # - `columns`: When present, only the specified columns will be included in the output.
    # - `batch_size`: When present, specifies the number of rows per batch
+   sig do
+     params(
+       input: T.any(String, File, StringIO, IO),
+       result_type: T.nilable(T.any(String, Symbol)),
+       columns: T.nilable(T::Array[String]),
+       batch_size: T.nilable(Integer)
+     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+   end
    sig do
      params(
        input: T.any(String, File, StringIO, IO),
@@ -32,38 +47,61 @@ module Parquet
        batch_size: T.nilable(Integer),
        blk:
          T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
+     ).returns(NilClass)
    end
    def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
    end

    # Options:
    # - `read_from`: An Enumerator yielding arrays of values representing each row
-   # - `schema`: Array of hashes specifying column names and types
+   # - `schema`: Array of hashes specifying column names and types. Supported types:
+   #   - `int8`, `int16`, `int32`, `int64`
+   #   - `uint8`, `uint16`, `uint32`, `uint64`
+   #   - `float`, `double`
+   #   - `string`
+   #   - `binary`
+   #   - `boolean`
+   #   - `date32`
+   #   - `timestamp_millis`, `timestamp_micros`
    # - `write_to`: String path or IO object to write the parquet file to
    # - `batch_size`: Optional batch size for writing (defaults to 1000)
+   # - `compression`: Optional compression type to use (defaults to "zstd")
+   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
    sig do
      params(
        read_from: T::Enumerator[T::Array[T.untyped]],
        schema: T::Array[T::Hash[String, String]],
        write_to: T.any(String, IO),
-       batch_size: T.nilable(Integer)
+       batch_size: T.nilable(Integer),
+       compression: T.nilable(String)
      ).void
    end
-   def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+   def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
    end

    # Options:
    # - `read_from`: An Enumerator yielding arrays of column batches
-   # - `schema`: Array of hashes specifying column names and types
+   # - `schema`: Array of hashes specifying column names and types. Supported types:
+   #   - `int8`, `int16`, `int32`, `int64`
+   #   - `uint8`, `uint16`, `uint32`, `uint64`
+   #   - `float`, `double`
+   #   - `string`
+   #   - `binary`
+   #   - `boolean`
+   #   - `date32`
+   #   - `timestamp_millis`, `timestamp_micros`
+   #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
    # - `write_to`: String path or IO object to write the parquet file to
+   # - `compression`: Optional compression type to use (defaults to "zstd")
+   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
    sig do
      params(
        read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
        schema: T::Array[T::Hash[String, String]],
-       write_to: T.any(String, IO)
+       write_to: T.any(String, IO),
+       compression: T.nilable(String)
      ).void
    end
-   def self.write_columns(read_from, schema:, write_to:)
+   def self.write_columns(read_from, schema:, write_to:, compression: nil)
    end
  end
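The duplicated `sig` blocks above encode an overload: calling `each_row`/`each_column` without a block returns an Enumerator, while the block form returns nil. Illustratively (file name and options are examples):

```ruby
# Block form: yields batches of up to 500 rows, returns nil.
Parquet.each_column("data.parquet", result_type: :hash, batch_size: 500) do |batch|
  p batch.keys
end

# Blockless form: returns an Enumerator over the same batches.
enum = Parquet.each_column("data.parquet", result_type: :hash, batch_size: 500)
p enum.first
```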
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.2.6
+   version: 0.2.8
  platform: ruby
  authors:
  - Nathan Jaremko