parquet 0.2.6 → 0.2.8
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/ext/parquet/src/types/core_types.rs +10 -8
- data/ext/parquet/src/types/parquet_value.rs +3 -3
- data/ext/parquet/src/types/type_conversion.rs +84 -28
- data/ext/parquet/src/types/writer_types.rs +20 -14
- data/ext/parquet/src/writer/mod.rs +70 -15
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +47 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md
CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each

-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")

+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
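For context, a minimal Ruby sketch of the row-oriented counterpart, using the write_rows keyword arguments declared in parquet.rbi further below; the schema, data, and file name here are illustrative, not taken from the README:

rows = [
  [1, "alice"],
  [2, "bob"]
].each

schema = [
  { "id" => "int64" },
  { "name" => "string" }
]

# compression accepts "none", "uncompressed", "snappy", "gzip", "lz4" or "zstd"
Parquet.write_rows(rows, schema: schema, write_to: "users.parquet", compression: "gzip")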
data/ext/parquet/src/types/core_types.rs
CHANGED
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
 }

 #[derive(Debug, Clone)]
-pub struct ListField {
-    pub item_type: ParquetSchemaType
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }

 #[derive(Debug, Clone)]
-pub struct MapField {
-    pub key_type: ParquetSchemaType,
-    pub value_type: ParquetSchemaType
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
 }

 #[derive(Debug, Clone)]
-pub enum ParquetSchemaType {
+pub enum ParquetSchemaType<'a> {
     Int8,
     Int16,
     Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
     Date32,
     TimestampMillis,
     TimestampMicros,
-    List(Box<ListField>),
-    Map(Box<MapField>),
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
 }
data/ext/parquet/src/types/parquet_value.rs
CHANGED
@@ -215,15 +215,15 @@ impl ParquetValue {
                 Ok(ParquetValue::Boolean(v))
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, None)?;
                 Ok(ParquetValue::Date32(v))
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, None)?;
                 Ok(ParquetValue::TimestampMillis(v, None))
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, None)?;
                 Ok(ParquetValue::TimestampMicros(v, None))
             }
             ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -30,17 +30,35 @@ where
     }
 }

-pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
+pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
-        // Parse string into
-        let date
-
-
-
-
-
+        // Parse string into Date using jiff
+        let date = if let Some(fmt) = format {
+            jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Date with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s)
+                    .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                    .map(|dt| dt.date())
+                    .map_err(|e2| {
+                        MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!(
+                                "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
+                                s, fmt, e1, e2
+                            ),
+                        )
+                    })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as date32: {}", s, e),
+                )
+            })?
+        };

         let timestamp = date.at(0, 0, 0, 0);

@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
     }
 }

-pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s)
+                .or_else(|e1| {
+                    // Try parsing as DateTime and convert to Timestamp with zero offset
+                    jiff::civil::DateTime::strptime(&fmt, &s)
+                        .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
+                        .map(|dt| dt.timestamp())
+                        .map_err(|e2| {
+                            MagnusError::new(
+                                magnus::exception::type_error(),
+                                format!(
+                                    "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
+                                    s, fmt, e1, e2
+                                ),
+                            )
+                        })
+                })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
+                )
+            })?
+        };
         // Convert to milliseconds
         Ok(timestamp.as_millisecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
     }
 }

-pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
+pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     if value.is_kind_of(ruby.class_string()) {
         let s = String::try_convert(value)?;
         // Parse string into Timestamp using jiff
-        let timestamp
-
-
-
-
-
+        let timestamp = if let Some(fmt) = format {
+            jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
+                // Try parsing as DateTime and convert to Timestamp with zero offset
+                jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
+                    dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
+                })
+                .map(|dt| dt.timestamp())
+                .map_err(|e2| {
+                    MagnusError::new(
+                        magnus::exception::type_error(),
+                        format!(
+                            "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
+                            s, fmt, e1, e2
+                        ),
+                    )
+                })
+            })?
+        } else {
+            s.parse().map_err(|e| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
+                )
+            })?
+        };
         // Convert to microseconds
         Ok(timestamp.as_microsecond())
     } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(item_value)?;
+            let v = convert_to_date32(item_value, list_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(item_value)?;
+            let v = convert_to_timestamp_millis(item_value, list_field.format)?;
             ParquetValue::TimestampMillis(v, None)
         }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(item_value)?;
+            let v = convert_to_timestamp_micros(item_value, list_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
             ParquetValue::Boolean(v)
         }
         ParquetSchemaType::Date32 => {
-            let v = convert_to_date32(value)?;
+            let v = convert_to_date32(value, map_field.format)?;
             ParquetValue::Date32(v)
         }
         ParquetSchemaType::TimestampMillis => {
-            let v = convert_to_timestamp_millis(value)?;
+            let v = convert_to_timestamp_millis(value, map_field.format)?;
             ParquetValue::TimestampMillis(v, None)
        }
         ParquetSchemaType::TimestampMicros => {
-            let v = convert_to_timestamp_micros(value)?;
+            let v = convert_to_timestamp_micros(value, map_field.format)?;
             ParquetValue::TimestampMicros(v, None)
         }
         ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
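A hedged Ruby sketch of what this change enables at the gem level: date and timestamp columns can be fed strings, and an optional per-column format (forwarded to the strptime calls above) can be supplied via the hash schema form documented in parquet.rbi below. Column names, the format string, and the data are illustrative:

rows = [
  ["15/01/2024", "2024-01-15T10:30:00Z"],
  ["01/02/2024", "2024-02-01T08:00:00Z"]
].each

schema = [
  { "purchased_on" => { "type" => "date32", "format" => "%d/%m/%Y" } },
  { "logged_at" => "timestamp_millis" } # no format: falls back to the plain string parser
]

Parquet.write_rows(rows, schema: schema, write_to: "events.parquet")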
data/ext/parquet/src/types/writer_types.rs
CHANGED
@@ -12,17 +12,19 @@ use tempfile::NamedTempFile;
 use crate::types::{ListField, MapField, ParquetSchemaType};

 #[derive(Debug)]
-pub struct SchemaField {
+pub struct SchemaField<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
 }

 #[derive(Debug)]
-pub struct ParquetWriteArgs {
+pub struct ParquetWriteArgs<'a> {
     pub read_from: Value,
     pub write_to: Value,
-    pub schema: Vec<SchemaField>,
+    pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub compression: Option<String>,
 }

 pub trait SendableWrite: Send + Write {}
@@ -51,7 +53,7 @@ impl Write for IoLikeValue {
     }
 }

-impl FromStr for ParquetSchemaType {
+impl<'a> FromStr for ParquetSchemaType<'a> {
     type Err = MagnusError;

     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +76,12 @@ impl FromStr for ParquetSchemaType {
             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
                 item_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
                 key_type: ParquetSchemaType::String,
                 value_type: ParquetSchemaType::Int8,
+                format: None,
             }))),
             _ => Err(MagnusError::new(
                 magnus::exception::runtime_error(),
@@ -87,7 +91,7 @@ impl FromStr for ParquetSchemaType {
     }
 }

-impl TryConvert for ParquetSchemaType {
+impl<'a> TryConvert for ParquetSchemaType<'a> {
     fn try_convert(value: Value) -> Result<Self, MagnusError> {
         let ruby = unsafe { Ruby::get_unchecked() };
         let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +102,7 @@ impl TryConvert for ParquetSchemaType {

 // We know this type is safe to move between threads because it's just an enum
 // with simple primitive types and strings
-unsafe impl Send for ParquetSchemaType {}
+unsafe impl<'a> Send for ParquetSchemaType<'a> {}

 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
     if value.is_nil() {
@@ -162,17 +166,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
     }
 }

-pub struct ColumnCollector {
+pub struct ColumnCollector<'a> {
     pub name: String,
-    pub type_: ParquetSchemaType,
+    pub type_: ParquetSchemaType<'a>,
+    pub format: Option<String>,
     pub values: Vec<crate::types::ParquetValue>,
 }

-impl ColumnCollector {
-    pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+impl<'a> ColumnCollector<'a> {
+    pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
         Self {
             name,
             type_,
+            format,
             values: Vec::new(),
         }
     }
@@ -244,15 +250,15 @@ impl ColumnCollector {
                 ParquetValue::Boolean(v)
             }
             ParquetSchemaType::Date32 => {
-                let v = convert_to_date32(value)?;
+                let v = convert_to_date32(value, self.format.as_deref())?;
                 ParquetValue::Date32(v)
             }
             ParquetSchemaType::TimestampMillis => {
-                let v = convert_to_timestamp_millis(value)?;
+                let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
                 ParquetValue::TimestampMillis(v, None)
             }
             ParquetSchemaType::TimestampMicros => {
-                let v = convert_to_timestamp_micros(value)?;
+                let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
                 ParquetValue::TimestampMicros(v, None)
             }
             ParquetSchemaType::List(list_field) => {
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;

 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;

-    let kwargs =
-
-
-
-
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;

     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -59,11 +64,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             ));
         }

-        let (name, type_str) = &entries[0];
+        let (name, type_value) = &entries[0];
         let name = String::try_convert(name.clone())?;
-        let type_ = ParquetSchemaType::try_convert(type_str.clone())?;

-
+        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
+            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
+            let mut type_str = None;
+            let mut format_str = None;
+
+            for (key, value) in type_hash {
+                let key = String::try_convert(key)?;
+                match key.as_str() {
+                    "type" => type_str = Some(value),
+                    "format" => format_str = Some(String::try_convert(value)?),
+                    _ => {
+                        return Err(MagnusError::new(
+                            magnus::exception::type_error(),
+                            format!("Unknown key '{}' in type definition", key),
+                        ))
+                    }
+                }
+            }
+
+            let type_str = type_str.ok_or_else(|| {
+                MagnusError::new(
+                    magnus::exception::type_error(),
+                    "Missing 'type' in type definition",
+                )
+            })?;
+
+            (ParquetSchemaType::try_convert(type_str)?, format_str)
+        } else {
+            (ParquetSchemaType::try_convert(type_value.clone())?, None)
+        };
+
+        schema.push(SchemaField {
+            name,
+            type_,
+            format,
+        });
     }

     Ok(ParquetWriteArgs {
@@ -71,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }

@@ -83,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;

     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -124,13 +165,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
             .into_iter()
-            .map(|field| ColumnCollector::new(field.name, field.type_))
+            .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
             .collect();

         let mut rows_in_batch = 0;
@@ -204,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;

     // Convert schema to Arrow schema
@@ -244,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -326,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -348,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
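A short Ruby sketch of how parse_parquet_write_args above treats schema entries: a plain type string and a {"type" => ..., "format" => ...} hash are both accepted, while any other key in the hash is rejected. The error message is taken from the diff; raising it as a Ruby TypeError is an assumption based on magnus::exception::type_error, and the data is illustrative:

schema = [
  { "id" => "int32" },                                        # plain string form
  { "day" => { "type" => "date32", "format" => "%Y-%m-%d" } } # hash form with optional format
]

bad_schema = [
  { "day" => { "type" => "date32", "fmt" => "%Y-%m-%d" } }    # "fmt" is not a recognized key
]

begin
  Parquet.write_rows([["2024-01-01"]].each, schema: bad_schema, write_to: "bad.parquet")
rescue TypeError => e
  puts e.message # expected to mention: Unknown key 'fmt' in type definition
end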
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,4 +1,4 @@
-# typed:
+# typed: true

 module Parquet
   # Options:
@@ -7,13 +7,20 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   #   This is useful for reducing how much data is read and improving performance.
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String])
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
@@ -24,6 +31,14 @@ module Parquet
   #   ("hash" or "array" or :hash or :array)
   # - `columns`: When present, only the specified columns will be included in the output.
   # - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer)
+    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
+  end
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
@@ -32,38 +47,61 @@ module Parquet
       batch_size: T.nilable(Integer),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-    ).returns(
+    ).returns(NilClass)
   end
   def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end

   # Options:
   # - `read_from`: An Enumerator yielding arrays of values representing each row
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end

   # Options:
   # - `read_from`: An Enumerator yielding arrays of column batches
-  # - `schema`: Array of hashes specifying column names and types
+  # - `schema`: Array of hashes specifying column names and types. Supported types:
+  #   - `int8`, `int16`, `int32`, `int64`
+  #   - `uint8`, `uint16`, `uint32`, `uint64`
+  #   - `float`, `double`
+  #   - `string`
+  #   - `binary`
+  #   - `boolean`
+  #   - `date32`
+  #   - `timestamp_millis`, `timestamp_micros`
+  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
   end
 end
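A brief usage sketch matching the signatures above: each_row and each_column yield to a block, or return an Enumerator when no block is given (the newly added sig overloads). The file name, column names, and batch size are illustrative:

# Block form
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
  puts row["id"]
end

# Enumerator form (no block)
rows = Parquet.each_row("data.parquet", result_type: :array)
first_two = rows.take(2)

# Column batches, 512 rows at a time
Parquet.each_column("data.parquet", batch_size: 512) do |batch|
  # batch maps column name => Array of values (or an Array of Arrays with result_type: :array)
end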