RubyGems - parquet - Versions diffs - 0.5.2 → 0.5.4 - Mend

parquet 0.5.2 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/Cargo.lock +9 -1
data/README.md +228 -4
data/ext/parquet/Cargo.toml +4 -0
data/ext/parquet/build.rs +5 -0
data/ext/parquet/src/lib.rs +1 -0
data/ext/parquet/src/reader/common.rs +7 -6
data/ext/parquet/src/reader/mod.rs +204 -0
data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
data/ext/parquet/src/ruby_reader.rs +3 -5
data/ext/parquet/src/types/core_types.rs +1 -0
data/ext/parquet/src/types/mod.rs +8 -5
data/ext/parquet/src/types/parquet_value.rs +199 -7
data/ext/parquet/src/types/record_types.rs +16 -5
data/ext/parquet/src/types/schema_converter.rs +118 -11
data/ext/parquet/src/types/schema_node.rs +83 -2
data/ext/parquet/src/types/timestamp.rs +6 -10
data/ext/parquet/src/types/type_conversion.rs +84 -11
data/ext/parquet/src/types/writer_types.rs +59 -11
data/ext/parquet/src/utils.rs +6 -6
data/ext/parquet/src/writer/mod.rs +25 -17
data/ext/parquet/src/writer/write_columns.rs +27 -24
data/ext/parquet/src/writer/write_rows.rs +14 -15
data/lib/parquet/schema.rb +89 -4
data/lib/parquet/version.rb +1 -1
data/lib/parquet.rbi +11 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 896f2833b6db8e4466af8fc9d43eb5c695e25a207a6f8050d22052458edded36
-  data.tar.gz: 38de2831bf7013e0194b2e61a91b26a1283fef65a04309dfbe125c570d64e9ed
+  metadata.gz: 936feb49be7a1bbbb36236551480ae0522d6b52443e76b4ebb7502abdb9d2903
+  data.tar.gz: bcc56665ec0cd132e22c262373e7b1294e085be364c93efbd214e434ada7dcb6
 SHA512:
-  metadata.gz: 52d83bc198f789856eac4bff7ff985a82c3f03f75e5de79efc5b388ce5afc63cb507b4cacc90625ee321619ade1ddc16f66f6c437ca0f60d144bc593bbec8cc5
-  data.tar.gz: c6dd98694fd2a1d29ceebec6b58d63220f3992fe4dc63dae1c28ea27f8a353764f87a16a13c95e3e2111bca811c57fde45da9f008a93d8868112b4be608d46ee
+  metadata.gz: 7856d7f36820a8384faf564f166d39e0daca1c9d15457b6f6aae8ff56f4176a8b1302bfbc2cc5edcfedfcb0805cbe71029f5712e716a29dc4942a1e6453a3e5e
+  data.tar.gz: '08d1f4cfe357b22bad4c4fab4ddd4fa93069b13c65559d668fb704e2f7d8884fc8f081270e4dc43a5db60aab7147be36bfe7d26945f93c9ad6e9badbd0ad957e'

data/Cargo.lock CHANGED Viewed

@@ -681,7 +681,7 @@ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
 dependencies = [
  "magnus-macros",
  "rb-sys",
- "rb-sys-env",
+ "rb-sys-env 0.1.2",
  "seq-macro",
 ]
@@ -839,9 +839,11 @@ dependencies = [
  "jiff",
  "magnus",
  "mimalloc",
+ "num",
  "parquet 54.2.0",
  "rand",
  "rb-sys",
+ "rb-sys-env 0.2.2",
  "simdutf8",
  "tempfile",
  "thiserror",
@@ -997,6 +999,12 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
+[[package]]
+name = "rb-sys-env"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6"
 [[package]]
 name = "regex"
 version = "1.11.1"

data/README.md CHANGED Viewed

@@ -8,6 +8,78 @@ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apac
 This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
+### Metadata
+The `metadata` method provides detailed information about a Parquet file's structure and contents:
+```ruby
+require "parquet"
+# Get metadata from a file path
+metadata = Parquet.metadata("data.parquet")
+# Or from an IO object
+File.open("data.parquet", "rb") do |file|
+  metadata = Parquet.metadata(file)
+end
+# Example metadata output:
+# {
+#   "num_rows" => 3,
+#   "created_by" => "parquet-rs version 54.2.0",
+#   "key_value_metadata" => [
+#     {
+#       "key" => "ARROW:schema",
+#       "value" => "base64_encoded_schema"
+#     }
+#   ],
+#   "schema" => {
+#     "name" => "arrow_schema",
+#     "fields" => [
+#       {
+#         "name" => "id",
+#         "type" => "primitive",
+#         "physical_type" => "INT64",
+#         "repetition" => "OPTIONAL",
+#         "converted_type" => "NONE"
+#       },
+#       # ... other fields
+#     ]
+#   },
+#   "row_groups" => [
+#     {
+#       "num_columns" => 5,
+#       "num_rows" => 3,
+#       "total_byte_size" => 379,
+#       "columns" => [
+#         {
+#           "column_path" => "id",
+#           "num_values" => 3,
+#           "compression" => "UNCOMPRESSED",
+#           "total_compressed_size" => 91,
+#           "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
+#           "statistics" => {
+#             "min_is_exact" => true,
+#             "max_is_exact" => true
+#           }
+#         },
+#         # ... other columns
+#       ]
+#     }
+#   ]
+# }
+```
+The metadata includes:
+- Total number of rows
+- File creation information
+- Key-value metadata (including Arrow schema)
+- Detailed schema information for each column
+- Row group information including:
+  - Number of columns and rows
+  - Total byte size
+  - Column-level details (compression, encodings, statistics)
 ### Row-wise Iteration
 The `each_row` method provides sequential access to individual rows:
@@ -236,17 +308,169 @@ schema = Parquet::Schema.define do
     field :description, :string
   end
-  # Nested lists
+  # Nested lists (list of lists of strings)
   field :nested_lists, :list, item: :list do
-    field :item, :string  # For nested lists, inner item must be named 'item'
+    field :item, :string  # REQUIRED: Inner item field MUST be named 'item' for nested lists
   end
   # Map of lists
   field :map_of_lists, :map, key: :string, value: :list do
-    field :item, :int32  # For list items in maps, item must be named 'item'
+    field :item, :int32  # REQUIRED: List items in maps MUST be named 'item'
   end
 end
+### Nested Lists
+When working with nested lists (a list of lists), there are specific requirements:
+1. Using the Schema DSL:
+```ruby
+# A list of lists of strings
+field :nested_lists, :list, item: :list do
+  field :item, :string  # For nested lists, inner item MUST be named 'item'
+end
+```
+2. Using hash-based schema format:
+```ruby
+# A list of lists of integers
+{ "nested_numbers" => "list<list<int32>>" }
+```
+The data for nested lists is structured as an array of arrays:
+```ruby
+# Data for the nested_lists field
+[["a", "b"], ["c", "d", "e"], []]  # Last one is an empty inner list
+```
+### Decimal Data Type
+Parquet supports decimal numbers with configurable precision and scale, which is essential for financial applications where exact decimal representation is critical. The library seamlessly converts between Ruby's `BigDecimal` and Parquet's decimal type.
+#### Decimal Precision and Scale
+When working with decimal fields, you need to understand two key parameters:
+- **Precision**: The total number of significant digits (both before and after the decimal point)
+- **Scale**: The number of digits after the decimal point
+The rules for defining decimals are:
+```ruby
+# No precision/scale specified - uses maximum precision (38) with scale 0
+field :amount1, :decimal  # Equivalent to INTEGER with 38 digits
+# Only precision specified - scale defaults to 0
+field :amount2, :decimal, precision: 10  # 10 digits, no decimal places
+# Only scale specified - uses maximum precision (38)
+field :amount3, :decimal, scale: 2  # 38 digits with 2 decimal places
+# Both precision and scale specified
+field :amount4, :decimal, precision: 10, scale: 2  # 10 digits with 2 decimal places
+```
+#### Financial Data Example
+Here's a practical example for a financial application:
+```ruby
+require "parquet"
+require "bigdecimal"
+# Schema for financial transactions
+schema = Parquet::Schema.define do
+  field :transaction_id, :string, nullable: false
+  field :timestamp, :timestamp_millis, nullable: false
+  field :amount, :decimal, precision: 12, scale: 2  # Supports up to 10^10 with 2 decimal places
+  field :balance, :decimal, precision: 16, scale: 2  # Larger precision for running balances
+  field :currency, :string
+  field :exchange_rate, :decimal, precision: 10, scale: 6  # 6 decimal places for forex rates
+  field :fee, :decimal, precision: 8, scale: 2, nullable: true  # Optional fee
+  field :category, :string
+end
+# Sample financial data
+transactions = [
+  [
+    "T-12345",
+    Time.now,
+    BigDecimal("1256.99"),       # amount (directly using BigDecimal)
+    BigDecimal("10250.25"),      # balance
+    "USD",
+    BigDecimal("1.0"),           # exchange_rate
+    BigDecimal("2.50"),          # fee
+    "Groceries"
+  ],
+  [
+    "T-12346",
+    Time.now - 86400,            # yesterday
+    BigDecimal("-89.50"),        # negative amount for withdrawal
+    BigDecimal("10160.75"),      # updated balance
+    "USD",
+    BigDecimal("1.0"),           # exchange_rate
+    nil,                         # no fee
+    "Transportation"
+  ],
+  [
+    "T-12347",
+    Time.now - 172800,           # two days ago
+    BigDecimal("250.00"),        # amount
+    BigDecimal("10410.75"),      # balance
+    "EUR",                       # different currency
+    BigDecimal("1.05463"),       # exchange_rate
+    BigDecimal("1.75"),          # fee
+    "Entertainment"
+  ]
+]
+# Write financial data to Parquet file
+Parquet.write_rows(transactions.each, schema: schema, write_to: "financial_data.parquet")
+# Read back transactions
+Parquet.each_row("financial_data.parquet") do |transaction|
+  # Access decimal fields as BigDecimal objects
+  puts "Transaction: #{transaction['transaction_id']}"
+  puts "  Amount: #{transaction['currency']} #{transaction['amount']}"
+  puts "  Balance: $#{transaction['balance']}"
+  puts "  Fee: #{transaction['fee'] || 'No fee'}"
+  # You can perform precise decimal calculations
+  if transaction['currency'] != 'USD'
+    usd_amount = transaction['amount'] * transaction['exchange_rate']
+    puts "  USD Equivalent: $#{usd_amount.round(2)}"
+  end
+end
+```
+#### Decimal Type Storage Considerations
+Parquet optimizes storage based on the precision:
+- For precision ≤ 9: Uses 4-byte INT32
+- For precision ≤ 18: Uses 8-byte INT64
+- For precision ≤ 38: Uses 16-byte BYTE_ARRAY
+Choose appropriate precision and scale for your data to optimize storage while ensuring adequate range:
+```ruby
+# Banking examples
+field :account_balance, :decimal, precision: 16, scale: 2   # Up to 14 digits before decimal point
+field :interest_rate, :decimal, precision: 8, scale: 6      # Rate with 6 decimal places (e.g., 0.015625)
+# E-commerce examples
+field :product_price, :decimal, precision: 10, scale: 2     # Product price
+field :shipping_weight, :decimal, precision: 6, scale: 3    # Weight in kg with 3 decimal places
+# Analytics examples
+field :conversion_rate, :decimal, precision: 5, scale: 4    # Rate like 0.0123
+field :daily_revenue, :decimal, precision: 14, scale: 2     # Daily revenue with 2 decimal places
+```
+### Sample Data with Nested Structures
+Here's an example showing how to use the schema defined earlier with sample data:
+```ruby
 # Sample data with nested structures
 data = [
   [
@@ -271,7 +495,7 @@ data = [
       "feature1" => { "count" => 5, "description" => "Main feature" },
       "feature2" => { "count" => 3, "description" => "Secondary feature" }
     },
-    [["a", "b"], ["c", "d", "e"]],  # nested_lists
+    [["a", "b"], ["c", "d", "e"]],  # nested_lists (a list of lists of strings)
     {                                # map_of_lists
       "group1" => [1, 2, 3],
       "group2" => [4, 5, 6]

data/ext/parquet/Cargo.toml CHANGED Viewed

@@ -6,6 +6,9 @@ edition = "2021"
 [lib]
 crate-type = ["cdylib"]
+[build-dependencies]
+rb-sys-env = "^0.2"
 [dependencies]
 ahash = "0.8"
 arrow-array = "54.0.0"
@@ -21,6 +24,7 @@ rb-sys = "^0.9"
 simdutf8 = "0.1.5"
 tempfile = "^3.15"
 thiserror = "2.0"
+num = "0.4.3"
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

data/ext/parquet/build.rs ADDED Viewed

@@ -0,0 +1,5 @@
+pub fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let _rb_env = rb_sys_env::activate()?;
+    Ok(())
+}

data/ext/parquet/src/lib.rs CHANGED Viewed

@@ -20,6 +20,7 @@ use writer::write_rows;
 #[magnus::init]
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("Parquet")?;
+    module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
     module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;

data/ext/parquet/src/reader/common.rs CHANGED Viewed

@@ -5,6 +5,7 @@ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchR
 use parquet::arrow::ProjectionMask;
 use std::collections::HashMap;
 use std::fs::File;
+use std::rc::Rc;
 use std::sync::Arc;
 use magnus::value::ReprValue;
@@ -21,7 +22,7 @@ use crate::ColumnRecord;
 /// returning either a File or a ThreadSafeRubyReader that can be used with
 /// parquet readers.
 pub fn open_parquet_source(
-    ruby: Arc<Ruby>,
+    ruby: Rc<Ruby>,
     to_read: Value,
 ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {
@@ -58,8 +59,8 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
     columns: &Option<Vec<String>>,
     batch_size: Option<usize>,
 ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-    let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
-        .map_err(|e| ParquetGemError::Parquet(e))?;
+    let mut builder =
+        ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;
     let schema = builder.schema().clone();
     let num_rows = builder.metadata().file_metadata().num_rows();
@@ -78,7 +79,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
         builder = builder.with_batch_size(batch_size);
     }
-    let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
+    let reader = builder.build().map_err(ParquetGemError::Parquet)?;
     Ok((reader, schema, num_rows))
 }
@@ -98,12 +99,12 @@ pub fn handle_empty_file(
             .map(|field| field.name().to_string())
             .collect();
         let interned_headers =
-            StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
+            StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
         for field in interned_headers.iter() {
             map.insert(*field, vec![]);
         }
         let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+        let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
         return Ok(true);
     }
     Ok(false)

data/ext/parquet/src/reader/mod.rs CHANGED Viewed

@@ -1,6 +1,210 @@
 mod common;
 mod parquet_column_reader;
 mod parquet_row_reader;
+use std::{fs::File, rc::Rc};
+use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
 pub use parquet_column_reader::parse_parquet_columns;
 pub use parquet_row_reader::parse_parquet_rows;
+use crate::{
+    ruby_reader::{RubyReader, ThreadSafeRubyReader},
+    types::{ParquetGemError, TryIntoValue},
+};
+struct RubyParquetMetaData(ParquetMetaData);
+impl TryIntoValue for RubyParquetMetaData {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
+        let metadata = self.0;
+        let file_metadata = metadata.file_metadata();
+        let row_groups = metadata.row_groups();
+        // Construct a hash with the metadata
+        let hash = handle.hash_new();
+        hash.aset("num_rows", file_metadata.num_rows())?;
+        hash.aset("created_by", file_metadata.created_by())?;
+        // Convert key_value_metadata to a Ruby array if it exists
+        if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
+            let kv_array = handle.ary_new();
+            for kv in key_value_metadata {
+                let kv_hash = handle.hash_new();
+                kv_hash.aset("key", kv.key.clone())?;
+                kv_hash.aset("value", kv.value.clone())?;
+                kv_array.push(kv_hash)?;
+            }
+            hash.aset("key_value_metadata", kv_array)?;
+        } else {
+            hash.aset("key_value_metadata", None::<Value>)?;
+        }
+        // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
+        let schema_hash = handle.hash_new();
+        let schema = file_metadata.schema();
+        schema_hash.aset("name", schema.name())?;
+        // Add schema fields information
+        let fields_array = handle.ary_new();
+        for field in schema.get_fields() {
+            let field_hash = handle.hash_new();
+            field_hash.aset("name", field.name())?;
+            // Handle different field types
+            match field.as_ref() {
+                parquet::schema::types::Type::PrimitiveType {
+                    physical_type,
+                    type_length,
+                    scale,
+                    precision,
+                    ..
+                } => {
+                    field_hash.aset("type", "primitive")?;
+                    field_hash.aset("physical_type", format!("{:?}", physical_type))?;
+                    field_hash.aset("type_length", *type_length)?;
+                    field_hash.aset("scale", *scale)?;
+                    field_hash.aset("precision", *precision)?;
+                }
+                parquet::schema::types::Type::GroupType { .. } => {
+                    field_hash.aset("type", "group")?;
+                }
+            }
+            // Add basic info
+            let basic_info = field.get_basic_info();
+            field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
+            field_hash.aset(
+                "converted_type",
+                format!("{:?}", basic_info.converted_type()),
+            )?;
+            if let Some(logical_type) = basic_info.logical_type() {
+                field_hash.aset("logical_type", format!("{:?}", logical_type))?;
+            }
+            fields_array.push(field_hash)?;
+        }
+        schema_hash.aset("fields", fields_array)?;
+        hash.aset("schema", schema_hash)?;
+        // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
+        let row_groups_array = handle.ary_new();
+        for row_group in row_groups.iter() {
+            let rg_hash = handle.hash_new();
+            rg_hash.aset("num_columns", row_group.num_columns())?;
+            rg_hash.aset("num_rows", row_group.num_rows())?;
+            rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
+            rg_hash.aset("file_offset", row_group.file_offset())?;
+            rg_hash.aset("ordinal", row_group.ordinal())?;
+            rg_hash.aset("compressed_size", row_group.compressed_size())?;
+            // Add column chunks metadata
+            let columns_array = handle.ary_new();
+            for col_idx in 0..row_group.num_columns() {
+                let column = row_group.column(col_idx);
+                let col_hash = handle.hash_new();
+                col_hash.aset("column_path", column.column_path().string())?;
+                col_hash.aset("file_path", column.file_path())?;
+                col_hash.aset("file_offset", column.file_offset())?;
+                col_hash.aset("num_values", column.num_values())?;
+                col_hash.aset("compression", format!("{:?}", column.compression()))?;
+                col_hash.aset("total_compressed_size", column.compressed_size())?;
+                col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
+                col_hash.aset("data_page_offset", column.data_page_offset())?;
+                if let Some(offset) = column.dictionary_page_offset() {
+                    col_hash.aset("dictionary_page_offset", offset)?;
+                }
+                if let Some(offset) = column.bloom_filter_offset() {
+                    col_hash.aset("bloom_filter_offset", offset)?;
+                }
+                if let Some(length) = column.bloom_filter_length() {
+                    col_hash.aset("bloom_filter_length", length)?;
+                }
+                if let Some(offset) = column.offset_index_offset() {
+                    col_hash.aset("offset_index_offset", offset)?;
+                }
+                if let Some(length) = column.offset_index_length() {
+                    col_hash.aset("offset_index_length", length)?;
+                }
+                if let Some(offset) = column.column_index_offset() {
+                    col_hash.aset("column_index_offset", offset)?;
+                }
+                if let Some(length) = column.column_index_length() {
+                    col_hash.aset("column_index_length", length)?;
+                }
+                // Add encodings
+                let encodings_array = handle.ary_new();
+                for encoding in column.encodings() {
+                    encodings_array.push(format!("{:?}", encoding))?;
+                }
+                col_hash.aset("encodings", encodings_array)?;
+                // Add statistics if available
+                if let Some(stats) = column.statistics() {
+                    let stats_hash = handle.hash_new();
+                    stats_hash.aset("min_is_exact", stats.min_is_exact())?;
+                    stats_hash.aset("max_is_exact", stats.max_is_exact())?;
+                    col_hash.aset("statistics", stats_hash)?;
+                }
+                // Add page encoding stats if available
+                if let Some(page_encoding_stats) = column.page_encoding_stats() {
+                    let page_stats_array = handle.ary_new();
+                    for stat in page_encoding_stats {
+                        let stat_hash = handle.hash_new();
+                        stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
+                        stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
+                        stat_hash.aset("count", stat.count)?;
+                        page_stats_array.push(stat_hash)?;
+                    }
+                    col_hash.aset("page_encoding_stats", page_stats_array)?;
+                }
+                columns_array.push(col_hash)?;
+            }
+            rg_hash.aset("columns", columns_array)?;
+            row_groups_array.push(rg_hash)?;
+        }
+        hash.aset("row_groups", row_groups_array)?;
+        Ok(handle.into_value(hash))
+    }
+}
+pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+    if args.len() != 1 {
+        return Err(MagnusError::new(
+            magnus::exception::arg_error(),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
+        ));
+    }
+    let ruby = Rc::new(ruby);
+    let arg = args[0];
+    let mut reader = ParquetMetaDataReader::new();
+    if arg.is_kind_of(ruby.class_string()) {
+        let path = arg.to_r_string()?.to_string()?;
+        let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    } else {
+        let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    }
+    let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
+    Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
+}

data/ext/parquet/src/reader/parquet_column_reader.rs CHANGED Viewed

@@ -10,26 +10,25 @@ use either::Either;
 use magnus::IntoValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 use std::collections::HashMap;
-use std::sync::{Arc, OnceLock};
+use std::rc::Rc;
+use std::sync::OnceLock;
 use super::common::{
     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 #[inline]
-pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    Ok(
-        parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
-            let z: MagnusError = e.into();
-            z
-        })?,
-    )
+    parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })
 }
 #[inline]
-fn parse_parquet_columns_impl<'a>(
-    ruby: Arc<Ruby>,
+fn parse_parquet_columns_impl(
+    ruby: Rc<Ruby>,
     rb_self: Value,
     args: &[Value],
 ) -> Result<Value, ParquetGemError> {
@@ -76,13 +75,13 @@ fn parse_parquet_columns_impl<'a>(
         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
     };
-    // Handle empty file case
-    if handle_empty_file(&ruby, &schema, num_rows)? {
-        return Ok(ruby.qnil().into_value_with(&ruby));
-    }
     match result_type {
         ParserResultType::Hash => {
+            // For hash return type, we need to return a hash with column names pointing at empty arrays
+            if handle_empty_file(&ruby, &schema, num_rows)? {
+                return Ok(ruby.qnil().into_value_with(&ruby));
+            }
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
             let iter = batch_reader.map(move |batch| {
@@ -112,8 +111,8 @@ fn parse_parquet_columns_impl<'a>(
                         .try_for_each(|(i, column)| {
                             let header = local_headers[i];
                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: &*column,
-                                strict: strict,
+                                array: column,
+                                strict,
                             })?;
                             map.insert(header, values.into_inner());
                             Ok::<_, ParquetGemError>(())
@@ -133,11 +132,11 @@ fn parse_parquet_columns_impl<'a>(
                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                     let vec = batch
                         .columns()
-                        .into_iter()
+                        .iter()
                         .map(|column| {
                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: &*column,
-                                strict: strict,
+                                array: column,
+                                strict,
                             })?;
                             Ok::<_, ParquetGemError>(values.into_inner())
                         })