parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/Cargo.toml CHANGED
@@ -1,3 +1,3 @@
1
1
  [workspace]
2
- members = ["./ext/parquet"]
2
+ members = ["./ext/parquet", "./ext/parquet-core", "./ext/parquet-ruby-adapter"]
3
3
  resolver = "2"
data/Gemfile CHANGED
@@ -11,6 +11,7 @@ group :development do
11
11
  # gem "benchmark-ips", "~> 2.12"
12
12
  # gem "polars-df"
13
13
  # gem "duckdb"
14
+ gem "benchmark-memory"
14
15
  end
15
16
 
16
17
  group :test do
data/README.md CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
4
4
 
5
- This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) rust crate.
5
+ This project is a Ruby library wrapping the [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) rust crate.
6
6
 
7
7
  ## Usage
8
8
 
9
- This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with subset of columns.
9
+ This library provides high-level bindings to `parquet` with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with subset of columns.
10
10
 
11
11
  ### Metadata
12
12
 
@@ -264,9 +264,100 @@ The following data types are supported in the schema:
264
264
  - `binary`
265
265
  - `boolean`
266
266
  - `date32`
267
- - `timestamp_millis`, `timestamp_micros`
267
+ - `timestamp_millis`, `timestamp_micros`, `timestamp_second`, `timestamp_nanos`
268
268
  - `time_millis`, `time_micros`
269
269
 
270
+ ### Timestamp Timezone Handling
271
+
272
+ **CRITICAL PARQUET SPECIFICATION LIMITATION**: The Apache Parquet format specification only supports two types of timestamps:
273
+ 1. **UTC-normalized timestamps** (when ANY timezone is specified) - `isAdjustedToUTC = true`
274
+ 2. **Local/unzoned timestamps** (when NO timezone is specified) - `isAdjustedToUTC = false`
275
+
276
+ This means that specific timezone offsets like "+09:00" or "America/New_York" CANNOT be preserved in Parquet files. This is not a limitation of this Ruby library, but of the Parquet format itself.
277
+
278
+ **When Writing:**
279
+ - If the schema specifies ANY timezone (whether it's "UTC", "+09:00", "America/New_York", etc.):
280
+ - Time values are converted to UTC before storing
281
+ - The file metadata sets `isAdjustedToUTC = true`
282
+ - The original timezone information is LOST
283
+ - If the schema doesn't specify a timezone:
284
+ - Timestamps are stored as local/unzoned time (no conversion)
285
+ - The file metadata sets `isAdjustedToUTC = false`
286
+ - These represent "wall clock" times without timezone context
287
+
288
+ **When Reading:**
289
+ - If the Parquet file has `isAdjustedToUTC = true` (ANY timezone was specified during writing):
290
+ - Time objects are returned in UTC
291
+ - The original timezone (e.g., "+09:00") is NOT recoverable
292
+ - If the file has `isAdjustedToUTC = false` (NO timezone was specified):
293
+ - Time objects are returned as local time in your system's timezone
294
+ - These are "wall clock" times without timezone information
295
+
296
+ ```ruby
297
+ # Preferred approach: use has_timezone to be explicit about UTC vs local storage
298
+ schema = Parquet::Schema.define do
299
+ field :timestamp_utc, :timestamp_millis, has_timezone: true # Stored as UTC (default)
300
+ field :timestamp_local, :timestamp_millis, has_timezone: false # Stored as local/unzoned
301
+ field :timestamp_default, :timestamp_millis # Default: UTC storage
302
+ end
303
+
304
+ # Legacy approach still supported (any timezone value means UTC storage)
305
+ schema_legacy = Parquet::Schema.define do
306
+ field :timestamp_utc, :timestamp_millis, timezone: "UTC" # Stored as UTC
307
+ field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00" # Also stored as UTC!
308
+ field :timestamp_local, :timestamp_millis # No timezone - local
309
+ end
310
+
311
+ # Time values will be converted based on schema
312
+ rows = [
313
+ [
314
+ Time.new(2024, 1, 1, 12, 0, 0, "+03:00"), # Converted to UTC if has_timezone: true
315
+ Time.new(2024, 1, 1, 12, 0, 0, "-05:00"), # Kept as local if has_timezone: false
316
+ Time.new(2024, 1, 1, 12, 0, 0) # Kept as local (default)
317
+ ]
318
+ ]
319
+
320
+ Parquet.write_rows(rows.each, schema: schema, write_to: "timestamps.parquet")
321
+
322
+ # Reading back - timezone presence determines UTC vs local
323
+ Parquet.each_row("timestamps.parquet") do |row|
324
+ # row["timestamp_utc"] => Time object in UTC
325
+ # row["timestamp_local"] => Time object in local timezone
326
+ # row["timestamp_default"] => Time object in local timezone
327
+ end
328
+
329
+ # If you need to preserve specific timezone information, store it separately:
330
+ schema_with_tz = Parquet::Schema.define do
331
+ field :timestamp, :timestamp_millis, has_timezone: true # Store as UTC
332
+ field :original_timezone, :string # Store timezone as string
333
+ end
334
+ ```
335
+
336
+ ## Architecture
337
+
338
+ This library uses a modular, trait-based architecture that separates language-agnostic Parquet operations from Ruby-specific bindings:
339
+
340
+ - **parquet-core**: Language-agnostic core functionality for Parquet file operations
341
+ - Pure Rust implementation without Ruby dependencies
342
+ - Traits for customizable I/O operations (`ChunkReader`) and value conversion (`ValueConverter`)
343
+ - Efficient Arrow-based reader and writer implementations
344
+
345
+ - **parquet-ruby-adapter**: Ruby-specific adapter layer
346
+ - Implements core traits for Ruby integration
347
+ - Handles Ruby value conversion through the `ValueConverter` trait
348
+ - Manages Ruby I/O objects through the `ChunkReader` trait
349
+
350
+ - **parquet gem**: Ruby FFI bindings
351
+ - Provides high-level Ruby API
352
+ - Manages memory safety between Ruby and Rust
353
+ - Supports both file-based and IO-based operations
354
+
355
+ This architecture enables:
356
+ - Clear separation of concerns between core functionality and language bindings
357
+ - Easy testing of core logic without Ruby dependencies
358
+ - Potential reuse of core functionality for other language bindings
359
+ - Type-safe interfaces through Rust's trait system
360
+
270
361
  ### Schema DSL for Complex Data Types
271
362
 
272
363
  In addition to the hash-based schema definition shown above, this library provides a more expressive DSL for defining complex schemas with nested structures:
@@ -11,16 +11,17 @@ rb-sys-env = "^0.2"
11
11
 
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
- arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
15
- arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
16
- arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["lz4"] }
17
- arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
14
+ arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
15
+ arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
16
+ arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["lz4"] }
17
+ arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
18
18
  bytes = "^1.9"
19
19
  either = "1.9"
20
20
  itertools = "^0.14"
21
21
  jiff = "0.2"
22
22
  magnus = { version = "0.7", features = ["rb-sys"] }
23
- parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["json"] }
23
+ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["json"] }
24
+ parquet-ruby-adapter = { path = "../parquet-ruby-adapter" }
24
25
  rand = "0.9"
25
26
  rb-sys = "^0.9"
26
27
  simdutf8 = "0.1.5"
@@ -28,6 +29,8 @@ tempfile = "^3.15"
28
29
  thiserror = "2.0"
29
30
  num = "0.4.3"
30
31
  uuid = "1.16.0"
32
+ ordered-float = "5.0.0"
33
+
31
34
 
32
35
  [target.'cfg(target_os = "linux")'.dependencies]
33
36
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -0,0 +1,156 @@
1
+ use magnus::scan_args::{get_kwargs, scan_args};
2
+ use magnus::value::ReprValue;
3
+ use magnus::{Error as MagnusError, Ruby, Value};
4
+ use parquet_ruby_adapter::{
5
+ logger::RubyLogger, types::ParserResultType, utils::parse_parquet_write_args,
6
+ };
7
+ pub fn each_row(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
8
+ let ruby = Ruby::get().map_err(|_| {
9
+ MagnusError::new(
10
+ magnus::exception::runtime_error(),
11
+ "Failed to get Ruby runtime",
12
+ )
13
+ })?;
14
+
15
+ // Parse arguments
16
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
17
+ let (to_read,) = parsed_args.required;
18
+
19
+ // Parse keyword arguments
20
+ let kwargs = get_kwargs::<
21
+ _,
22
+ (),
23
+ (
24
+ Option<Option<Value>>, // result_type
25
+ Option<Option<Vec<String>>>, // columns
26
+ Option<Option<bool>>, // strict
27
+ Option<Option<Value>>, // logger
28
+ ),
29
+ (),
30
+ >(
31
+ parsed_args.keywords,
32
+ &[],
33
+ &["result_type", "columns", "strict", "logger"],
34
+ )?;
35
+
36
+ let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
37
+ rt_value
38
+ .to_r_string()?
39
+ .to_string()?
40
+ .parse()
41
+ .map_err(|e| MagnusError::new(ruby.exception_arg_error(), e))?
42
+ } else {
43
+ ParserResultType::Hash
44
+ };
45
+ let columns = kwargs.optional.1.flatten();
46
+ let strict = kwargs.optional.2.flatten().unwrap_or(true);
47
+ let logger = RubyLogger::new(kwargs.optional.3.flatten())?;
48
+
49
+ // Delegate to parquet_ruby_adapter
50
+ parquet_ruby_adapter::reader::each_row(
51
+ &ruby,
52
+ rb_self,
53
+ to_read,
54
+ result_type,
55
+ columns,
56
+ strict,
57
+ logger,
58
+ )
59
+ }
60
+
61
+ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
62
+ let ruby = Ruby::get().map_err(|_| {
63
+ MagnusError::new(
64
+ magnus::exception::runtime_error(),
65
+ "Failed to get Ruby runtime",
66
+ )
67
+ })?;
68
+
69
+ // Parse arguments
70
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
71
+ let (to_read,) = parsed_args.required;
72
+
73
+ // Parse keyword arguments
74
+ let kwargs = get_kwargs::<
75
+ _,
76
+ (),
77
+ (
78
+ Option<Option<Value>>, // result_type
79
+ Option<Option<Vec<String>>>, // columns
80
+ Option<Option<usize>>, // batch_size
81
+ Option<Option<bool>>, // strict
82
+ Option<Option<Value>>, // logger
83
+ ),
84
+ (),
85
+ >(
86
+ parsed_args.keywords,
87
+ &[],
88
+ &["result_type", "columns", "batch_size", "strict", "logger"],
89
+ )?;
90
+
91
+ let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
92
+ rt_value
93
+ .to_r_string()?
94
+ .to_string()?
95
+ .parse()
96
+ .map_err(|e| MagnusError::new(ruby.exception_arg_error(), e))?
97
+ } else {
98
+ ParserResultType::Hash
99
+ };
100
+ let columns = kwargs.optional.1.flatten();
101
+ let batch_size = if let Some(bs) = kwargs.optional.2.flatten() {
102
+ if bs == 0 {
103
+ return Err(MagnusError::new(
104
+ ruby.exception_arg_error(),
105
+ "batch_size must be greater than 0",
106
+ ));
107
+ }
108
+ Some(bs)
109
+ } else {
110
+ None
111
+ };
112
+ let strict = kwargs.optional.3.flatten().unwrap_or(true);
113
+ let logger = RubyLogger::new(kwargs.optional.4.flatten())?;
114
+
115
+ // Delegate to parquet_ruby_adapter
116
+ parquet_ruby_adapter::reader::each_column(
117
+ &ruby,
118
+ rb_self,
119
+ to_read,
120
+ result_type,
121
+ columns,
122
+ batch_size,
123
+ strict,
124
+ logger,
125
+ )
126
+ }
127
+
128
+ pub fn write_rows(args: &[Value]) -> Result<Value, MagnusError> {
129
+ let ruby = Ruby::get().map_err(|_| {
130
+ MagnusError::new(
131
+ magnus::exception::runtime_error(),
132
+ "Failed to get Ruby runtime",
133
+ )
134
+ })?;
135
+
136
+ // Parse arguments using the new parser
137
+ let write_args = parse_parquet_write_args(&ruby, args)?;
138
+
139
+ // Delegate to parquet_ruby_adapter
140
+ parquet_ruby_adapter::writer::write_rows(&ruby, write_args)
141
+ }
142
+
143
+ pub fn write_columns(args: &[Value]) -> Result<Value, MagnusError> {
144
+ let ruby = Ruby::get().map_err(|_| {
145
+ MagnusError::new(
146
+ magnus::exception::runtime_error(),
147
+ "Failed to get Ruby runtime",
148
+ )
149
+ })?;
150
+
151
+ // Parse arguments using the new parser
152
+ let write_args = parse_parquet_write_args(&ruby, args)?;
153
+
154
+ // Delegate to parquet_ruby_adapter
155
+ parquet_ruby_adapter::writer::write_columns(&ruby, write_args)
156
+ }
@@ -1,32 +1,24 @@
1
+ mod adapter_ffi;
1
2
  mod allocator;
2
- mod enumerator;
3
- pub mod header_cache;
4
- mod logger;
5
- mod reader;
6
- mod ruby_reader;
7
- mod types;
8
- mod utils;
9
- mod writer;
10
3
 
11
- use crate::enumerator::*;
12
- use crate::reader::*;
13
- use crate::types::*;
4
+ use magnus::{function, method, Error, Ruby};
14
5
 
15
- use magnus::{Error, Ruby};
16
- use writer::write_columns;
17
- use writer::write_rows;
6
+ use crate::adapter_ffi::{each_column, each_row, write_columns, write_rows};
7
+ use parquet_ruby_adapter::metadata::parse_metadata;
18
8
 
19
9
  /// Initializes the Ruby extension and defines methods.
20
10
  #[magnus::init]
21
11
  fn init(ruby: &Ruby) -> Result<(), Error> {
22
- // Require 'time' for Time.parse method
23
12
  ruby.require("time")?;
24
-
13
+ ruby.require("bigdecimal")?;
14
+
25
15
  let module = ruby.define_module("Parquet")?;
26
- module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
27
- module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
28
- module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
29
- module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
30
- module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
16
+
17
+ module.define_module_function("metadata", function!(parse_metadata, 1))?;
18
+ module.define_module_function("each_row", method!(each_row, -1))?;
19
+ module.define_module_function("each_column", method!(each_column, -1))?;
20
+ module.define_module_function("write_rows", function!(write_rows, -1))?;
21
+ module.define_module_function("write_columns", function!(write_columns, -1))?;
22
+
31
23
  Ok(())
32
24
  }
@@ -0,0 +1,23 @@
1
+ [package]
2
+ name = "parquet-core"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
8
+ arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
9
+ arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
10
+ arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
11
+ bytes = "1.5"
12
+ indexmap = "2.2"
13
+ jiff = "0.2"
14
+ num = "0.4.3"
15
+ ordered-float = "5.0.0"
16
+ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["arrow", "zstd", "lz4", "snap"] }
17
+ rand = "0.9.1"
18
+ serde = { version = "1.0", features = ["derive"] }
19
+ thiserror = "2.0"
20
+
21
+ [dev-dependencies]
22
+ uuid = { version = "1.0", features = ["v4"] }
23
+ tempfile = "3.8"