parquet 0.2.12-arm64-darwin

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
data/Cargo.toml ADDED
@@ -0,0 +1,3 @@
+ [workspace]
+ members = ["./ext/parquet"]
+ resolver = "2"
data/Gemfile ADDED
@@ -0,0 +1,17 @@
+ source "https://rubygems.org"
+
+ gem "rb_sys", "~> 0.9.56"
+ gem "rake"
+
+ # Use local version of parquet
+ gemspec
+
+ group :development do
+   gem "benchmark-ips", "~> 2.12"
+   gem "polars-df"
+   gem "duckdb"
+ end
+
+ group :test do
+   gem "minitest", "~> 5.0"
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Nathan Jaremko
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,197 @@
+ # parquet-ruby
+
+ [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
+
+ ## Usage
+
+ This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
+
+ ### Row-wise Iteration
+
+ The `each_row` method provides sequential access to individual rows:
+
+ ```ruby
+ require "parquet"
+
+ # Basic usage with default hash output
+ Parquet.each_row("data.parquet") do |row|
+   puts row.inspect # {"id"=>1, "name"=>"name_1"}
+ end
+
+ # Array output for more efficient memory usage
+ Parquet.each_row("data.parquet", result_type: :array) do |row|
+   puts row.inspect # [1, "name_1"]
+ end
+
+ # Select specific columns to reduce I/O
+ Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
+   puts row.inspect
+ end
+
+ # Reading from IO objects
+ File.open("data.parquet", "rb") do |file|
+   Parquet.each_row(file) do |row|
+     puts row.inspect
+   end
+ end
+ ```
+
+ ### Column-wise Iteration
+
+ The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
+
+ ```ruby
+ require "parquet"
+
+ # Process columns in batches of 1024 rows
+ Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
+   # With result_type: :hash (default)
+   puts batch.inspect
+   # {
+   #   "id" => [1, 2, ..., 1024],
+   #   "name" => ["name_1", "name_2", ..., "name_1024"]
+   # }
+ end
+
+ # Array output with specific columns
+ Parquet.each_column("data.parquet",
+                     columns: ["id", "name"],
+                     result_type: :array,
+                     batch_size: 1024) do |batch|
+   puts batch.inspect
+   # [
+   #   [1, 2, ..., 1024],        # id column
+   #   ["name_1", "name_2", ...] # name column
+   # ]
+ end
+ ```
+
+ ### Arguments
+
+ Both methods accept these common arguments:
+
+ - `input`: Path string or IO-like object containing Parquet data
+ - `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
+ - `columns`: Optional array of column names to read (improves performance)
+
+ Additional arguments for `each_column`:
+
+ - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
+
+ When no block is given, both methods return an Enumerator.
+
+ ### Writing Row-wise Data
+
+ The `write_rows` method allows you to write data row by row:
+
+ ```ruby
+ require "parquet"
+
+ # Define the schema for your data
+ schema = [
+   { "id" => "int64" },
+   { "name" => "string" },
+   { "score" => "double" }
+ ]
+
+ # Create an enumerator that yields arrays of row values
+ rows = [
+   [1, "Alice", 95.5],
+   [2, "Bob", 82.3],
+   [3, "Charlie", 88.7]
+ ].each
+
+ # Write to a file
+ Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
+
+ # Write to an IO object
+ File.open("data.parquet", "wb") do |file|
+   Parquet.write_rows(rows, schema: schema, write_to: file)
+ end
+
+ # Optionally specify batch size (default is 1000)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    batch_size: 500
+ )
+
+ # Optionally specify memory threshold for flushing (default is 64MB)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    flush_threshold: 32 * 1024 * 1024 # 32MB
+ )
+
+ # Optionally specify sample size for row size estimation (default is 100)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    sample_size: 200 # Sample 200 rows for size estimation
+ )
+ ```
+
+ ### Writing Column-wise Data
+
+ The `write_columns` method provides a more efficient way to write data in column-oriented batches:
+
+ ```ruby
+ require "parquet"
+
+ # Define the schema
+ schema = [
+   { "id" => "int64" },
+   { "name" => "string" },
+   { "score" => "double" }
+ ]
+
+ # Create batches of column data
+ batches = [
+   # First batch
+   [
+     [1, 2],           # id column
+     ["Alice", "Bob"], # name column
+     [95.5, 82.3]      # score column
+   ],
+   # Second batch
+   [
+     [3],         # id column
+     ["Charlie"], # name column
+     [88.7]       # score column
+   ]
+ ]
+
+ # Create an enumerator from the batches
+ columns = batches.each
+
+ # Write to a parquet file with default ZSTD compression
+ Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
+
+ # Write to a parquet file with specific compression and memory threshold
+ Parquet.write_columns(columns,
+                       schema: schema,
+                       write_to: "data.parquet",
+                       compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+                       flush_threshold: 32 * 1024 * 1024 # 32MB
+ )
+
+ # Write to an IO object
+ File.open("data.parquet", "wb") do |file|
+   Parquet.write_columns(columns, schema: schema, write_to: file)
+ end
+ ```
+
+ The following data types are supported in the schema:
+
+ - `int8`, `int16`, `int32`, `int64`
+ - `uint8`, `uint16`, `uint32`, `uint64`
+ - `float`, `double`
+ - `string`
+ - `binary`
+ - `boolean`
+ - `date32`
+ - `timestamp_millis`, `timestamp_micros`
+
+ Note: List and Map types are currently not supported.
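To connect the reading and writing APIs documented in the README above, here is a brief sketch (not part of the packaged files) that writes a file using some of the listed types and then reads it back without a block to obtain an Enumerator. Whether Ruby `Date` and `Time` values are accepted for `date32` and `timestamp_millis` columns is an assumption here, not something the README states.

```ruby
require "parquet"
require "date"

# Schema exercising a few of the documented types.
schema = [
  { "id" => "int64" },
  { "flag" => "boolean" },
  { "day" => "date32" },         # assumption: Ruby Date values are coerced
  { "ts" => "timestamp_millis" } # assumption: Ruby Time values are coerced
]

rows = [
  [1, true, Date.new(2024, 1, 1), Time.now],
  [2, false, Date.new(2024, 1, 2), Time.now]
].each

Parquet.write_rows(rows, schema: schema, write_to: "typed.parquet")

# Per the Arguments section, each_row returns an Enumerator when no block is given.
enum = Parquet.each_row("typed.parquet")
puts enum.first(2).inspect
```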
data/Rakefile ADDED
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+
+ require "rake/testtask"
+ require "rb_sys/extensiontask"
+
+ task default: :test
+
+ GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+ RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+   ext.lib_dir = "lib/parquet"
+   ext.ext_dir = "ext/parquet"
+ end
+
+ Rake::TestTask.new do |t|
+   t.deps << :compile
+   t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+   t.libs << "lib"
+   t.libs << "test"
+ end
+
+ task :release do
+   sh "bundle exec rake test"
+   sh "mkdir -p pkg"
+   sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+   sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+ end
@@ -0,0 +1,28 @@
+ [package]
+ name = "parquet"
+ version = "0.1.0"
+ edition = "2021"
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ ahash = "0.8"
+ arrow-array = "54.0.0"
+ arrow-schema = "54.0.0"
+ bytes = "^1.9"
+ itertools = "^0.14"
+ jiff = "0.1.19"
+ magnus = { version = "0.7", features = ["rb-sys"] }
+ parquet = { version = "^54.0", features = ["json"] }
+ rand = "0.9"
+ rb-sys = "^0.9"
+ thiserror = "2.0"
+ tempfile = "^3.15"
+ simdutf8 = "0.1.5"
+
+ [target.'cfg(target_os = "linux")'.dependencies]
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
+
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,4 @@
+ require "mkmf"
+ require "rb_sys/mkmf"
+
+ create_rust_makefile("parquet/parquet")
@@ -0,0 +1,13 @@
+ #[cfg(target_os = "linux")]
+ use jemallocator::Jemalloc;
+
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ use mimalloc::MiMalloc;
+
+ #[global_allocator]
+ #[cfg(target_os = "linux")]
+ static ALLOC: Jemalloc = Jemalloc;
+
+ #[global_allocator]
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ static ALLOC: MiMalloc = MiMalloc;
@@ -0,0 +1,52 @@
+ use crate::ParserResultType;
+ use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
+
+ pub struct RowEnumeratorArgs {
+     pub rb_self: Value,
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+ }
+
+ /// Creates an enumerator for lazy Parquet row parsing
+ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
+     let kwargs = RHash::new();
+     kwargs.aset(
+         Symbol::new("result_type"),
+         Symbol::new(args.result_type.to_string()),
+     )?;
+     if let Some(columns) = args.columns {
+         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
+     }
+     Ok(args
+         .rb_self
+         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
+ }
+
+ pub struct ColumnEnumeratorArgs {
+     pub rb_self: Value,
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+     pub batch_size: Option<usize>,
+ }
+
+ #[inline]
+ pub fn create_column_enumerator(
+     args: ColumnEnumeratorArgs,
+ ) -> Result<magnus::Enumerator, MagnusError> {
+     let kwargs = RHash::new();
+     kwargs.aset(
+         Symbol::new("result_type"),
+         Symbol::new(args.result_type.to_string()),
+     )?;
+     if let Some(columns) = args.columns {
+         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
+     }
+     if let Some(batch_size) = args.batch_size {
+         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
+     }
+     Ok(args
+         .rb_self
+         .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
+ }
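For orientation, the two functions above pack the caller's options back into a kwargs hash and call `enumeratorize`, so a block-less call to `each_row` or `each_column` yields an Enumerator that re-invokes the method with the same options on demand. A hedged Ruby-side illustration of that behavior:

```ruby
require "parquet"

# The Enumerator re-enters each_row lazily with the saved kwargs
# (result_type and columns here), so nothing is read until iteration starts.
enum = Parquet.each_row("data.parquet", result_type: :array, columns: ["id"])
enum.each_slice(100) { |rows| puts rows.length }
```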
@@ -0,0 +1,100 @@
+ /// This module exists to avoid cloning header keys in returned HashMaps.
+ /// Since the underlying RString creation already involves cloning,
+ /// this caching layer aims to reduce redundant allocations.
+ ///
+ /// Note: Performance testing on macOS showed minimal speed improvements,
+ /// so this optimization could be removed if any issues arise.
+ use std::{
+     collections::HashMap,
+     sync::{
+         atomic::{AtomicU32, Ordering},
+         LazyLock, Mutex,
+     },
+ };
+
+ use magnus::{IntoValue, RString, Ruby, Value};
+
+ use thiserror::Error;
+
+ #[derive(Debug, Error)]
+ pub enum CacheError {
+     #[error("Failed to acquire lock: {0}")]
+     LockError(String),
+ }
+
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
+     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+ pub struct StringCache;
+
+ #[derive(Copy, Clone)]
+ pub struct StringCacheKey(&'static str);
+
+ impl StringCacheKey {
+     pub fn new(string: &str) -> Self {
+         let rstr = RString::new(string);
+         let fstr = rstr.to_interned_str();
+         Self(fstr.as_str().unwrap())
+     }
+ }
+
+ impl AsRef<str> for StringCacheKey {
+     fn as_ref(&self) -> &'static str {
+         self.0
+     }
+ }
+
+ impl IntoValue for StringCacheKey {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         handle.into_value(self.0)
+     }
+ }
+
+ impl IntoValue for &StringCacheKey {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         handle.into_value(self.0)
+     }
+ }
+
+ impl std::fmt::Debug for StringCacheKey {
+     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+         self.0.fmt(f)
+     }
+ }
+
+ impl PartialEq for StringCacheKey {
+     fn eq(&self, other: &Self) -> bool {
+         self.0 == other.0
+     }
+ }
+
+ impl std::cmp::Eq for StringCacheKey {}
+
+ impl std::hash::Hash for StringCacheKey {
+     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+         self.0.hash(state);
+     }
+ }
+
+ impl StringCache {
+     pub fn intern_many<AsStr: AsRef<str>>(
+         strings: &[AsStr],
+     ) -> Result<Vec<StringCacheKey>, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
+         for string in strings {
+             if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
+                 counter.fetch_add(1, Ordering::Relaxed);
+                 result.push(*interned_string);
+             } else {
+                 let interned = StringCacheKey::new(string.as_ref());
+                 cache.insert(interned.0, (interned, AtomicU32::new(1)));
+                 result.push(interned);
+             }
+         }
+         Ok(result)
+     }
+ }
@@ -0,0 +1,29 @@
+ mod allocator;
+ mod enumerator;
+ pub mod header_cache;
+ mod reader;
+ mod ruby_integration;
+ mod ruby_reader;
+ mod types;
+ mod utils;
+ mod writer;
+
+ use crate::enumerator::*;
+ use crate::reader::*;
+ use crate::ruby_integration::*;
+ use crate::types::*;
+
+ use magnus::{Error, Ruby};
+ use writer::write_columns;
+ use writer::write_rows;
+
+ /// Initializes the Ruby extension and defines methods.
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> Result<(), Error> {
+     let module = ruby.define_module("Parquet")?;
+     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
+     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
+     module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
+     module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
+     Ok(())
+ }
@@ -0,0 +1,44 @@
+ mod parquet_column_reader;
+ mod parquet_row_reader;
+
+ use std::io;
+
+ use magnus::{Error as MagnusError, Ruby};
+ use thiserror::Error;
+
+ use crate::header_cache::CacheError;
+ pub use parquet_column_reader::parse_parquet_columns;
+ pub use parquet_row_reader::parse_parquet_rows;
+
+ #[derive(Error, Debug)]
+ pub enum ReaderError {
+     #[error("Failed to get file descriptor: {0}")]
+     FileDescriptor(String),
+     #[error("Invalid file descriptor")]
+     InvalidFileDescriptor,
+     #[error("Failed to open file: {0}")]
+     FileOpen(#[from] io::Error),
+     #[error("Failed to intern headers: {0}")]
+     HeaderIntern(#[from] CacheError),
+     #[error("Ruby error: {0}")]
+     Ruby(String),
+     #[error("Parquet error: {0}")]
+     Parquet(#[from] parquet::errors::ParquetError),
+     #[error("Arrow error: {0}")]
+     Arrow(#[from] arrow_schema::ArrowError),
+ }
+
+ impl From<MagnusError> for ReaderError {
+     fn from(err: MagnusError) -> Self {
+         Self::Ruby(err.to_string())
+     }
+ }
+
+ impl From<ReaderError> for MagnusError {
+     fn from(err: ReaderError) -> Self {
+         MagnusError::new(
+             Ruby::get().unwrap().exception_runtime_error(),
+             err.to_string(),
+         )
+     }
+ }
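The `From<ReaderError> for MagnusError` impl above converts every reader failure into a Ruby exception built from `exception_runtime_error`, with the `#[error(...)]` text as the message. A hedged sketch of what that implies for Ruby callers:

```ruby
require "parquet"

# Per the conversion above, reader failures (I/O errors, Parquet or Arrow
# errors, bad file descriptors) should surface as RuntimeError in Ruby.
begin
  Parquet.each_row("does_not_exist.parquet") { |row| p row }
rescue RuntimeError => e
  warn "parquet read failed: #{e.message}"
end
```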