parquet 0.2.12-arm64-darwin

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
data/Cargo.toml ADDED
@@ -0,0 +1,3 @@
+ [workspace]
+ members = ["./ext/parquet"]
+ resolver = "2"
data/Gemfile ADDED
@@ -0,0 +1,17 @@
+ source "https://rubygems.org"
+
+ gem "rb_sys", "~> 0.9.56"
+ gem "rake"
+
+ # Use local version of parquet
+ gemspec
+
+ group :development do
+   gem "benchmark-ips", "~> 2.12"
+   gem "polars-df"
+   gem "duckdb"
+ end
+
+ group :test do
+   gem "minitest", "~> 5.0"
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Nathan Jaremko
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,197 @@
+ # parquet-ruby
+
+ [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
+
+ ## Usage
+
+ This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
+
+ ### Row-wise Iteration
+
+ The `each_row` method provides sequential access to individual rows:
+
+ ```ruby
+ require "parquet"
+
+ # Basic usage with default hash output
+ Parquet.each_row("data.parquet") do |row|
+   puts row.inspect # {"id"=>1, "name"=>"name_1"}
+ end
+
+ # Array output for more efficient memory usage
+ Parquet.each_row("data.parquet", result_type: :array) do |row|
+   puts row.inspect # [1, "name_1"]
+ end
+
+ # Select specific columns to reduce I/O
+ Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
+   puts row.inspect
+ end
+
+ # Reading from IO objects
+ File.open("data.parquet", "rb") do |file|
+   Parquet.each_row(file) do |row|
+     puts row.inspect
+   end
+ end
+ ```
+
+ ### Column-wise Iteration
+
+ The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
+
+ ```ruby
+ require "parquet"
+
+ # Process columns in batches of 1024 rows
+ Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
+   # With result_type: :hash (default)
+   puts batch.inspect
+   # {
+   #   "id" => [1, 2, ..., 1024],
+   #   "name" => ["name_1", "name_2", ..., "name_1024"]
+   # }
+ end
+
+ # Array output with specific columns
+ Parquet.each_column("data.parquet",
+                     columns: ["id", "name"],
+                     result_type: :array,
+                     batch_size: 1024) do |batch|
+   puts batch.inspect
+   # [
+   #   [1, 2, ..., 1024],        # id column
+   #   ["name_1", "name_2", ...] # name column
+   # ]
+ end
+ ```
+
+ ### Arguments
+
+ Both methods accept these common arguments:
+
+ - `input`: Path string or IO-like object containing Parquet data
+ - `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
+ - `columns`: Optional array of column names to read (improves performance)
+
+ Additional arguments for `each_column`:
+
+ - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
+
+ When no block is given, both methods return an Enumerator.
+
+ ### Writing Row-wise Data
+
+ The `write_rows` method allows you to write data row by row:
+
+ ```ruby
+ require "parquet"
+
+ # Define the schema for your data
+ schema = [
+   { "id" => "int64" },
+   { "name" => "string" },
+   { "score" => "double" }
+ ]
+
+ # Create an enumerator that yields arrays of row values
+ rows = [
+   [1, "Alice", 95.5],
+   [2, "Bob", 82.3],
+   [3, "Charlie", 88.7]
+ ].each
+
+ # Write to a file
+ Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
+
+ # Write to an IO object
+ File.open("data.parquet", "wb") do |file|
+   Parquet.write_rows(rows, schema: schema, write_to: file)
+ end
+
+ # Optionally specify batch size (default is 1000)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    batch_size: 500
+ )
+
+ # Optionally specify memory threshold for flushing (default is 64MB)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    flush_threshold: 32 * 1024 * 1024 # 32MB
+ )
+
+ # Optionally specify sample size for row size estimation (default is 100)
+ Parquet.write_rows(rows,
+                    schema: schema,
+                    write_to: "data.parquet",
+                    sample_size: 200 # Sample 200 rows for size estimation
+ )
+ ```
+
+ ### Writing Column-wise Data
+
+ The `write_columns` method provides a more efficient way to write data in column-oriented batches:
+
+ ```ruby
+ require "parquet"
+
+ # Define the schema
+ schema = [
+   { "id" => "int64" },
+   { "name" => "string" },
+   { "score" => "double" }
+ ]
+
+ # Create batches of column data
+ batches = [
+   # First batch
+   [
+     [1, 2],           # id column
+     ["Alice", "Bob"], # name column
+     [95.5, 82.3]      # score column
+   ],
+   # Second batch
+   [
+     [3],         # id column
+     ["Charlie"], # name column
+     [88.7]       # score column
+   ]
+ ]
+
+ # Create an enumerator from the batches
+ columns = batches.each
+
+ # Write to a parquet file with default ZSTD compression
+ Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
+
+ # Write to a parquet file with specific compression and memory threshold
+ Parquet.write_columns(columns,
+                       schema: schema,
+                       write_to: "data.parquet",
+                       compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+                       flush_threshold: 32 * 1024 * 1024 # 32MB
+ )
+
+ # Write to an IO object
+ File.open("data.parquet", "wb") do |file|
+   Parquet.write_columns(columns, schema: schema, write_to: file)
+ end
+ ```
+
+ The following data types are supported in the schema:
+
+ - `int8`, `int16`, `int32`, `int64`
+ - `uint8`, `uint16`, `uint32`, `uint64`
+ - `float`, `double`
+ - `string`
+ - `binary`
+ - `boolean`
+ - `date32`
+ - `timestamp_millis`, `timestamp_micros`
+
+ Note: List and Map types are currently not supported.
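To connect the reading and writing APIs documented in the README above, here is a brief sketch (not part of the packaged files) that writes a file using some of the listed types and then reads it back without a block to obtain an Enumerator. Whether Ruby `Date` and `Time` values are accepted for `date32` and `timestamp_millis` columns is an assumption here, not something the README states.

```ruby
require "parquet"
require "date"

# Schema exercising a few of the documented types.
schema = [
  { "id" => "int64" },
  { "flag" => "boolean" },
  { "day" => "date32" },         # assumption: Ruby Date values are coerced
  { "ts" => "timestamp_millis" } # assumption: Ruby Time values are coerced
]

rows = [
  [1, true, Date.new(2024, 1, 1), Time.now],
  [2, false, Date.new(2024, 1, 2), Time.now]
].each

Parquet.write_rows(rows, schema: schema, write_to: "typed.parquet")

# Per the Arguments section, each_row returns an Enumerator when no block is given.
enum = Parquet.each_row("typed.parquet")
puts enum.first(2).inspect
```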
data/Rakefile ADDED
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+
+ require "rake/testtask"
+ require "rb_sys/extensiontask"
+
+ task default: :test
+
+ GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+ RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+   ext.lib_dir = "lib/parquet"
+   ext.ext_dir = "ext/parquet"
+ end
+
+ Rake::TestTask.new do |t|
+   t.deps << :compile
+   t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+   t.libs << "lib"
+   t.libs << "test"
+ end
+
+ task :release do
+   sh "bundle exec rake test"
+   sh "mkdir -p pkg"
+   sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+   sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+ end
@@ -0,0 +1,28 @@
+ [package]
+ name = "parquet"
+ version = "0.1.0"
+ edition = "2021"
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ ahash = "0.8"
+ arrow-array = "54.0.0"
+ arrow-schema = "54.0.0"
+ bytes = "^1.9"
+ itertools = "^0.14"
+ jiff = "0.1.19"
+ magnus = { version = "0.7", features = ["rb-sys"] }
+ parquet = { version = "^54.0", features = ["json"] }
+ rand = "0.9"
+ rb-sys = "^0.9"
+ thiserror = "2.0"
+ tempfile = "^3.15"
+ simdutf8 = "0.1.5"
+
+ [target.'cfg(target_os = "linux")'.dependencies]
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
+
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,4 @@
+ require "mkmf"
+ require "rb_sys/mkmf"
+
+ create_rust_makefile("parquet/parquet")
@@ -0,0 +1,13 @@
+ #[cfg(target_os = "linux")]
+ use jemallocator::Jemalloc;
+
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ use mimalloc::MiMalloc;
+
+ #[global_allocator]
+ #[cfg(target_os = "linux")]
+ static ALLOC: Jemalloc = Jemalloc;
+
+ #[global_allocator]
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
+ static ALLOC: MiMalloc = MiMalloc;
@@ -0,0 +1,52 @@
+ use crate::ParserResultType;
+ use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
+
+ pub struct RowEnumeratorArgs {
+     pub rb_self: Value,
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+ }
+
+ /// Creates an enumerator for lazy Parquet row parsing
+ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
+     let kwargs = RHash::new();
+     kwargs.aset(
+         Symbol::new("result_type"),
+         Symbol::new(args.result_type.to_string()),
+     )?;
+     if let Some(columns) = args.columns {
+         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
+     }
+     Ok(args
+         .rb_self
+         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
+ }
+
+ pub struct ColumnEnumeratorArgs {
+     pub rb_self: Value,
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+     pub batch_size: Option<usize>,
+ }
+
+ #[inline]
+ pub fn create_column_enumerator(
+     args: ColumnEnumeratorArgs,
+ ) -> Result<magnus::Enumerator, MagnusError> {
+     let kwargs = RHash::new();
+     kwargs.aset(
+         Symbol::new("result_type"),
+         Symbol::new(args.result_type.to_string()),
+     )?;
+     if let Some(columns) = args.columns {
+         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
+     }
+     if let Some(batch_size) = args.batch_size {
+         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
+     }
+     Ok(args
+         .rb_self
+         .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
+ }
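For orientation, the two functions above pack the caller's options back into a kwargs hash and call `enumeratorize`, so a block-less call to `each_row` or `each_column` yields an Enumerator that re-invokes the method with the same options on demand. A hedged Ruby-side illustration of that behavior:

```ruby
require "parquet"

# The Enumerator re-enters each_row lazily with the saved kwargs
# (result_type and columns here), so nothing is read until iteration starts.
enum = Parquet.each_row("data.parquet", result_type: :array, columns: ["id"])
enum.each_slice(100) { |rows| puts rows.length }
```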
@@ -0,0 +1,100 @@
+ /// This module exists to avoid cloning header keys in returned HashMaps.
+ /// Since the underlying RString creation already involves cloning,
+ /// this caching layer aims to reduce redundant allocations.
+ ///
+ /// Note: Performance testing on macOS showed minimal speed improvements,
+ /// so this optimization could be removed if any issues arise.
+ use std::{
+     collections::HashMap,
+     sync::{
+         atomic::{AtomicU32, Ordering},
+         LazyLock, Mutex,
+     },
+ };
+
+ use magnus::{IntoValue, RString, Ruby, Value};
+
+ use thiserror::Error;
+
+ #[derive(Debug, Error)]
+ pub enum CacheError {
+     #[error("Failed to acquire lock: {0}")]
+     LockError(String),
+ }
+
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
+     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+ pub struct StringCache;
+
+ #[derive(Copy, Clone)]
+ pub struct StringCacheKey(&'static str);
+
+ impl StringCacheKey {
+     pub fn new(string: &str) -> Self {
+         let rstr = RString::new(string);
+         let fstr = rstr.to_interned_str();
+         Self(fstr.as_str().unwrap())
+     }
+ }
+
+ impl AsRef<str> for StringCacheKey {
+     fn as_ref(&self) -> &'static str {
+         self.0
+     }
+ }
+
+ impl IntoValue for StringCacheKey {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         handle.into_value(self.0)
+     }
+ }
+
+ impl IntoValue for &StringCacheKey {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         handle.into_value(self.0)
+     }
+ }
+
+ impl std::fmt::Debug for StringCacheKey {
+     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+         self.0.fmt(f)
+     }
+ }
+
+ impl PartialEq for StringCacheKey {
+     fn eq(&self, other: &Self) -> bool {
+         self.0 == other.0
+     }
+ }
+
+ impl std::cmp::Eq for StringCacheKey {}
+
+ impl std::hash::Hash for StringCacheKey {
+     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+         self.0.hash(state);
+     }
+ }
+
+ impl StringCache {
+     pub fn intern_many<AsStr: AsRef<str>>(
+         strings: &[AsStr],
+     ) -> Result<Vec<StringCacheKey>, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
+         for string in strings {
+             if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
+                 counter.fetch_add(1, Ordering::Relaxed);
+                 result.push(*interned_string);
+             } else {
+                 let interned = StringCacheKey::new(string.as_ref());
+                 cache.insert(interned.0, (interned, AtomicU32::new(1)));
+                 result.push(interned);
+             }
+         }
+         Ok(result)
+     }
+ }
@@ -0,0 +1,29 @@
+ mod allocator;
+ mod enumerator;
+ pub mod header_cache;
+ mod reader;
+ mod ruby_integration;
+ mod ruby_reader;
+ mod types;
+ mod utils;
+ mod writer;
+
+ use crate::enumerator::*;
+ use crate::reader::*;
+ use crate::ruby_integration::*;
+ use crate::types::*;
+
+ use magnus::{Error, Ruby};
+ use writer::write_columns;
+ use writer::write_rows;
+
+ /// Initializes the Ruby extension and defines methods.
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> Result<(), Error> {
+     let module = ruby.define_module("Parquet")?;
+     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
+     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
+     module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
+     module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
+     Ok(())
+ }
@@ -0,0 +1,44 @@
+ mod parquet_column_reader;
+ mod parquet_row_reader;
+
+ use std::io;
+
+ use magnus::{Error as MagnusError, Ruby};
+ use thiserror::Error;
+
+ use crate::header_cache::CacheError;
+ pub use parquet_column_reader::parse_parquet_columns;
+ pub use parquet_row_reader::parse_parquet_rows;
+
+ #[derive(Error, Debug)]
+ pub enum ReaderError {
+     #[error("Failed to get file descriptor: {0}")]
+     FileDescriptor(String),
+     #[error("Invalid file descriptor")]
+     InvalidFileDescriptor,
+     #[error("Failed to open file: {0}")]
+     FileOpen(#[from] io::Error),
+     #[error("Failed to intern headers: {0}")]
+     HeaderIntern(#[from] CacheError),
+     #[error("Ruby error: {0}")]
+     Ruby(String),
+     #[error("Parquet error: {0}")]
+     Parquet(#[from] parquet::errors::ParquetError),
+     #[error("Arrow error: {0}")]
+     Arrow(#[from] arrow_schema::ArrowError),
+ }
+
+ impl From<MagnusError> for ReaderError {
+     fn from(err: MagnusError) -> Self {
+         Self::Ruby(err.to_string())
+     }
+ }
+
+ impl From<ReaderError> for MagnusError {
+     fn from(err: ReaderError) -> Self {
+         MagnusError::new(
+             Ruby::get().unwrap().exception_runtime_error(),
+             err.to_string(),
+         )
+     }
+ }
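The `From<ReaderError> for MagnusError` impl above converts every reader failure into a Ruby exception built from `exception_runtime_error`, with the `#[error(...)]` text as the message. A hedged sketch of what that implies for Ruby callers:

```ruby
require "parquet"

# Per the conversion above, reader failures (I/O errors, Parquet or Arrow
# errors, bad file descriptors) should surface as RuntimeError in Ruby.
begin
  Parquet.each_row("does_not_exist.parquet") { |row| p row }
rescue RuntimeError => e
  warn "parquet read failed: #{e.message}"
end
```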