parquet 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Cargo.toml ADDED
@@ -0,0 +1,3 @@
+ [workspace]
+ members = ["./ext/parquet"]
+ resolver = "2"
data/Gemfile ADDED
@@ -0,0 +1,12 @@
+ source "https://rubygems.org"
+
+ gem "rb_sys", "~> 0.9.56"
+ gem "rake"
+
+ # Use local version of parquet
+ gemspec
+
+ group :development, :test do
+   gem "minitest", "~> 5.0"
+   gem "benchmark-ips", "~> 2.12"
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Nathan Jaremko
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # parquet-ruby
+
+ [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
+
+ At the moment, it only supports iterating over rows as either a hash or an array.
+
+ ## Usage
+
+ ```ruby
+ require "parquet"
+
+ # Read each row as a hash
+ Parquet.each_row("test/data.parquet") { |row| puts row.inspect }
+
+ # Read each row as an array
+ Parquet.each_row("test/data.parquet", result_type: :array) { |row| puts row.inspect }
+
+ # Read from an IO object (like File or StringIO)
+ File.open("test/data.parquet", "rb") do |file|
+   Parquet.each_row(file) { |row| puts row.inspect }
+ end
+
+ # Or with StringIO
+ io = StringIO.new(File.binread("test/data.parquet"))
+ Parquet.each_row(io) { |row| puts row.inspect }
+ ```
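Note on the API surface: `create_enumerator` in `ext/parquet/src/reader.rs` (later in this diff) indicates that calling `Parquet.each_row` without a block returns an `Enumerator`, with the `result_type` keyword carried along. A minimal sketch of that usage, assuming a local `test/data.parquet`; this example is illustrative and not part of the packaged README:

```ruby
require "parquet"

# Without a block, each_row returns an Enumerator
# (see create_enumerator in reader.rs), so rows can be consumed lazily.
rows = Parquet.each_row("test/data.parquet", result_type: :array)
rows.take(5).each { |row| puts row.inspect }
```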
data/Rakefile ADDED
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+
+ require "rake/testtask"
+ require "rb_sys/extensiontask"
+
+ task default: :test
+
+ GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+ RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+   ext.lib_dir = "lib/parquet"
+   ext.ext_dir = "ext/parquet"
+ end
+
+ Rake::TestTask.new do |t|
+   t.deps << :compile
+   t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+   t.libs << "lib"
+   t.libs << "test"
+ end
+
+ task :release do
+   sh "bundle exec rake test"
+   sh "mkdir -p pkg"
+   sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+   sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+ end
data/ext/parquet/Cargo.toml ADDED
@@ -0,0 +1,18 @@
+ [package]
+ name = "parquet"
+ version = "0.1.0"
+ edition = "2021"
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ parquet = { version = "^54.0", features = ["json", "object_store"] }
+ bytes = "^1.9"
+ kanal = "0.1.0-pre8"
+ magnus = { version = "0.7", features = ["rb-sys"] }
+ rb-sys = "^0.9"
+ serde = { version = "1.0", features = ["derive"] }
+ serde_magnus = "0.8.1"
+ thiserror = "2.0"
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
data/ext/parquet/extconf.rb ADDED
@@ -0,0 +1,4 @@
+ require "mkmf"
+ require "rb_sys/mkmf"
+
+ create_rust_makefile("parquet/parquet")
data/ext/parquet/src/header_cache.rs ADDED
@@ -0,0 +1,81 @@
+ //! This module exists to avoid cloning header keys in returned HashMaps.
+ //! Since the underlying RString creation already involves cloning,
+ //! this caching layer aims to reduce redundant allocations.
+ //!
+ //! Note: Performance testing on macOS showed minimal speed improvements,
+ //! so this optimization could be removed if any issues arise.
+
+ use std::{
+     collections::HashMap,
+     sync::{atomic::AtomicU32, LazyLock, Mutex},
+ };
+ use thiserror::Error;
+
+ #[derive(Debug, Error)]
+ pub enum CacheError {
+     #[error("Failed to acquire lock: {0}")]
+     LockError(String),
+ }
+
+ // Global map from each leaked &'static str to its reference count.
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+ pub struct StringCache;
+
+ impl StringCache {
+     #[allow(dead_code)]
+     pub fn intern(string: String) -> Result<&'static str, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+             count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+             Ok(existing)
+         } else {
+             // Leak the string to obtain a &'static str; the matching
+             // Box::from_raw in clear() reclaims it.
+             let leaked = Box::leak(string.into_boxed_str());
+             cache.insert(leaked, AtomicU32::new(1));
+             Ok(leaked)
+         }
+     }
+
+     pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         let mut result = Vec::with_capacity(strings.len());
+         for string in strings {
+             if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+                 count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                 result.push(existing);
+             } else {
+                 let leaked = Box::leak(string.clone().into_boxed_str());
+                 cache.insert(leaked, AtomicU32::new(1));
+                 result.push(leaked);
+             }
+         }
+         Ok(result)
+     }
+
+     pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         for header in headers {
+             if let Some(count) = cache.get(header) {
+                 // fetch_sub returns the previous value of the counter
+                 let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+                 if was == 1 {
+                     // Last reference: remove the entry and free the leaked str.
+                     cache.remove(header);
+                     let ptr = *header as *const str as *mut str;
+                     unsafe {
+                         let _ = Box::from_raw(ptr);
+                     }
+                 }
+             }
+         }
+
+         Ok(())
+     }
+ }
data/ext/parquet/src/lib.rs ADDED
@@ -0,0 +1,16 @@
+ pub mod header_cache;
+ mod reader;
+ mod ruby_reader;
+ mod utils;
+
+ use crate::reader::*;
+
+ use magnus::{Error, Ruby};
+
+ /// Initializes the Ruby extension and defines methods.
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> Result<(), Error> {
+     let module = ruby.define_module("Parquet")?;
+     module.define_module_function("each_row", magnus::method!(parse_parquet, -1))?;
+     Ok(())
+ }
data/ext/parquet/src/reader.rs ADDED
@@ -0,0 +1,337 @@
+ use crate::header_cache::{CacheError, StringCache};
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
+ use crate::utils::*;
+ use bytes::Bytes;
+ use magnus::rb_sys::AsRawValue;
+ use magnus::value::{Opaque, ReprValue};
+ use magnus::IntoValue;
+ use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
+ use parquet::errors::ParquetError;
+ use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
+ use parquet::record::Field;
+ use std::collections::HashMap;
+ use std::fs::File;
+ use std::io::{self, BufReader, Read, Seek, SeekFrom};
+ use std::mem::ManuallyDrop;
+ use std::os::fd::FromRawFd;
+ use std::sync::{Arc, OnceLock};
+ use std::{borrow::Cow, hash::BuildHasher};
+ use thiserror::Error;
+ use xxhash_rust::xxh3::Xxh3Builder;
+
+ use parquet::record::reader::RowIter as ParquetRowIter;
+
+ #[derive(Error, Debug)]
+ pub enum ReaderError {
+     #[error("Failed to get file descriptor: {0}")]
+     FileDescriptor(String),
+     #[error("Invalid file descriptor")]
+     InvalidFileDescriptor,
+     #[error("Failed to open file: {0}")]
+     FileOpen(#[from] io::Error),
+     #[error("Failed to intern headers: {0}")]
+     HeaderIntern(#[from] CacheError),
+     #[error("Failed to read parquet file: {0}")]
+     Parquet(#[from] ParquetError),
+     #[error("Ruby error: {0}")]
+     Ruby(String),
+ }
+
+ impl From<MagnusError> for ReaderError {
+     fn from(err: MagnusError) -> Self {
+         Self::Ruby(err.to_string())
+     }
+ }
+
+ impl From<ReaderError> for MagnusError {
+     fn from(err: ReaderError) -> Self {
+         MagnusError::new(
+             Ruby::get().unwrap().exception_runtime_error(),
+             err.to_string(),
+         )
+     }
+ }
+
+ struct ForgottenFileHandle(ManuallyDrop<File>);
+
+ impl Length for ForgottenFileHandle {
+     fn len(&self) -> u64 {
+         self.0.len()
+     }
+ }
+
+ impl ChunkReader for ForgottenFileHandle {
+     type T = BufReader<File>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         self.0.get_read(start)
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         self.0.get_bytes(start, length)
+     }
+ }
+
+ struct HeaderCacheCleanupIter<I> {
+     inner: I,
+     // Shared with the row-mapping closure (see parse_parquet) so the
+     // headers interned there are visible here and can be released on drop.
+     headers: Arc<OnceLock<Vec<&'static str>>>,
+ }
+
+ impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+     type Item = I::Item;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         self.inner.next()
+     }
+ }
+
+ impl<I> Drop for HeaderCacheCleanupIter<I> {
+     fn drop(&mut self) {
+         if let Some(headers) = self.headers.get() {
+             StringCache::clear(headers).unwrap();
+         }
+     }
+ }
+
+ pub fn parse_parquet(
+     rb_self: Value,
+     args: &[Value],
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+     // Leak a boxed Ruby handle so the iterator closures below can hold a
+     // &'static reference; the handle is tiny and lives for the process.
+     let original = unsafe { Ruby::get_unchecked() };
+     let ruby: &'static Ruby = Box::leak(Box::new(original));
+
+     let ParquetArgs {
+         to_read,
+         result_type,
+     } = parse_parquet_args(&ruby, args)?;
+
+     if !ruby.block_given() {
+         return create_enumerator(EnumeratorArgs {
+             rb_self,
+             to_read,
+             result_type,
+         });
+     }
+
+     let iter = if to_read.is_kind_of(ruby.class_string()) {
+         let path_string = to_read.to_r_string()?;
+         let file_path = unsafe { path_string.as_str()? };
+         let file = File::open(file_path).map_err(ReaderError::from)?;
+         let reader = SerializedFileReader::new(file).map_err(ReaderError::from)?;
+         ParquetRowIter::from_file_into(Box::new(reader))
+     } else if to_read.is_kind_of(ruby.class_io()) {
+         let raw_value = to_read.as_raw();
+         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+             .map_err(|_| {
+                 ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+             })?;
+
+         if fd < 0 {
+             return Err(ReaderError::InvalidFileDescriptor.into());
+         }
+
+         // Borrow the IO's file descriptor without taking ownership;
+         // ManuallyDrop keeps the fd from being closed when `file` goes away.
+         let file = unsafe { File::from_raw_fd(fd) };
+         let file = ForgottenFileHandle(ManuallyDrop::new(file));
+         let reader = SerializedFileReader::new(file).map_err(ReaderError::from)?;
+         ParquetRowIter::from_file_into(Box::new(reader))
+     } else {
+         let readable = SeekableRubyValue(Opaque::from(to_read));
+         let reader = SerializedFileReader::new(readable).map_err(ReaderError::from)?;
+         ParquetRowIter::from_file_into(Box::new(reader))
+     };
+
+     let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
+         "hash" => {
+             // Shared cell: initialized from the first row inside the closure,
+             // read again by HeaderCacheCleanupIter::drop to release the cache.
+             let headers = Arc::new(OnceLock::new());
+             let headers_clone = Arc::clone(&headers);
+             let iter = iter
+                 .filter_map(move |row| {
+                     row.ok().map(|row| {
+                         let headers = headers_clone.get_or_init(|| {
+                             row.get_column_iter()
+                                 .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
+                                 .collect::<Vec<_>>()
+                         });
+
+                         row.get_column_iter()
+                             .enumerate()
+                             .map(|(i, (_, v))| {
+                                 let key = headers[i];
+                                 (key, ParquetField(v.clone()))
+                             })
+                             .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                     })
+                 })
+                 .map(Record::Map);
+
+             Box::new(HeaderCacheCleanupIter {
+                 inner: iter,
+                 headers,
+             })
+         }
+         "array" => Box::new(
+             iter.filter_map(|row| {
+                 row.ok().map(|row| {
+                     row.get_column_iter()
+                         .map(|(_, v)| ParquetField(v.clone()))
+                         .collect::<Vec<ParquetField>>()
+                 })
+             })
+             .map(Record::Vec),
+         ),
+         _ => {
+             return Err(MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 "Invalid result type",
+             ))
+         }
+     };
+
+     Ok(Yield::Iter(iter))
+ }
+
+ struct EnumeratorArgs {
+     rb_self: Value,
+     to_read: Value,
+     result_type: String,
+ }
+
+ fn create_enumerator(
+     args: EnumeratorArgs,
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+     let kwargs = RHash::new();
+
+     kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+     // Re-invoke the method this extension actually defines ("each_row",
+     // see lib.rs) so the returned Enumerator works without a block.
+     let enumerator = args
+         .rb_self
+         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
+     Ok(Yield::Enumerator(enumerator))
+ }
+
+ #[derive(Debug)]
+ pub enum Record<S: BuildHasher + Default> {
+     Vec(Vec<ParquetField>),
+     Map(HashMap<&'static str, ParquetField, S>),
+ }
+
+ impl<S: BuildHasher + Default> IntoValue for Record<S> {
+     #[inline]
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self {
+             Record::Vec(vec) => {
+                 let ary = handle.ary_new_capa(vec.len());
+                 vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                 ary.into_value_with(handle)
+             }
+             Record::Map(map) => {
+                 // Pre-allocate the hash with the known size
+                 let hash = handle.hash_new_capa(map.len());
+                 map.into_iter()
+                     .try_for_each(|(k, v)| hash.aset(k, v))
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+         }
+     }
+ }
+
+ #[derive(Debug, Clone)]
+ pub struct CowValue<'a>(pub Cow<'a, str>);
+
+ impl<'a> IntoValue for CowValue<'a> {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         self.0.into_value_with(handle)
+     }
+ }
+
+ #[derive(Debug)]
+ pub struct ParquetField(Field);
+
+ impl IntoValue for ParquetField {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self.0 {
+             Field::Byte(b) => b.into_value_with(handle),
+             Field::Bool(b) => b.into_value_with(handle),
+             Field::Short(s) => s.into_value_with(handle),
+             Field::Int(i) => i.into_value_with(handle),
+             Field::Long(l) => l.into_value_with(handle),
+             Field::UByte(ub) => ub.into_value_with(handle),
+             Field::UShort(us) => us.into_value_with(handle),
+             Field::UInt(ui) => ui.into_value_with(handle),
+             Field::ULong(ul) => ul.into_value_with(handle),
+             Field::Float16(f) => f32::from(f).into_value_with(handle),
+             Field::Float(f) => f.into_value_with(handle),
+             Field::Double(d) => d.into_value_with(handle),
+
+             Field::Str(s) => s.into_value_with(handle),
+             Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+             Field::Date(d) => d.into_value_with(handle),
+             Field::TimestampMillis(ts) => ts.into_value_with(handle),
+             Field::TimestampMicros(ts) => ts.into_value_with(handle),
+             Field::ListInternal(list) => {
+                 let ary = handle.ary_new_capa(list.elements().len());
+                 list.elements()
+                     .iter()
+                     .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
+                     .unwrap();
+                 ary.into_value_with(handle)
+             }
+             Field::MapInternal(map) => {
+                 let hash = handle.hash_new_capa(map.entries().len());
+                 map.entries()
+                     .iter()
+                     .try_for_each(|(k, v)| {
+                         hash.aset(
+                             ParquetField(k.clone()).into_value_with(handle),
+                             ParquetField(v.clone()).into_value_with(handle),
+                         )
+                     })
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+             // Field::Decimal(d) => d.to_string().into_value_with(handle),
+             // Field::Group(row) => row.into_value_with(handle),
+             Field::Null => handle.qnil().as_value(),
+             // Remaining Field variants (e.g. Decimal, Group) are not yet supported.
+             _ => panic!("Unsupported field type"),
+         }
+     }
+ }
+
+ struct SeekableRubyValue(Opaque<Value>);
+
+ impl Length for SeekableRubyValue {
+     fn len(&self) -> u64 {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         // Measure the stream length, then restore the original position.
+         let current_pos = reader.stream_position().unwrap();
+         let file_len = reader.seek(SeekFrom::End(0)).unwrap();
+         reader.seek(SeekFrom::Start(current_pos)).unwrap();
+         file_len
+     }
+ }
+
+ impl ChunkReader for SeekableRubyValue {
+     type T = BufReader<Box<dyn SeekableRead>>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         reader.seek(SeekFrom::Start(start))?;
+         Ok(BufReader::new(reader))
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut buffer = Vec::with_capacity(length);
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         reader.seek(SeekFrom::Start(start))?;
+         let read = reader.take(length as _).read_to_end(&mut buffer)?;
+
+         if read != length {
+             return Err(ParquetError::EOF(format!(
+                 "Expected to read {} bytes, read only {}",
+                 length, read
+             )));
+         }
+         Ok(buffer.into())
+     }
+ }
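For a sense of how the three input paths in `parse_parquet` dispatch: a `String` is treated as a file path, a real `IO` is unwrapped to its file descriptor via `rb_io_descriptor`, and anything else falls back to `SeekableRubyValue`, which drives the object through `build_ruby_reader` (defined in `ruby_reader.rs`, which this diff does not show). A hedged Ruby sketch of the three cases; note that `StringIO` is not a subclass of `IO`, so it exercises the fallback path:

```ruby
require "parquet"
require "stringio"

path = "test/data.parquet"

# 1. String => opened directly as a file path.
Parquet.each_row(path) { |row| row }

# 2. IO (e.g. File) => read through its raw file descriptor.
File.open(path, "rb") { |f| Parquet.each_row(f) { |row| row } }

# 3. Anything else (e.g. StringIO, which is not an IO subclass)
#    => wrapped by the SeekableRubyValue fallback.
Parquet.each_row(StringIO.new(File.binread(path))) { |row| row }
```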