parquet 0.0.1

data/Cargo.toml ADDED
@@ -0,0 +1,3 @@
+ [workspace]
+ members = ["./ext/parquet"]
+ resolver = "2"
data/Gemfile ADDED
@@ -0,0 +1,12 @@
+ source "https://rubygems.org"
+
+ gem "rb_sys", "~> 0.9.56"
+ gem "rake"
+
+ # Use local version of parquet
+ gemspec
+
+ group :development, :test do
+   gem "minitest", "~> 5.0"
+   gem "benchmark-ips", "~> 2.12"
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Nathan Jaremko
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # parquet-ruby
+
+ [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
+
+ At the moment, it only supports iterating rows as either a hash or an array.
+
+ ## Usage
+
+ ```ruby
+ require "parquet"
+
+ # Read each row as a hash
+ Parquet.each_row("test/data.parquet") { |row| puts row.inspect }
+
+ # Read each row as an array
+ Parquet.each_row("test/data.parquet", result_type: :array) { |row| puts row.inspect }
+
+ # Read from an IO object (like File or StringIO)
+ File.open("test/data.parquet", "rb") do |file|
+   Parquet.each_row(file) { |row| puts row.inspect }
+ end
+
+ # Or with StringIO
+ io = StringIO.new(File.binread("test/data.parquet"))
+ Parquet.each_row(io) { |row| puts row.inspect }
+ ```
data/Rakefile ADDED
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+
+ require "rake/testtask"
+ require "rb_sys/extensiontask"
+
+ task default: :test
+
+ GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+ RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+   ext.lib_dir = "lib/parquet"
+   ext.ext_dir = "ext/parquet"
+ end
+
+ Rake::TestTask.new do |t|
+   t.deps << :compile
+   t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+   t.libs << "lib"
+   t.libs << "test"
+ end
+
+ task :release do
+   sh "bundle exec rake test"
+   sh "mkdir -p pkg"
+   sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+   sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+ end
data/ext/parquet/Cargo.toml ADDED
@@ -0,0 +1,18 @@
+ [package]
+ name = "parquet"
+ version = "0.1.0"
+ edition = "2021"
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ parquet = { version = "^54.0", features = ["json", "object_store"] }
+ bytes = "^1.9"
+ kanal = "0.1.0-pre8"
+ magnus = { version = "0.7", features = ["rb-sys"] }
+ rb-sys = "^0.9"
+ serde = { version = "1.0", features = ["derive"] }
+ serde_magnus = "0.8.1"
+ thiserror = "2.0"
+ xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
data/ext/parquet/extconf.rb ADDED
@@ -0,0 +1,4 @@
+ require "mkmf"
+ require "rb_sys/mkmf"
+
+ create_rust_makefile("parquet/parquet")
data/ext/parquet/src/header_cache.rs ADDED
@@ -0,0 +1,81 @@
+ //! This module exists to avoid cloning header keys in returned HashMaps.
+ //! Since the underlying RString creation already involves cloning,
+ //! this caching layer aims to reduce redundant allocations.
+ //!
+ //! Note: Performance testing on macOS showed minimal speed improvements,
+ //! so this optimization could be removed if any issues arise.
+ use std::{
+     collections::HashMap,
+     sync::{atomic::AtomicU32, LazyLock, Mutex},
+ };
+ use thiserror::Error;
+
+ #[derive(Debug, Error)]
+ pub enum CacheError {
+     #[error("Failed to acquire lock: {0}")]
+     LockError(String),
+ }
+
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+ pub struct StringCache;
+
+ impl StringCache {
+     #[allow(dead_code)]
+     pub fn intern(string: String) -> Result<&'static str, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+             count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+             Ok(existing)
+         } else {
+             let leaked = Box::leak(string.into_boxed_str());
+             cache.insert(leaked, AtomicU32::new(1));
+             Ok(leaked)
+         }
+     }
+
+     pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         let mut result = Vec::with_capacity(strings.len());
+         for string in strings {
+             if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+                 count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                 result.push(existing);
+             } else {
+                 let leaked = Box::leak(string.clone().into_boxed_str());
+                 cache.insert(leaked, AtomicU32::new(1));
+                 result.push(leaked);
+             }
+         }
+         Ok(result)
+     }
+
+     pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+         let mut cache = STRING_CACHE
+             .lock()
+             .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+         for header in headers {
+             if let Some(count) = cache.get(header) {
+                 // Returns the previous value of the counter
+                 let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+                 if was == 1 {
+                     cache.remove(header);
+                     let ptr = *header as *const str as *mut str;
+                     unsafe {
+                         let _ = Box::from_raw(ptr);
+                     }
+                 }
+             }
+         }
+
+         Ok(())
+     }
+ }
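
The cache above reference-counts each interned key: `intern`/`intern_many` return the same leaked `&'static str` for a name that is already present and bump its count, while `clear` decrements the count and frees the allocation only when it reaches zero. A minimal sketch of that lifecycle, assuming the `header_cache` module above is in scope (the `cache_lifecycle` function is illustrative, not part of the gem):

```rust
use crate::header_cache::{CacheError, StringCache};

fn cache_lifecycle() -> Result<(), CacheError> {
    let headers = vec!["id".to_string(), "name".to_string()];

    // Two readers intern the same header names; the second call
    // returns the same &'static str pointers and bumps the counts.
    let first = StringCache::intern_many(&headers)?;
    let second = StringCache::intern_many(&headers)?;
    assert_eq!(first, second);

    // Each reader clears its references when it is done; the leaked
    // strings are reclaimed only after the final clear.
    StringCache::clear(&first)?;
    StringCache::clear(&second)?;
    Ok(())
}
```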
data/ext/parquet/src/lib.rs ADDED
@@ -0,0 +1,16 @@
+ pub mod header_cache;
+ mod reader;
+ mod ruby_reader;
+ mod utils;
+
+ use crate::reader::*;
+
+ use magnus::{Error, Ruby};
+
+ /// Initializes the Ruby extension and defines methods.
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> Result<(), Error> {
+     let module = ruby.define_module("Parquet")?;
+     module.define_module_function("each_row", magnus::method!(parse_parquet, -1))?;
+     Ok(())
+ }
data/ext/parquet/src/reader.rs ADDED
@@ -0,0 +1,337 @@
+ use crate::header_cache::{CacheError, StringCache};
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
+ use crate::utils::*;
+ use bytes::Bytes;
+ use magnus::rb_sys::AsRawValue;
+ use magnus::value::{Opaque, ReprValue};
+ use magnus::IntoValue;
+ use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
+ use parquet::errors::ParquetError;
+ use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
+ use parquet::record::Field;
+ use std::collections::HashMap;
+ use std::fs::File;
+ use std::io::{self, BufReader, Read, Seek, SeekFrom};
+ use std::mem::ManuallyDrop;
+ use std::os::fd::FromRawFd;
+ use std::sync::{Arc, OnceLock};
+ use std::{borrow::Cow, hash::BuildHasher};
+ use thiserror::Error;
+ use xxhash_rust::xxh3::Xxh3Builder;
+
+ use parquet::record::reader::RowIter as ParquetRowIter;
+
+ #[derive(Error, Debug)]
+ pub enum ReaderError {
+     #[error("Failed to get file descriptor: {0}")]
+     FileDescriptor(String),
+     #[error("Invalid file descriptor")]
+     InvalidFileDescriptor,
+     #[error("Failed to open file: {0}")]
+     FileOpen(#[from] io::Error),
+     #[error("Failed to intern headers: {0}")]
+     HeaderIntern(#[from] CacheError),
+     #[error("Ruby error: {0}")]
+     Ruby(String),
+ }
+
+ impl From<MagnusError> for ReaderError {
+     fn from(err: MagnusError) -> Self {
+         Self::Ruby(err.to_string())
+     }
+ }
+
+ impl From<ReaderError> for MagnusError {
+     fn from(err: ReaderError) -> Self {
+         MagnusError::new(
+             Ruby::get().unwrap().exception_runtime_error(),
+             err.to_string(),
+         )
+     }
+ }
+
+ struct ForgottenFileHandle(ManuallyDrop<File>);
+
+ impl Length for ForgottenFileHandle {
+     fn len(&self) -> u64 {
+         self.0.len()
+     }
+ }
+
+ impl ChunkReader for ForgottenFileHandle {
+     type T = BufReader<File>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         self.0.get_read(start)
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         self.0.get_bytes(start, length)
+     }
+ }
+
+ struct HeaderCacheCleanupIter<I> {
+     inner: I,
+     // Shared with the row-mapping closure so this Drop impl can see the
+     // headers that were actually interned (a plain OnceLock clone would
+     // be an independent snapshot and never get initialized here).
+     headers: Arc<OnceLock<Vec<&'static str>>>,
+ }
+
+ impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+     type Item = I::Item;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         self.inner.next()
+     }
+ }
+
+ impl<I> Drop for HeaderCacheCleanupIter<I> {
+     fn drop(&mut self) {
+         if let Some(headers) = self.headers.get() {
+             StringCache::clear(headers).unwrap();
+         }
+     }
+ }
+
+ pub fn parse_parquet(
+     rb_self: Value,
+     args: &[Value],
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+     let original = unsafe { Ruby::get_unchecked() };
+     // Deliberately leak a boxed Ruby handle so the returned iterator,
+     // which must be 'static, can keep a reference to it.
+     let ruby: &'static Ruby = Box::leak(Box::new(original));
+
+     let ParquetArgs {
+         to_read,
+         result_type,
+     } = parse_parquet_args(&ruby, args)?;
+
+     if !ruby.block_given() {
+         return create_enumerator(EnumeratorArgs {
+             rb_self,
+             to_read,
+             result_type,
+         });
+     }
+
+     let iter = if to_read.is_kind_of(ruby.class_string()) {
+         let path_string = to_read.to_r_string()?;
+         let file_path = unsafe { path_string.as_str()? };
+         let file = File::open(file_path).map_err(ReaderError::FileOpen)?;
+         let reader = SerializedFileReader::new(file).unwrap();
+         ParquetRowIter::from_file_into(Box::new(reader))
+     } else if to_read.is_kind_of(ruby.class_io()) {
+         let raw_value = to_read.as_raw();
+         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+             .map_err(|_| {
+                 ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+             })?;
+
+         if fd < 0 {
+             return Err(ReaderError::InvalidFileDescriptor.into());
+         }
+
+         let file = unsafe { File::from_raw_fd(fd) };
+         let file = ForgottenFileHandle(ManuallyDrop::new(file));
+         let reader = SerializedFileReader::new(file).unwrap();
+         ParquetRowIter::from_file_into(Box::new(reader))
+     } else {
+         let readable = SeekableRubyValue(Opaque::from(to_read));
+         let reader = SerializedFileReader::new(readable).unwrap();
+         ParquetRowIter::from_file_into(Box::new(reader))
+     };
+
+     let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
+         "hash" => {
+             let headers = Arc::new(OnceLock::new());
+             let headers_clone = Arc::clone(&headers);
+             let iter = iter
+                 .filter_map(move |row| {
+                     row.ok().map(|row| {
+                         let headers = headers_clone.get_or_init(|| {
+                             row.get_column_iter()
+                                 .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
+                                 .collect::<Vec<_>>()
+                         });
+
+                         row.get_column_iter()
+                             .enumerate()
+                             .map(|(i, (_, v))| {
+                                 let key = headers[i];
+                                 (key, ParquetField(v.clone()))
+                             })
+                             .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                     })
+                 })
+                 .map(Record::Map);
+
+             Box::new(HeaderCacheCleanupIter {
+                 inner: iter,
+                 headers,
+             })
+         }
+         "array" => Box::new(
+             iter.filter_map(|row| {
+                 row.ok().map(|row| {
+                     row.get_column_iter()
+                         .map(|(_, v)| ParquetField(v.clone()))
+                         .collect::<Vec<ParquetField>>()
+                 })
+             })
+             .map(Record::Vec),
+         ),
+         _ => {
+             return Err(MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 "Invalid result type",
+             ))
+         }
+     };
+
+     Ok(Yield::Iter(iter))
+ }
+
+ struct EnumeratorArgs {
+     rb_self: Value,
+     to_read: Value,
+     result_type: String,
+ }
+
+ fn create_enumerator(
+     args: EnumeratorArgs,
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+     let kwargs = RHash::new();
+
+     kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+     // The enumerator re-invokes the module function defined in lib.rs,
+     // so the name here must match "each_row".
+     let enumerator = args
+         .rb_self
+         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
+     Ok(Yield::Enumerator(enumerator))
+ }
+
+ #[derive(Debug)]
+ pub enum Record<S: BuildHasher + Default> {
+     Vec(Vec<ParquetField>),
+     Map(HashMap<&'static str, ParquetField, S>),
+ }
+
+ impl<S: BuildHasher + Default> IntoValue for Record<S> {
+     #[inline]
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self {
+             Record::Vec(vec) => {
+                 let ary = handle.ary_new_capa(vec.len());
+                 vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                 ary.into_value_with(handle)
+             }
+             Record::Map(map) => {
+                 // Pre-allocate the hash with the known size
+                 let hash = handle.hash_new_capa(map.len());
+                 map.into_iter()
+                     .try_for_each(|(k, v)| hash.aset(k, v))
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+         }
+     }
+ }
+
+ #[derive(Debug, Clone)]
+ pub struct CowValue<'a>(pub Cow<'a, str>);
+
+ impl<'a> IntoValue for CowValue<'a> {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         self.0.into_value_with(handle)
+     }
+ }
+
+ #[derive(Debug)]
+ pub struct ParquetField(Field);
+
+ impl IntoValue for ParquetField {
+     fn into_value_with(self, handle: &Ruby) -> Value {
+         match self.0 {
+             Field::Byte(b) => b.into_value_with(handle),
+             Field::Bool(b) => b.into_value_with(handle),
+             Field::Short(s) => s.into_value_with(handle),
+             Field::Int(i) => i.into_value_with(handle),
+             Field::Long(l) => l.into_value_with(handle),
+             Field::UByte(ub) => ub.into_value_with(handle),
+             Field::UShort(us) => us.into_value_with(handle),
+             Field::UInt(ui) => ui.into_value_with(handle),
+             Field::ULong(ul) => ul.into_value_with(handle),
+             Field::Float16(f) => f32::from(f).into_value_with(handle),
+             Field::Float(f) => f.into_value_with(handle),
+             Field::Double(d) => d.into_value_with(handle),
+
+             Field::Str(s) => s.into_value_with(handle),
+             Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+             Field::Date(d) => d.into_value_with(handle),
+             Field::TimestampMillis(ts) => ts.into_value_with(handle),
+             Field::TimestampMicros(ts) => ts.into_value_with(handle),
+             Field::ListInternal(list) => {
+                 let ary = handle.ary_new_capa(list.elements().len());
+                 list.elements()
+                     .iter()
+                     .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
+                     .unwrap();
+                 ary.into_value_with(handle)
+             }
+             Field::MapInternal(map) => {
+                 let hash = handle.hash_new_capa(map.entries().len());
+                 map.entries()
+                     .iter()
+                     .try_for_each(|(k, v)| {
+                         hash.aset(
+                             ParquetField(k.clone()).into_value_with(handle),
+                             ParquetField(v.clone()).into_value_with(handle),
+                         )
+                     })
+                     .unwrap();
+                 hash.into_value_with(handle)
+             }
+             // Field::Decimal(d) => d.to_string().into_value_with(handle),
+             // Field::Group(row) => row.into_value_with(handle),
+             Field::Null => handle.qnil().as_value(),
+             _ => panic!("Unsupported field type"),
+         }
+     }
+ }
+
+ struct SeekableRubyValue(Opaque<Value>);
+
+ impl Length for SeekableRubyValue {
+     fn len(&self) -> u64 {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
+         let file_len = reader.seek(SeekFrom::End(0)).unwrap();
+         reader.seek(SeekFrom::Start(current_pos)).unwrap();
+         file_len
+     }
+ }
+
+ impl ChunkReader for SeekableRubyValue {
+     type T = BufReader<Box<dyn SeekableRead>>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         reader.seek(SeekFrom::Start(start))?;
+         Ok(BufReader::new(reader))
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let mut buffer = Vec::with_capacity(length);
+         let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+         reader.seek(SeekFrom::Start(start))?;
+         let read = reader.take(length as _).read_to_end(&mut buffer)?;
+
+         if read != length {
+             return Err(ParquetError::EOF(format!(
+                 "Expected to read {} bytes, read only {}",
+                 length, read
+             )));
+         }
+         Ok(buffer.into())
+     }
+ }
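
`ForgottenFileHandle` and `SeekableRubyValue` both plug into parquet-rs through the same small surface: `Length::len` plus `ChunkReader::{get_read, get_bytes}`. As a point of reference, the same contract can be satisfied by a plain in-memory buffer; a minimal sketch under the same parquet 54 trait signatures (the `InMemoryChunks` type is illustrative, not part of the gem):

```rust
use bytes::Bytes;
use parquet::errors::ParquetError;
use parquet::file::reader::{ChunkReader, Length};
use std::io::Cursor;

// Hypothetical reader over an owned buffer, mirroring what the
// implementations above do against a file descriptor or a Ruby IO.
struct InMemoryChunks(Bytes);

impl Length for InMemoryChunks {
    fn len(&self) -> u64 {
        self.0.len() as u64
    }
}

impl ChunkReader for InMemoryChunks {
    // Cursor<Bytes> implements Read, which is all parquet-rs needs
    // from the value returned by get_read.
    type T = Cursor<Bytes>;

    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
        let mut cursor = Cursor::new(self.0.clone());
        cursor.set_position(start);
        Ok(cursor)
    }

    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
        // get_bytes must return exactly `length` bytes; a short read is
        // reported as EOF, just as the Ruby-backed reader does above.
        let start = start as usize;
        let available = self.0.len().saturating_sub(start);
        if available < length {
            return Err(ParquetError::EOF(format!(
                "Expected to read {} bytes, read only {}",
                length, available
            )));
        }
        Ok(self.0.slice(start..start + length))
    }
}
```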