parquet 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1918 -0
- data/Cargo.toml +3 -0
- data/Gemfile +12 -0
- data/LICENSE +21 -0
- data/README.md +29 -0
- data/Rakefile +27 -0
- data/ext/parquet/Cargo.toml +18 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/header_cache.rs +81 -0
- data/ext/parquet/src/lib.rs +16 -0
- data/ext/parquet/src/reader.rs +337 -0
- data/ext/parquet/src/ruby_reader.rs +231 -0
- data/ext/parquet/src/utils.rs +70 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +5 -0
- data/lib/parquet.rbi +17 -0
- metadata +96 -0
@@ -0,0 +1,231 @@
|
|
1
|
+
use magnus::{
|
2
|
+
value::{Opaque, ReprValue},
|
3
|
+
RClass, RString, Ruby, Value,
|
4
|
+
};
|
5
|
+
use std::io::{self, Read, Seek};
|
6
|
+
use std::sync::OnceLock;
|
7
|
+
|
8
|
+
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
|
+
|
10
|
+
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
11
|
+
|
12
|
+
/// Adapter exposing Ruby inputs (String, StringIO, IO-like objects) through Rust's
/// standard `Read` and `Seek` traits.
pub struct RubyReader<T> {
    /// The wrapped Ruby object: an `RString` for in-memory input, or an arbitrary
    /// `Value` for IO-like input.
    inner: T,
    /// Scratch buffer holding the most recent chunk read from an IO-like object;
    /// `None` for string-backed readers.
    buffer: Option<Vec<u8>>,
    /// For string-backed readers: the current position within the string.
    /// For IO-backed readers: the next unread index inside `buffer`.
    offset: usize,
    /// Number of bytes that have been read into `buffer`; acts as the upper bound
    /// for `offset` on IO-backed readers.
    buffered_bytes: usize,
}
|
22
|
+
|
23
|
+
pub trait SeekableRead: std::io::Read + Seek {}
|
24
|
+
impl SeekableRead for RubyReader<Value> {}
|
25
|
+
impl SeekableRead for RubyReader<RString> {}
|
26
|
+
|
27
|
+
pub fn build_ruby_reader<'a>(
|
28
|
+
ruby: &'a Ruby,
|
29
|
+
input: Value,
|
30
|
+
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
31
|
+
if RubyReader::is_string_io(ruby, &input) {
|
32
|
+
RubyReader::from_string_io(ruby, input)
|
33
|
+
} else if RubyReader::is_io_like(&input) {
|
34
|
+
RubyReader::from_io(input)
|
35
|
+
} else {
|
36
|
+
RubyReader::from_string_like(input)
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
impl Seek for RubyReader<Value> {
|
41
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
42
|
+
let seek_to = match pos {
|
43
|
+
io::SeekFrom::Start(offset) => {
|
44
|
+
// SEEK_SET - absolute position
|
45
|
+
offset as i64
|
46
|
+
}
|
47
|
+
io::SeekFrom::End(offset) => {
|
48
|
+
// SEEK_END - from end of stream
|
49
|
+
offset
|
50
|
+
}
|
51
|
+
io::SeekFrom::Current(offset) => {
|
52
|
+
// SEEK_CUR - relative to current
|
53
|
+
offset
|
54
|
+
}
|
55
|
+
};
|
56
|
+
|
57
|
+
let whence = match pos {
|
58
|
+
io::SeekFrom::Start(_) => 0, // SEEK_SET
|
59
|
+
io::SeekFrom::End(_) => 2, // SEEK_END
|
60
|
+
io::SeekFrom::Current(_) => 1, // SEEK_CUR
|
61
|
+
};
|
62
|
+
|
63
|
+
// Call Ruby's seek method
|
64
|
+
let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
|
65
|
+
|
66
|
+
// Get current position
|
67
|
+
let pos: u64 = self.inner.funcall("pos", ()).unwrap();
|
68
|
+
|
69
|
+
Ok(pos)
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
impl Seek for RubyReader<RString> {
    /// Moves the read position within the wrapped Ruby string and returns the new
    /// absolute position.
    ///
    /// - `SeekFrom::Start(n)` positions at `n`.
    /// - `SeekFrom::End(n)` positions at `len + n` (so `n` is normally negative).
    /// - `SeekFrom::Current(n)` positions at `current + n`.
    ///
    /// Seeking past the end is allowed; subsequent reads then report EOF.
    ///
    /// # Errors
    /// Returns `InvalidInput` if the target position would be negative or does not
    /// fit in `usize`. The current position is left unchanged in that case.
    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
        // Do the arithmetic in i128 so neither u64 offsets nor signed deltas can
        // overflow before the range check below.
        let target: i128 = match pos {
            io::SeekFrom::Start(offset) => i128::from(offset),
            io::SeekFrom::End(offset) => self.inner.len() as i128 + i128::from(offset),
            io::SeekFrom::Current(offset) => self.offset as i128 + i128::from(offset),
        };

        if target < 0 || target > usize::MAX as i128 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                "invalid seek to a negative or overflowing position",
            ));
        }

        self.offset = target as usize;
        Ok(self.offset as u64)
    }
}
|
89
|
+
|
90
|
+
impl RubyReader<Value> {
|
91
|
+
fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
92
|
+
if Self::is_io_like(&input) {
|
93
|
+
Ok(Box::new(Self::from_io_like(input)))
|
94
|
+
} else {
|
95
|
+
Err(magnus::Error::new(
|
96
|
+
magnus::exception::type_error(),
|
97
|
+
"Input is not an IO-like object",
|
98
|
+
))
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
fn is_io_like(input: &Value) -> bool {
|
103
|
+
input.respond_to("read", false).unwrap_or(false)
|
104
|
+
}
|
105
|
+
|
106
|
+
fn from_io_like(input: Value) -> Self {
|
107
|
+
Self {
|
108
|
+
inner: input,
|
109
|
+
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
110
|
+
offset: 0,
|
111
|
+
buffered_bytes: 0,
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
116
|
+
if let Some(from_buf) = &self.buffer {
|
117
|
+
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
118
|
+
if self.offset < self.buffered_bytes {
|
119
|
+
let remaining = self.buffered_bytes - self.offset;
|
120
|
+
let copy_size = remaining.min(to_buf.len());
|
121
|
+
to_buf[..copy_size]
|
122
|
+
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
123
|
+
self.offset += copy_size;
|
124
|
+
Some(Ok(copy_size))
|
125
|
+
} else {
|
126
|
+
None
|
127
|
+
}
|
128
|
+
} else {
|
129
|
+
None
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
134
|
+
let buffer = self.buffer.as_mut().unwrap();
|
135
|
+
let result = self
|
136
|
+
.inner
|
137
|
+
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
138
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
139
|
+
|
140
|
+
if result.is_nil() {
|
141
|
+
return Ok(0); // EOF
|
142
|
+
}
|
143
|
+
|
144
|
+
let bytes = unsafe { result.as_slice() };
|
145
|
+
|
146
|
+
// Update internal buffer
|
147
|
+
let bytes_len = bytes.len();
|
148
|
+
if bytes_len == 0 {
|
149
|
+
return Ok(0);
|
150
|
+
}
|
151
|
+
|
152
|
+
// Only copy what we actually read
|
153
|
+
buffer[..bytes_len].copy_from_slice(bytes);
|
154
|
+
self.buffered_bytes = bytes_len;
|
155
|
+
|
156
|
+
// Copy to output buffer
|
157
|
+
let copy_size = bytes_len.min(buf.len());
|
158
|
+
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
159
|
+
self.offset = copy_size;
|
160
|
+
Ok(copy_size)
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
impl RubyReader<RString> {
|
165
|
+
pub fn from_string_io(
|
166
|
+
ruby: &Ruby,
|
167
|
+
input: Value,
|
168
|
+
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
169
|
+
if !Self::is_string_io(ruby, &input) {
|
170
|
+
return Err(magnus::Error::new(
|
171
|
+
magnus::exception::type_error(),
|
172
|
+
"Input is not a StringIO",
|
173
|
+
));
|
174
|
+
}
|
175
|
+
|
176
|
+
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
177
|
+
Ok(Box::new(Self {
|
178
|
+
inner: string_content,
|
179
|
+
buffer: None,
|
180
|
+
offset: 0,
|
181
|
+
buffered_bytes: 0,
|
182
|
+
}))
|
183
|
+
}
|
184
|
+
|
185
|
+
fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
|
186
|
+
let string_io_class = STRING_IO_CLASS.get_or_init(|| {
|
187
|
+
let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
|
188
|
+
Opaque::from(class)
|
189
|
+
});
|
190
|
+
input.is_kind_of(ruby.get_inner(*string_io_class))
|
191
|
+
}
|
192
|
+
|
193
|
+
fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
194
|
+
// Try calling `to_str`, and if that fails, try `to_s`
|
195
|
+
let string_content = input
|
196
|
+
.funcall::<_, _, RString>("to_str", ())
|
197
|
+
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
198
|
+
Ok(Box::new(Self {
|
199
|
+
inner: string_content,
|
200
|
+
buffer: None,
|
201
|
+
offset: 0,
|
202
|
+
buffered_bytes: 0,
|
203
|
+
}))
|
204
|
+
}
|
205
|
+
}
|
206
|
+
|
207
|
+
impl Read for RubyReader<Value> {
    /// Fills `buf` from the bytes already buffered, and only calls into Ruby for a
    /// fresh chunk when the buffer has nothing left to offer.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.read_from_buffer(buf) {
            Some(outcome) => outcome,
            // Buffer exhausted (or never filled): fetch the next chunk from Ruby.
            None => self.read_from_ruby(buf),
        }
    }
}
|
217
|
+
|
218
|
+
impl Read for RubyReader<RString> {
    /// Copies bytes from the wrapped Ruby string, starting at the current offset,
    /// into `buf` and advances the offset by the number of bytes copied.
    /// Returns `Ok(0)` once the offset is at or beyond the end of the string.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let bytes = unsafe { self.inner.as_slice() };

        // `get` yields `None` when the offset is past the end and an empty slice
        // when it is exactly at the end; both mean EOF.
        let unread = match bytes.get(self.offset..) {
            Some(rest) if !rest.is_empty() => rest,
            _ => return Ok(0), // EOF
        };

        let count = unread.len().min(buf.len());
        buf[..count].copy_from_slice(&unread[..count]);
        self.offset += count;
        Ok(count)
    }
}
|
@@ -0,0 +1,70 @@
|
|
1
|
+
use magnus::{
|
2
|
+
scan_args::{get_kwargs, scan_args},
|
3
|
+
value::ReprValue,
|
4
|
+
Error, RString, Ruby, Symbol, Value,
|
5
|
+
};
|
6
|
+
|
7
|
+
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
|
8
|
+
if value.is_nil() {
|
9
|
+
Ok(None)
|
10
|
+
} else if value.is_kind_of(ruby.class_string()) {
|
11
|
+
RString::from_value(value)
|
12
|
+
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
13
|
+
.to_string()
|
14
|
+
.map(|s| Some(s))
|
15
|
+
} else if value.is_kind_of(ruby.class_symbol()) {
|
16
|
+
Symbol::from_value(value)
|
17
|
+
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
18
|
+
.funcall("to_s", ())
|
19
|
+
.map(|s| Some(s))
|
20
|
+
} else {
|
21
|
+
Err(Error::new(
|
22
|
+
magnus::exception::type_error(),
|
23
|
+
"Value must be a String or Symbol",
|
24
|
+
))
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
#[derive(Debug)]
|
29
|
+
pub struct ParquetArgs {
|
30
|
+
pub to_read: Value,
|
31
|
+
pub result_type: String,
|
32
|
+
}
|
33
|
+
|
34
|
+
/// Parse common arguments for CSV parsing
|
35
|
+
pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
|
36
|
+
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
37
|
+
let (to_read,) = parsed_args.required;
|
38
|
+
|
39
|
+
let kwargs =
|
40
|
+
get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
|
41
|
+
|
42
|
+
let result_type = match kwargs
|
43
|
+
.optional
|
44
|
+
.0
|
45
|
+
.map(|value| parse_string_or_symbol(ruby, value))
|
46
|
+
{
|
47
|
+
Some(Ok(Some(parsed))) => match parsed.as_str() {
|
48
|
+
"hash" | "array" => parsed,
|
49
|
+
_ => {
|
50
|
+
return Err(Error::new(
|
51
|
+
magnus::exception::runtime_error(),
|
52
|
+
"result_type must be either 'hash' or 'array'",
|
53
|
+
))
|
54
|
+
}
|
55
|
+
},
|
56
|
+
Some(Ok(None)) => String::from("hash"),
|
57
|
+
Some(Err(_)) => {
|
58
|
+
return Err(Error::new(
|
59
|
+
magnus::exception::type_error(),
|
60
|
+
"result_type must be a String or Symbol",
|
61
|
+
))
|
62
|
+
}
|
63
|
+
None => String::from("hash"),
|
64
|
+
};
|
65
|
+
|
66
|
+
Ok(ParquetArgs {
|
67
|
+
to_read,
|
68
|
+
result_type,
|
69
|
+
})
|
70
|
+
}
|
data/lib/parquet.rb
ADDED
data/lib/parquet.rbi
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# typed: strict
|
2
|
+
|
3
|
+
module Parquet
  # Iterates over the rows of a Parquet source.
  #
  # Options:
  #   - `input`: the Parquet source: a String specifying the input file, or an
  #              IO / StringIO object to read from
  #   - `result_type`: String or Symbol specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  sig do
    params(
      input: T.any(String, IO, StringIO),
      result_type: T.nilable(T.any(String, Symbol)),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(T.any(Enumerator, T.untyped))
  end
  def self.each_row(input, result_type: nil, &blk)
  end
end
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: parquet
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nathan Jaremko
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2025-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rb_sys
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.39
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.39
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake-compiler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.0
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.0
|
41
|
+
description: |2
|
42
|
+
Parquet is a high-performance Parquet library for Ruby, written in Rust.
|
43
|
+
It wraps the official Apache Rust implementation to provide fast, correct Parquet parsing.
|
44
|
+
email:
|
45
|
+
- nathan@jaremko.ca
|
46
|
+
executables: []
|
47
|
+
extensions:
|
48
|
+
- ext/parquet/extconf.rb
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- Cargo.lock
|
52
|
+
- Cargo.toml
|
53
|
+
- Gemfile
|
54
|
+
- LICENSE
|
55
|
+
- README.md
|
56
|
+
- Rakefile
|
57
|
+
- ext/parquet/Cargo.toml
|
58
|
+
- ext/parquet/extconf.rb
|
59
|
+
- ext/parquet/src/header_cache.rs
|
60
|
+
- ext/parquet/src/lib.rs
|
61
|
+
- ext/parquet/src/reader.rs
|
62
|
+
- ext/parquet/src/ruby_reader.rs
|
63
|
+
- ext/parquet/src/utils.rs
|
64
|
+
- lib/parquet.rb
|
65
|
+
- lib/parquet.rbi
|
66
|
+
- lib/parquet/version.rb
|
67
|
+
homepage: https://github.com/njaremko/parquet
|
68
|
+
licenses:
|
69
|
+
- MIT
|
70
|
+
metadata:
|
71
|
+
homepage_uri: https://github.com/njaremko/parquet
|
72
|
+
source_code_uri: https://github.com/njaremko/parquet-ruby
|
73
|
+
readme_uri: https://github.com/njaremko/parquet-ruby/blob/main/README.md
|
74
|
+
changelog_uri: https://github.com/njaremko/parquet-ruby/blob/main/CHANGELOG.md
|
75
|
+
documentation_uri: https://www.rubydoc.info/gems/parquet
|
76
|
+
funding_uri: https://github.com/sponsors/njaremko
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options: []
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 3.1.0
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
requirements: []
|
92
|
+
rubygems_version: 3.4.19
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Parquet library for Ruby, written in Rust
|
96
|
+
test_files: []
|