parquet 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,231 @@
1
+ use magnus::{
2
+ value::{Opaque, ReprValue},
3
+ RClass, RString, Ruby, Value,
4
+ };
5
+ use std::io::{self, Read, Seek};
6
+ use std::sync::OnceLock;
7
+
8
+ static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
+
10
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
11
+
12
/// Adapter that exposes a Ruby input (String, StringIO, or IO-like object)
/// through Rust's standard `Read` interface.
pub struct RubyReader<T> {
    /// The wrapped Ruby object that bytes are pulled from.
    inner: T,
    /// Staging buffer for chunks fetched from IO-like objects.
    /// String-backed readers are constructed with `None`.
    buffer: Option<Vec<u8>>,
    /// Read cursor: an index into `buffer` for IO-like readers, or into the
    /// string's bytes for string-backed readers.
    offset: usize,
    /// Number of valid bytes currently held in `buffer`.
    /// Acts as the upper bound for `offset` while serving reads from the buffer.
    buffered_bytes: usize,
}
22
+
23
+ pub trait SeekableRead: std::io::Read + Seek {}
24
+ impl SeekableRead for RubyReader<Value> {}
25
+ impl SeekableRead for RubyReader<RString> {}
26
+
27
+ pub fn build_ruby_reader<'a>(
28
+ ruby: &'a Ruby,
29
+ input: Value,
30
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
+ if RubyReader::is_string_io(ruby, &input) {
32
+ RubyReader::from_string_io(ruby, input)
33
+ } else if RubyReader::is_io_like(&input) {
34
+ RubyReader::from_io(input)
35
+ } else {
36
+ RubyReader::from_string_like(input)
37
+ }
38
+ }
39
+
40
+ impl Seek for RubyReader<Value> {
41
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
+ let seek_to = match pos {
43
+ io::SeekFrom::Start(offset) => {
44
+ // SEEK_SET - absolute position
45
+ offset as i64
46
+ }
47
+ io::SeekFrom::End(offset) => {
48
+ // SEEK_END - from end of stream
49
+ offset
50
+ }
51
+ io::SeekFrom::Current(offset) => {
52
+ // SEEK_CUR - relative to current
53
+ offset
54
+ }
55
+ };
56
+
57
+ let whence = match pos {
58
+ io::SeekFrom::Start(_) => 0, // SEEK_SET
59
+ io::SeekFrom::End(_) => 2, // SEEK_END
60
+ io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
+ };
62
+
63
+ // Call Ruby's seek method
64
+ let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
65
+
66
+ // Get current position
67
+ let pos: u64 = self.inner.funcall("pos", ()).unwrap();
68
+
69
+ Ok(pos)
70
+ }
71
+ }
72
+
73
+ impl Seek for RubyReader<RString> {
74
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
+ match pos {
76
+ io::SeekFrom::Start(offset) => {
77
+ self.offset = offset as usize;
78
+ }
79
+ io::SeekFrom::End(offset) => {
80
+ self.offset = (self.inner.len() - offset as usize) as usize;
81
+ }
82
+ io::SeekFrom::Current(offset) => {
83
+ self.offset = (self.offset as i64 + offset) as usize;
84
+ }
85
+ }
86
+ Ok(self.offset as u64)
87
+ }
88
+ }
89
+
90
+ impl RubyReader<Value> {
91
+ fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
92
+ if Self::is_io_like(&input) {
93
+ Ok(Box::new(Self::from_io_like(input)))
94
+ } else {
95
+ Err(magnus::Error::new(
96
+ magnus::exception::type_error(),
97
+ "Input is not an IO-like object",
98
+ ))
99
+ }
100
+ }
101
+
102
+ fn is_io_like(input: &Value) -> bool {
103
+ input.respond_to("read", false).unwrap_or(false)
104
+ }
105
+
106
+ fn from_io_like(input: Value) -> Self {
107
+ Self {
108
+ inner: input,
109
+ buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
+ offset: 0,
111
+ buffered_bytes: 0,
112
+ }
113
+ }
114
+
115
+ fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
+ if let Some(from_buf) = &self.buffer {
117
+ // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
+ if self.offset < self.buffered_bytes {
119
+ let remaining = self.buffered_bytes - self.offset;
120
+ let copy_size = remaining.min(to_buf.len());
121
+ to_buf[..copy_size]
122
+ .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
+ self.offset += copy_size;
124
+ Some(Ok(copy_size))
125
+ } else {
126
+ None
127
+ }
128
+ } else {
129
+ None
130
+ }
131
+ }
132
+
133
+ fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
+ let buffer = self.buffer.as_mut().unwrap();
135
+ let result = self
136
+ .inner
137
+ .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
+
140
+ if result.is_nil() {
141
+ return Ok(0); // EOF
142
+ }
143
+
144
+ let bytes = unsafe { result.as_slice() };
145
+
146
+ // Update internal buffer
147
+ let bytes_len = bytes.len();
148
+ if bytes_len == 0 {
149
+ return Ok(0);
150
+ }
151
+
152
+ // Only copy what we actually read
153
+ buffer[..bytes_len].copy_from_slice(bytes);
154
+ self.buffered_bytes = bytes_len;
155
+
156
+ // Copy to output buffer
157
+ let copy_size = bytes_len.min(buf.len());
158
+ buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
+ self.offset = copy_size;
160
+ Ok(copy_size)
161
+ }
162
+ }
163
+
164
+ impl RubyReader<RString> {
165
+ pub fn from_string_io(
166
+ ruby: &Ruby,
167
+ input: Value,
168
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
169
+ if !Self::is_string_io(ruby, &input) {
170
+ return Err(magnus::Error::new(
171
+ magnus::exception::type_error(),
172
+ "Input is not a StringIO",
173
+ ));
174
+ }
175
+
176
+ let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
+ Ok(Box::new(Self {
178
+ inner: string_content,
179
+ buffer: None,
180
+ offset: 0,
181
+ buffered_bytes: 0,
182
+ }))
183
+ }
184
+
185
+ fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
186
+ let string_io_class = STRING_IO_CLASS.get_or_init(|| {
187
+ let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
188
+ Opaque::from(class)
189
+ });
190
+ input.is_kind_of(ruby.get_inner(*string_io_class))
191
+ }
192
+
193
+ fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
194
+ // Try calling `to_str`, and if that fails, try `to_s`
195
+ let string_content = input
196
+ .funcall::<_, _, RString>("to_str", ())
197
+ .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
+ Ok(Box::new(Self {
199
+ inner: string_content,
200
+ buffer: None,
201
+ offset: 0,
202
+ buffered_bytes: 0,
203
+ }))
204
+ }
205
+ }
206
+
207
+ impl Read for RubyReader<Value> {
208
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
+ if let Some(result) = self.read_from_buffer(buf) {
210
+ result
211
+ } else {
212
+ // If the buffer is empty, read from Ruby
213
+ self.read_from_ruby(buf)
214
+ }
215
+ }
216
+ }
217
+
218
+ impl Read for RubyReader<RString> {
219
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
220
+ let string_buffer = unsafe { self.inner.as_slice() };
221
+ if self.offset >= string_buffer.len() {
222
+ return Ok(0); // EOF
223
+ }
224
+
225
+ let remaining = string_buffer.len() - self.offset;
226
+ let copy_size = remaining.min(buf.len());
227
+ buf[..copy_size].copy_from_slice(&string_buffer[self.offset..self.offset + copy_size]);
228
+ self.offset += copy_size;
229
+ Ok(copy_size)
230
+ }
231
+ }
@@ -0,0 +1,70 @@
1
+ use magnus::{
2
+ scan_args::{get_kwargs, scan_args},
3
+ value::ReprValue,
4
+ Error, RString, Ruby, Symbol, Value,
5
+ };
6
+
7
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
+ if value.is_nil() {
9
+ Ok(None)
10
+ } else if value.is_kind_of(ruby.class_string()) {
11
+ RString::from_value(value)
12
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
13
+ .to_string()
14
+ .map(|s| Some(s))
15
+ } else if value.is_kind_of(ruby.class_symbol()) {
16
+ Symbol::from_value(value)
17
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
18
+ .funcall("to_s", ())
19
+ .map(|s| Some(s))
20
+ } else {
21
+ Err(Error::new(
22
+ magnus::exception::type_error(),
23
+ "Value must be a String or Symbol",
24
+ ))
25
+ }
26
+ }
27
+
28
+ #[derive(Debug)]
29
+ pub struct ParquetArgs {
30
+ pub to_read: Value,
31
+ pub result_type: String,
32
+ }
33
+
34
+ /// Parse common arguments for CSV parsing
35
+ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
36
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
37
+ let (to_read,) = parsed_args.required;
38
+
39
+ let kwargs =
40
+ get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
41
+
42
+ let result_type = match kwargs
43
+ .optional
44
+ .0
45
+ .map(|value| parse_string_or_symbol(ruby, value))
46
+ {
47
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
48
+ "hash" | "array" => parsed,
49
+ _ => {
50
+ return Err(Error::new(
51
+ magnus::exception::runtime_error(),
52
+ "result_type must be either 'hash' or 'array'",
53
+ ))
54
+ }
55
+ },
56
+ Some(Ok(None)) => String::from("hash"),
57
+ Some(Err(_)) => {
58
+ return Err(Error::new(
59
+ magnus::exception::type_error(),
60
+ "result_type must be a String or Symbol",
61
+ ))
62
+ }
63
+ None => String::from("hash"),
64
+ };
65
+
66
+ Ok(ParquetArgs {
67
+ to_read,
68
+ result_type,
69
+ })
70
+ }
@@ -0,0 +1,3 @@
1
# Top-level namespace of the parquet gem.
module Parquet
  # Version of the gem.
  VERSION = "0.0.1"
end
data/lib/parquet.rb ADDED
@@ -0,0 +1,5 @@
1
+ require_relative "parquet/version"
2
+ require_relative "parquet/parquet"
3
+
4
+ module Parquet
5
+ end
data/lib/parquet.rbi ADDED
@@ -0,0 +1,17 @@
1
+ # typed: strict
2
+
3
module Parquet
  # Options:
  #   - `input`: String specifying the input file, or an IO object to read from
  #   - `result_type`: String or Symbol specifying the output format
  #                    ("hash", "array", :hash or :array)
  sig {
    params(
      input: T.any(String, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(T.any(Enumerator, T.untyped))
  }
  def self.each_row(input, result_type: nil, &blk); end
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parquet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nathan Jaremko
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2025-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.39
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.39
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.0
41
+ description: |2
42
+ Parquet is a high-performance Parquet library for Ruby, written in Rust.
43
+ It wraps the official Apache Rust implementation to provide fast, correct Parquet parsing.
44
+ email:
45
+ - nathan@jaremko.ca
46
+ executables: []
47
+ extensions:
48
+ - ext/parquet/extconf.rb
49
+ extra_rdoc_files: []
50
+ files:
51
+ - Cargo.lock
52
+ - Cargo.toml
53
+ - Gemfile
54
+ - LICENSE
55
+ - README.md
56
+ - Rakefile
57
+ - ext/parquet/Cargo.toml
58
+ - ext/parquet/extconf.rb
59
+ - ext/parquet/src/header_cache.rs
60
+ - ext/parquet/src/lib.rs
61
+ - ext/parquet/src/reader.rs
62
+ - ext/parquet/src/ruby_reader.rs
63
+ - ext/parquet/src/utils.rs
64
+ - lib/parquet.rb
65
+ - lib/parquet.rbi
66
+ - lib/parquet/version.rb
67
+ homepage: https://github.com/njaremko/parquet
68
+ licenses:
69
+ - MIT
70
+ metadata:
71
+ homepage_uri: https://github.com/njaremko/parquet
72
+ source_code_uri: https://github.com/njaremko/parquet-ruby
73
+ readme_uri: https://github.com/njaremko/parquet-ruby/blob/main/README.md
74
+ changelog_uri: https://github.com/njaremko/parquet-ruby/blob/main/CHANGELOG.md
75
+ documentation_uri: https://www.rubydoc.info/gems/parquet
76
+ funding_uri: https://github.com/sponsors/njaremko
77
+ post_install_message:
78
+ rdoc_options: []
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 3.1.0
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.4.19
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Parquet library for Ruby, written in Rust
96
+ test_files: []