parquet 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,231 @@
1
+ use magnus::{
2
+ value::{Opaque, ReprValue},
3
+ RClass, RString, Ruby, Value,
4
+ };
5
+ use std::io::{self, Read, Seek};
6
+ use std::sync::OnceLock;
7
+
8
+ static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
+
10
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
11
+
12
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
/// and provide a standard Read implementation for them.
///
/// `T` is the Ruby handle being read from. Two flavours are used in this file:
/// `RubyReader<RString>` serves bytes directly out of a Ruby string, and
/// `RubyReader<Value>` pulls chunks from an IO-like object through its `read` method.
#[derive(Debug)]
pub struct RubyReader<T> {
    /// The Ruby object that supplies the bytes.
    inner: T,
    /// Scratch space holding the most recent chunk fetched from an IO-like object.
    /// `None` for string-backed readers, which read straight from the Ruby string.
    buffer: Option<Vec<u8>>,
    /// Read cursor. For string-backed readers this is the absolute position in the
    /// string; for IO-backed readers it is the position inside `buffer`.
    offset: usize,
    // Number of bytes that have been read into the buffer
    // Used as an upper bound for offset
    buffered_bytes: usize,
}
22
+
23
+ pub trait SeekableRead: std::io::Read + Seek {}
24
+ impl SeekableRead for RubyReader<Value> {}
25
+ impl SeekableRead for RubyReader<RString> {}
26
+
27
+ pub fn build_ruby_reader<'a>(
28
+ ruby: &'a Ruby,
29
+ input: Value,
30
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
+ if RubyReader::is_string_io(ruby, &input) {
32
+ RubyReader::from_string_io(ruby, input)
33
+ } else if RubyReader::is_io_like(&input) {
34
+ RubyReader::from_io(input)
35
+ } else {
36
+ RubyReader::from_string_like(input)
37
+ }
38
+ }
39
+
40
+ impl Seek for RubyReader<Value> {
41
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
+ let seek_to = match pos {
43
+ io::SeekFrom::Start(offset) => {
44
+ // SEEK_SET - absolute position
45
+ offset as i64
46
+ }
47
+ io::SeekFrom::End(offset) => {
48
+ // SEEK_END - from end of stream
49
+ offset
50
+ }
51
+ io::SeekFrom::Current(offset) => {
52
+ // SEEK_CUR - relative to current
53
+ offset
54
+ }
55
+ };
56
+
57
+ let whence = match pos {
58
+ io::SeekFrom::Start(_) => 0, // SEEK_SET
59
+ io::SeekFrom::End(_) => 2, // SEEK_END
60
+ io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
+ };
62
+
63
+ // Call Ruby's seek method
64
+ let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
65
+
66
+ // Get current position
67
+ let pos: u64 = self.inner.funcall("pos", ()).unwrap();
68
+
69
+ Ok(pos)
70
+ }
71
+ }
72
+
73
+ impl Seek for RubyReader<RString> {
74
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
+ match pos {
76
+ io::SeekFrom::Start(offset) => {
77
+ self.offset = offset as usize;
78
+ }
79
+ io::SeekFrom::End(offset) => {
80
+ self.offset = (self.inner.len() - offset as usize) as usize;
81
+ }
82
+ io::SeekFrom::Current(offset) => {
83
+ self.offset = (self.offset as i64 + offset) as usize;
84
+ }
85
+ }
86
+ Ok(self.offset as u64)
87
+ }
88
+ }
89
+
90
+ impl RubyReader<Value> {
91
+ fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
92
+ if Self::is_io_like(&input) {
93
+ Ok(Box::new(Self::from_io_like(input)))
94
+ } else {
95
+ Err(magnus::Error::new(
96
+ magnus::exception::type_error(),
97
+ "Input is not an IO-like object",
98
+ ))
99
+ }
100
+ }
101
+
102
+ fn is_io_like(input: &Value) -> bool {
103
+ input.respond_to("read", false).unwrap_or(false)
104
+ }
105
+
106
+ fn from_io_like(input: Value) -> Self {
107
+ Self {
108
+ inner: input,
109
+ buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
+ offset: 0,
111
+ buffered_bytes: 0,
112
+ }
113
+ }
114
+
115
+ fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
+ if let Some(from_buf) = &self.buffer {
117
+ // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
+ if self.offset < self.buffered_bytes {
119
+ let remaining = self.buffered_bytes - self.offset;
120
+ let copy_size = remaining.min(to_buf.len());
121
+ to_buf[..copy_size]
122
+ .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
+ self.offset += copy_size;
124
+ Some(Ok(copy_size))
125
+ } else {
126
+ None
127
+ }
128
+ } else {
129
+ None
130
+ }
131
+ }
132
+
133
+ fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
+ let buffer = self.buffer.as_mut().unwrap();
135
+ let result = self
136
+ .inner
137
+ .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
+
140
+ if result.is_nil() {
141
+ return Ok(0); // EOF
142
+ }
143
+
144
+ let bytes = unsafe { result.as_slice() };
145
+
146
+ // Update internal buffer
147
+ let bytes_len = bytes.len();
148
+ if bytes_len == 0 {
149
+ return Ok(0);
150
+ }
151
+
152
+ // Only copy what we actually read
153
+ buffer[..bytes_len].copy_from_slice(bytes);
154
+ self.buffered_bytes = bytes_len;
155
+
156
+ // Copy to output buffer
157
+ let copy_size = bytes_len.min(buf.len());
158
+ buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
+ self.offset = copy_size;
160
+ Ok(copy_size)
161
+ }
162
+ }
163
+
164
+ impl RubyReader<RString> {
165
+ pub fn from_string_io(
166
+ ruby: &Ruby,
167
+ input: Value,
168
+ ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
169
+ if !Self::is_string_io(ruby, &input) {
170
+ return Err(magnus::Error::new(
171
+ magnus::exception::type_error(),
172
+ "Input is not a StringIO",
173
+ ));
174
+ }
175
+
176
+ let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
+ Ok(Box::new(Self {
178
+ inner: string_content,
179
+ buffer: None,
180
+ offset: 0,
181
+ buffered_bytes: 0,
182
+ }))
183
+ }
184
+
185
+ fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
186
+ let string_io_class = STRING_IO_CLASS.get_or_init(|| {
187
+ let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
188
+ Opaque::from(class)
189
+ });
190
+ input.is_kind_of(ruby.get_inner(*string_io_class))
191
+ }
192
+
193
+ fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
194
+ // Try calling `to_str`, and if that fails, try `to_s`
195
+ let string_content = input
196
+ .funcall::<_, _, RString>("to_str", ())
197
+ .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
+ Ok(Box::new(Self {
199
+ inner: string_content,
200
+ buffer: None,
201
+ offset: 0,
202
+ buffered_bytes: 0,
203
+ }))
204
+ }
205
+ }
206
+
207
+ impl Read for RubyReader<Value> {
208
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
+ if let Some(result) = self.read_from_buffer(buf) {
210
+ result
211
+ } else {
212
+ // If the buffer is empty, read from Ruby
213
+ self.read_from_ruby(buf)
214
+ }
215
+ }
216
+ }
217
+
218
+ impl Read for RubyReader<RString> {
219
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
220
+ let string_buffer = unsafe { self.inner.as_slice() };
221
+ if self.offset >= string_buffer.len() {
222
+ return Ok(0); // EOF
223
+ }
224
+
225
+ let remaining = string_buffer.len() - self.offset;
226
+ let copy_size = remaining.min(buf.len());
227
+ buf[..copy_size].copy_from_slice(&string_buffer[self.offset..self.offset + copy_size]);
228
+ self.offset += copy_size;
229
+ Ok(copy_size)
230
+ }
231
+ }
@@ -0,0 +1,70 @@
1
+ use magnus::{
2
+ scan_args::{get_kwargs, scan_args},
3
+ value::ReprValue,
4
+ Error, RString, Ruby, Symbol, Value,
5
+ };
6
+
7
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
+ if value.is_nil() {
9
+ Ok(None)
10
+ } else if value.is_kind_of(ruby.class_string()) {
11
+ RString::from_value(value)
12
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
13
+ .to_string()
14
+ .map(|s| Some(s))
15
+ } else if value.is_kind_of(ruby.class_symbol()) {
16
+ Symbol::from_value(value)
17
+ .ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
18
+ .funcall("to_s", ())
19
+ .map(|s| Some(s))
20
+ } else {
21
+ Err(Error::new(
22
+ magnus::exception::type_error(),
23
+ "Value must be a String or Symbol",
24
+ ))
25
+ }
26
+ }
27
+
28
+ #[derive(Debug)]
29
+ pub struct ParquetArgs {
30
+ pub to_read: Value,
31
+ pub result_type: String,
32
+ }
33
+
34
+ /// Parse common arguments for CSV parsing
35
+ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
36
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
37
+ let (to_read,) = parsed_args.required;
38
+
39
+ let kwargs =
40
+ get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
41
+
42
+ let result_type = match kwargs
43
+ .optional
44
+ .0
45
+ .map(|value| parse_string_or_symbol(ruby, value))
46
+ {
47
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
48
+ "hash" | "array" => parsed,
49
+ _ => {
50
+ return Err(Error::new(
51
+ magnus::exception::runtime_error(),
52
+ "result_type must be either 'hash' or 'array'",
53
+ ))
54
+ }
55
+ },
56
+ Some(Ok(None)) => String::from("hash"),
57
+ Some(Err(_)) => {
58
+ return Err(Error::new(
59
+ magnus::exception::type_error(),
60
+ "result_type must be a String or Symbol",
61
+ ))
62
+ }
63
+ None => String::from("hash"),
64
+ };
65
+
66
+ Ok(ParquetArgs {
67
+ to_read,
68
+ result_type,
69
+ })
70
+ }
@@ -0,0 +1,3 @@
1
# Top-level namespace of the parquet gem.
module Parquet
  # Version of the gem.
  VERSION = "0.0.1"
end
data/lib/parquet.rb ADDED
@@ -0,0 +1,5 @@
1
# Entry point of the gem: loads the version constant, then the `parquet/parquet`
# file that sits next to it.
require_relative "parquet/version"
require_relative "parquet/parquet"

# Top-level namespace of the gem; nothing is defined on it in this file.
module Parquet
end
data/lib/parquet.rbi ADDED
@@ -0,0 +1,17 @@
1
# typed: strict

module Parquet
  # Options:
  #   - `input`: String specifying the input file, or an IO-like object
  #              (`IO` or `StringIO`) to read the Parquet data from
  #   - `result_type`: String specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  sig do
    params(
      input: T.any(String, IO, StringIO),
      result_type: T.nilable(T.any(String, Symbol)),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(T.any(Enumerator, T.untyped))
  end
  def self.each_row(input, result_type: nil, &blk)
  end
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parquet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nathan Jaremko
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2025-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.9.39
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.9.39
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.0
41
+ description: |2
42
+ Parquet is a high-performance Parquet library for Ruby, written in Rust.
43
+ It wraps the official Apache Rust implementation to provide fast, correct Parquet parsing.
44
+ email:
45
+ - nathan@jaremko.ca
46
+ executables: []
47
+ extensions:
48
+ - ext/parquet/extconf.rb
49
+ extra_rdoc_files: []
50
+ files:
51
+ - Cargo.lock
52
+ - Cargo.toml
53
+ - Gemfile
54
+ - LICENSE
55
+ - README.md
56
+ - Rakefile
57
+ - ext/parquet/Cargo.toml
58
+ - ext/parquet/extconf.rb
59
+ - ext/parquet/src/header_cache.rs
60
+ - ext/parquet/src/lib.rs
61
+ - ext/parquet/src/reader.rs
62
+ - ext/parquet/src/ruby_reader.rs
63
+ - ext/parquet/src/utils.rs
64
+ - lib/parquet.rb
65
+ - lib/parquet.rbi
66
+ - lib/parquet/version.rb
67
+ homepage: https://github.com/njaremko/parquet
68
+ licenses:
69
+ - MIT
70
+ metadata:
71
+ homepage_uri: https://github.com/njaremko/parquet
72
+ source_code_uri: https://github.com/njaremko/parquet-ruby
73
+ readme_uri: https://github.com/njaremko/parquet-ruby/blob/main/README.md
74
+ changelog_uri: https://github.com/njaremko/parquet-ruby/blob/main/CHANGELOG.md
75
+ documentation_uri: https://www.rubydoc.info/gems/parquet
76
+ funding_uri: https://github.com/sponsors/njaremko
77
+ post_install_message:
78
+ rdoc_options: []
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 3.1.0
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubygems_version: 3.4.19
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Parquet library for Ruby, written in Rust
96
+ test_files: []