parquet 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
@@ -0,0 +1,77 @@
|
|
1
|
+
use std::{
|
2
|
+
fs::File,
|
3
|
+
io::{BufReader, SeekFrom},
|
4
|
+
mem::ManuallyDrop,
|
5
|
+
};
|
6
|
+
|
7
|
+
use bytes::Bytes;
|
8
|
+
use magnus::{value::Opaque, Ruby, Value};
|
9
|
+
use parquet::{
|
10
|
+
errors::ParquetError,
|
11
|
+
file::reader::{ChunkReader, Length},
|
12
|
+
};
|
13
|
+
use std::io::Read;
|
14
|
+
|
15
|
+
use crate::ruby_reader::{build_ruby_reader, SeekableRead};
|
16
|
+
|
17
|
+
/// Capacity, in bytes, of the `BufReader` handed out by
/// `SeekableRubyValue::get_read` (16 KiB).
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
18
|
+
|
19
|
+
/// Newtype around an opaque handle to a Ruby value, used as a parquet
/// `ChunkReader` source.
///
/// The trait impls in this file resolve the handle with `Ruby::get_inner` and
/// build a new reader via `build_ruby_reader` on every call.
pub struct SeekableRubyValue(pub Opaque<Value>);
|
20
|
+
|
21
|
+
impl Length for SeekableRubyValue {
|
22
|
+
fn len(&self) -> u64 {
|
23
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
24
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
25
|
+
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
26
|
+
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
27
|
+
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
28
|
+
file_len
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
impl ChunkReader for SeekableRubyValue {
|
33
|
+
type T = BufReader<Box<dyn SeekableRead>>;
|
34
|
+
|
35
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
36
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
37
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
38
|
+
reader.seek(SeekFrom::Start(start))?;
|
39
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
40
|
+
}
|
41
|
+
|
42
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
43
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
44
|
+
let mut buffer = Vec::with_capacity(length);
|
45
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
46
|
+
reader.seek(SeekFrom::Start(start))?;
|
47
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
48
|
+
|
49
|
+
if read != length {
|
50
|
+
return Err(ParquetError::EOF(format!(
|
51
|
+
"Expected to read {} bytes, read only {}",
|
52
|
+
length, read
|
53
|
+
)));
|
54
|
+
}
|
55
|
+
Ok(buffer.into())
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
/// A `File` wrapped in `ManuallyDrop`, so dropping this wrapper does not run
/// the `File` destructor and therefore does not close the handle.
///
/// NOTE(review): presumably the descriptor is owned elsewhere (e.g. by a Ruby
/// IO object) — confirm at the construction site.
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
60
|
+
|
61
|
+
impl Length for ForgottenFileHandle {
|
62
|
+
fn len(&self) -> u64 {
|
63
|
+
self.0.len()
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl ChunkReader for ForgottenFileHandle {
|
68
|
+
type T = BufReader<File>;
|
69
|
+
|
70
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
71
|
+
self.0.get_read(start)
|
72
|
+
}
|
73
|
+
|
74
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
75
|
+
self.0.get_bytes(start, length)
|
76
|
+
}
|
77
|
+
}
|
@@ -2,30 +2,24 @@ use magnus::{
|
|
2
2
|
value::{Opaque, ReprValue},
|
3
3
|
RClass, RString, Ruby, Value,
|
4
4
|
};
|
5
|
-
use std::io::{self, Read, Seek};
|
5
|
+
use std::io::{self, Read, Seek, SeekFrom, Write};
|
6
6
|
use std::sync::OnceLock;
|
7
7
|
|
8
8
|
/// Process-wide, set-once slot holding an opaque handle to a Ruby class.
///
/// NOTE(review): presumably caches the `StringIO` class for `is_string_io`;
/// the initialisation site is outside this view — confirm.
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
9
|
|
10
|
-
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
11
|
-
|
12
10
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
13
11
|
/// and provide a standard Read implementation for them.
|
14
12
|
pub struct RubyReader<T> {
|
15
13
|
inner: T,
|
16
|
-
buffer: Option<Vec<u8>>,
|
17
14
|
offset: usize,
|
18
|
-
// Number of bytes that have been read into the buffer
|
19
|
-
// Used as an upper bound for offset
|
20
|
-
buffered_bytes: usize,
|
21
15
|
}
|
22
16
|
|
23
17
|
/// Marker trait combining `Read` and `Seek`, so both capabilities can be
/// named through a single trait object (`Box<dyn SeekableRead>`).
pub trait SeekableRead: std::io::Read + Seek {}
|
24
18
|
impl SeekableRead for RubyReader<Value> {}
|
25
19
|
impl SeekableRead for RubyReader<RString> {}
|
26
20
|
|
27
|
-
pub fn build_ruby_reader
|
28
|
-
ruby: &
|
21
|
+
pub fn build_ruby_reader(
|
22
|
+
ruby: &Ruby,
|
29
23
|
input: Value,
|
30
24
|
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
31
25
|
if RubyReader::is_string_io(ruby, &input) {
|
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
|
|
39
33
|
|
40
34
|
impl Seek for RubyReader<Value> {
|
41
35
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
42
|
-
let
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
}
|
47
|
-
io::SeekFrom::End(offset) => {
|
48
|
-
// SEEK_END - from end of stream
|
49
|
-
offset
|
50
|
-
}
|
51
|
-
io::SeekFrom::Current(offset) => {
|
52
|
-
// SEEK_CUR - relative to current
|
53
|
-
offset
|
54
|
-
}
|
36
|
+
let (whence, offset) = match pos {
|
37
|
+
SeekFrom::Start(i) => (0, i as i64),
|
38
|
+
SeekFrom::Current(i) => (1, i),
|
39
|
+
SeekFrom::End(i) => (2, i),
|
55
40
|
};
|
56
41
|
|
57
|
-
let
|
58
|
-
|
59
|
-
|
60
|
-
io::
|
61
|
-
|
42
|
+
let new_position = self
|
43
|
+
.inner
|
44
|
+
.funcall("seek", (offset, whence))
|
45
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
46
|
+
|
47
|
+
Ok(new_position)
|
48
|
+
}
|
49
|
+
}
|
62
50
|
|
63
|
-
|
64
|
-
|
51
|
+
impl Write for RubyReader<Value> {
|
52
|
+
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
53
|
+
let ruby_bytes = RString::from_slice(buf);
|
65
54
|
|
66
|
-
|
67
|
-
|
55
|
+
let bytes_written = self
|
56
|
+
.inner
|
57
|
+
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
58
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
68
59
|
|
69
|
-
Ok(
|
60
|
+
Ok(bytes_written)
|
61
|
+
}
|
62
|
+
|
63
|
+
fn flush(&mut self) -> Result<(), io::Error> {
|
64
|
+
self.inner
|
65
|
+
.funcall::<_, _, Value>("flush", ())
|
66
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
67
|
+
|
68
|
+
Ok(())
|
70
69
|
}
|
71
70
|
}
|
72
71
|
|
73
72
|
impl Seek for RubyReader<RString> {
|
74
73
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
75
74
|
match pos {
|
76
|
-
io::SeekFrom::Start(offset) =>
|
77
|
-
|
78
|
-
|
79
|
-
io::SeekFrom::End(offset) => {
|
80
|
-
self.offset = (self.inner.len() - offset as usize) as usize;
|
81
|
-
}
|
82
|
-
io::SeekFrom::Current(offset) => {
|
83
|
-
self.offset = (self.offset as i64 + offset) as usize;
|
84
|
-
}
|
75
|
+
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
76
|
+
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
77
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
85
78
|
}
|
86
79
|
Ok(self.offset as u64)
|
87
80
|
}
|
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
|
|
106
99
|
fn from_io_like(input: Value) -> Self {
|
107
100
|
Self {
|
108
101
|
inner: input,
|
109
|
-
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
110
102
|
offset: 0,
|
111
|
-
buffered_bytes: 0,
|
112
103
|
}
|
113
104
|
}
|
114
|
-
|
115
|
-
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
116
|
-
if let Some(from_buf) = &self.buffer {
|
117
|
-
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
118
|
-
if self.offset < self.buffered_bytes {
|
119
|
-
let remaining = self.buffered_bytes - self.offset;
|
120
|
-
let copy_size = remaining.min(to_buf.len());
|
121
|
-
to_buf[..copy_size]
|
122
|
-
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
123
|
-
self.offset += copy_size;
|
124
|
-
Some(Ok(copy_size))
|
125
|
-
} else {
|
126
|
-
None
|
127
|
-
}
|
128
|
-
} else {
|
129
|
-
None
|
130
|
-
}
|
131
|
-
}
|
132
|
-
|
133
|
-
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
134
|
-
let buffer = self.buffer.as_mut().unwrap();
|
135
|
-
let result = self
|
136
|
-
.inner
|
137
|
-
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
138
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
139
|
-
|
140
|
-
if result.is_nil() {
|
141
|
-
return Ok(0); // EOF
|
142
|
-
}
|
143
|
-
|
144
|
-
let bytes = unsafe { result.as_slice() };
|
145
|
-
|
146
|
-
// Update internal buffer
|
147
|
-
let bytes_len = bytes.len();
|
148
|
-
if bytes_len == 0 {
|
149
|
-
return Ok(0);
|
150
|
-
}
|
151
|
-
|
152
|
-
// Only copy what we actually read
|
153
|
-
buffer[..bytes_len].copy_from_slice(bytes);
|
154
|
-
self.buffered_bytes = bytes_len;
|
155
|
-
|
156
|
-
// Copy to output buffer
|
157
|
-
let copy_size = bytes_len.min(buf.len());
|
158
|
-
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
159
|
-
self.offset = copy_size;
|
160
|
-
Ok(copy_size)
|
161
|
-
}
|
162
105
|
}
|
163
106
|
|
164
107
|
impl RubyReader<RString> {
|
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
|
|
176
119
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
177
120
|
Ok(Box::new(Self {
|
178
121
|
inner: string_content,
|
179
|
-
buffer: None,
|
180
122
|
offset: 0,
|
181
|
-
buffered_bytes: 0,
|
182
123
|
}))
|
183
124
|
}
|
184
125
|
|
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
|
|
197
138
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
198
139
|
Ok(Box::new(Self {
|
199
140
|
inner: string_content,
|
200
|
-
buffer: None,
|
201
141
|
offset: 0,
|
202
|
-
buffered_bytes: 0,
|
203
142
|
}))
|
204
143
|
}
|
205
144
|
}
|
206
145
|
|
207
146
|
impl Read for RubyReader<Value> {
|
208
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
}
|
147
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
148
|
+
let bytes = self
|
149
|
+
.inner
|
150
|
+
.funcall::<_, _, RString>("read", (buf.len(),))
|
151
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
152
|
+
|
153
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
154
|
+
|
155
|
+
Ok(bytes.len())
|
215
156
|
}
|
216
157
|
}
|
217
158
|
|