parquet 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
@@ -0,0 +1,77 @@
|
|
1
|
+
use std::{
|
2
|
+
fs::File,
|
3
|
+
io::{BufReader, SeekFrom},
|
4
|
+
mem::ManuallyDrop,
|
5
|
+
};
|
6
|
+
|
7
|
+
use bytes::Bytes;
|
8
|
+
use magnus::{value::Opaque, Ruby, Value};
|
9
|
+
use parquet::{
|
10
|
+
errors::ParquetError,
|
11
|
+
file::reader::{ChunkReader, Length},
|
12
|
+
};
|
13
|
+
use std::io::Read;
|
14
|
+
|
15
|
+
use crate::ruby_reader::{build_ruby_reader, SeekableRead};
|
16
|
+
|
17
|
+
/// Capacity, in bytes, of the `BufReader` handed out by `SeekableRubyValue::get_read`.
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
18
|
+
|
19
|
+
/// Newtype around an opaque Ruby value so that it can act as a parquet `ChunkReader`.
///
/// The trait implementations for this type rebuild a reader from the wrapped value
/// through `build_ruby_reader` on every call.
pub struct SeekableRubyValue(pub Opaque<Value>);
|
20
|
+
|
21
|
+
impl Length for SeekableRubyValue {
|
22
|
+
fn len(&self) -> u64 {
|
23
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
24
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
25
|
+
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
26
|
+
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
27
|
+
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
28
|
+
file_len
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
impl ChunkReader for SeekableRubyValue {
|
33
|
+
type T = BufReader<Box<dyn SeekableRead>>;
|
34
|
+
|
35
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
36
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
37
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
38
|
+
reader.seek(SeekFrom::Start(start))?;
|
39
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
40
|
+
}
|
41
|
+
|
42
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
43
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
44
|
+
let mut buffer = Vec::with_capacity(length);
|
45
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
46
|
+
reader.seek(SeekFrom::Start(start))?;
|
47
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
48
|
+
|
49
|
+
if read != length {
|
50
|
+
return Err(ParquetError::EOF(format!(
|
51
|
+
"Expected to read {} bytes, read only {}",
|
52
|
+
length, read
|
53
|
+
)));
|
54
|
+
}
|
55
|
+
Ok(buffer.into())
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
/// Newtype around a `File` held in `ManuallyDrop`, so dropping this wrapper never runs
/// the `File` destructor (the underlying descriptor is not closed from here).
/// NOTE(review): presumably the descriptor stays owned by the Ruby side — confirm
/// against the code that constructs this type.
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
60
|
+
|
61
|
+
impl Length for ForgottenFileHandle {
|
62
|
+
fn len(&self) -> u64 {
|
63
|
+
self.0.len()
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl ChunkReader for ForgottenFileHandle {
|
68
|
+
type T = BufReader<File>;
|
69
|
+
|
70
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
71
|
+
self.0.get_read(start)
|
72
|
+
}
|
73
|
+
|
74
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
75
|
+
self.0.get_bytes(start, length)
|
76
|
+
}
|
77
|
+
}
|
@@ -2,30 +2,24 @@ use magnus::{
|
|
2
2
|
value::{Opaque, ReprValue},
|
3
3
|
RClass, RString, Ruby, Value,
|
4
4
|
};
|
5
|
-
use std::io::{self, Read, Seek};
|
5
|
+
use std::io::{self, Read, Seek, SeekFrom, Write};
|
6
6
|
use std::sync::OnceLock;
|
7
7
|
|
8
8
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
9
|
|
10
|
-
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
11
|
-
|
12
10
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
13
11
|
/// and provide a standard Read implementation for them.
|
14
12
|
pub struct RubyReader<T> {
|
15
13
|
inner: T,
|
16
|
-
buffer: Option<Vec<u8>>,
|
17
14
|
offset: usize,
|
18
|
-
// Number of bytes that have been read into the buffer
|
19
|
-
// Used as an upper bound for offset
|
20
|
-
buffered_bytes: usize,
|
21
15
|
}
|
22
16
|
|
23
17
|
/// Marker trait bundling `Read` and `Seek`, so the concrete readers can be returned
/// behind a single `Box<dyn SeekableRead>`.
pub trait SeekableRead: std::io::Read + Seek {}
|
24
18
|
// Reader whose `inner` is an IO-like Ruby value.
impl SeekableRead for RubyReader<Value> {}
|
25
19
|
// Reader whose `inner` is a Ruby string.
impl SeekableRead for RubyReader<RString> {}
|
26
20
|
|
27
|
-
pub fn build_ruby_reader
|
28
|
-
ruby: &
|
21
|
+
pub fn build_ruby_reader(
|
22
|
+
ruby: &Ruby,
|
29
23
|
input: Value,
|
30
24
|
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
31
25
|
if RubyReader::is_string_io(ruby, &input) {
|
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
|
|
39
33
|
|
40
34
|
impl Seek for RubyReader<Value> {
|
41
35
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
42
|
-
let
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
}
|
47
|
-
io::SeekFrom::End(offset) => {
|
48
|
-
// SEEK_END - from end of stream
|
49
|
-
offset
|
50
|
-
}
|
51
|
-
io::SeekFrom::Current(offset) => {
|
52
|
-
// SEEK_CUR - relative to current
|
53
|
-
offset
|
54
|
-
}
|
36
|
+
let (whence, offset) = match pos {
|
37
|
+
SeekFrom::Start(i) => (0, i as i64),
|
38
|
+
SeekFrom::Current(i) => (1, i),
|
39
|
+
SeekFrom::End(i) => (2, i),
|
55
40
|
};
|
56
41
|
|
57
|
-
let
|
58
|
-
|
59
|
-
|
60
|
-
io::
|
61
|
-
|
42
|
+
let new_position = self
|
43
|
+
.inner
|
44
|
+
.funcall("seek", (offset, whence))
|
45
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
46
|
+
|
47
|
+
Ok(new_position)
|
48
|
+
}
|
49
|
+
}
|
62
50
|
|
63
|
-
|
64
|
-
|
51
|
+
impl Write for RubyReader<Value> {
|
52
|
+
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
53
|
+
let ruby_bytes = RString::from_slice(buf);
|
65
54
|
|
66
|
-
|
67
|
-
|
55
|
+
let bytes_written = self
|
56
|
+
.inner
|
57
|
+
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
58
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
68
59
|
|
69
|
-
Ok(
|
60
|
+
Ok(bytes_written)
|
61
|
+
}
|
62
|
+
|
63
|
+
fn flush(&mut self) -> Result<(), io::Error> {
|
64
|
+
self.inner
|
65
|
+
.funcall::<_, _, Value>("flush", ())
|
66
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
67
|
+
|
68
|
+
Ok(())
|
70
69
|
}
|
71
70
|
}
|
72
71
|
|
73
72
|
impl Seek for RubyReader<RString> {
|
74
73
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
75
74
|
match pos {
|
76
|
-
io::SeekFrom::Start(offset) =>
|
77
|
-
|
78
|
-
|
79
|
-
io::SeekFrom::End(offset) => {
|
80
|
-
self.offset = (self.inner.len() - offset as usize) as usize;
|
81
|
-
}
|
82
|
-
io::SeekFrom::Current(offset) => {
|
83
|
-
self.offset = (self.offset as i64 + offset) as usize;
|
84
|
-
}
|
75
|
+
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
76
|
+
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
77
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
85
78
|
}
|
86
79
|
Ok(self.offset as u64)
|
87
80
|
}
|
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
|
|
106
99
|
fn from_io_like(input: Value) -> Self {
|
107
100
|
Self {
|
108
101
|
inner: input,
|
109
|
-
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
110
102
|
offset: 0,
|
111
|
-
buffered_bytes: 0,
|
112
103
|
}
|
113
104
|
}
|
114
|
-
|
115
|
-
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
116
|
-
if let Some(from_buf) = &self.buffer {
|
117
|
-
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
118
|
-
if self.offset < self.buffered_bytes {
|
119
|
-
let remaining = self.buffered_bytes - self.offset;
|
120
|
-
let copy_size = remaining.min(to_buf.len());
|
121
|
-
to_buf[..copy_size]
|
122
|
-
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
123
|
-
self.offset += copy_size;
|
124
|
-
Some(Ok(copy_size))
|
125
|
-
} else {
|
126
|
-
None
|
127
|
-
}
|
128
|
-
} else {
|
129
|
-
None
|
130
|
-
}
|
131
|
-
}
|
132
|
-
|
133
|
-
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
134
|
-
let buffer = self.buffer.as_mut().unwrap();
|
135
|
-
let result = self
|
136
|
-
.inner
|
137
|
-
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
138
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
139
|
-
|
140
|
-
if result.is_nil() {
|
141
|
-
return Ok(0); // EOF
|
142
|
-
}
|
143
|
-
|
144
|
-
let bytes = unsafe { result.as_slice() };
|
145
|
-
|
146
|
-
// Update internal buffer
|
147
|
-
let bytes_len = bytes.len();
|
148
|
-
if bytes_len == 0 {
|
149
|
-
return Ok(0);
|
150
|
-
}
|
151
|
-
|
152
|
-
// Only copy what we actually read
|
153
|
-
buffer[..bytes_len].copy_from_slice(bytes);
|
154
|
-
self.buffered_bytes = bytes_len;
|
155
|
-
|
156
|
-
// Copy to output buffer
|
157
|
-
let copy_size = bytes_len.min(buf.len());
|
158
|
-
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
159
|
-
self.offset = copy_size;
|
160
|
-
Ok(copy_size)
|
161
|
-
}
|
162
105
|
}
|
163
106
|
|
164
107
|
impl RubyReader<RString> {
|
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
|
|
176
119
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
177
120
|
Ok(Box::new(Self {
|
178
121
|
inner: string_content,
|
179
|
-
buffer: None,
|
180
122
|
offset: 0,
|
181
|
-
buffered_bytes: 0,
|
182
123
|
}))
|
183
124
|
}
|
184
125
|
|
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
|
|
197
138
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
198
139
|
Ok(Box::new(Self {
|
199
140
|
inner: string_content,
|
200
|
-
buffer: None,
|
201
141
|
offset: 0,
|
202
|
-
buffered_bytes: 0,
|
203
142
|
}))
|
204
143
|
}
|
205
144
|
}
|
206
145
|
|
207
146
|
impl Read for RubyReader<Value> {
|
208
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
}
|
147
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
148
|
+
let bytes = self
|
149
|
+
.inner
|
150
|
+
.funcall::<_, _, RString>("read", (buf.len(),))
|
151
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
152
|
+
|
153
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
154
|
+
|
155
|
+
Ok(bytes.len())
|
215
156
|
}
|
216
157
|
}
|
217
158
|
|