parquet 0.0.1 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,77 @@
1
+ use std::{
2
+ fs::File,
3
+ io::{BufReader, SeekFrom},
4
+ mem::ManuallyDrop,
5
+ };
6
+
7
+ use bytes::Bytes;
8
+ use magnus::{value::Opaque, Ruby, Value};
9
+ use parquet::{
10
+ errors::ParquetError,
11
+ file::reader::{ChunkReader, Length},
12
+ };
13
+ use std::io::Read;
14
+
15
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
16
+
17
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
18
+
19
+ pub struct SeekableRubyValue(pub Opaque<Value>);
20
+
21
+ impl Length for SeekableRubyValue {
22
+ fn len(&self) -> u64 {
23
+ let ruby = unsafe { Ruby::get_unchecked() };
24
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
25
+ let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
26
+ let file_len = reader.seek(SeekFrom::End(0)).unwrap();
27
+ reader.seek(SeekFrom::Start(current_pos)).unwrap();
28
+ file_len
29
+ }
30
+ }
31
+
32
+ impl ChunkReader for SeekableRubyValue {
33
+ type T = BufReader<Box<dyn SeekableRead>>;
34
+
35
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
36
+ let ruby = unsafe { Ruby::get_unchecked() };
37
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
38
+ reader.seek(SeekFrom::Start(start))?;
39
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
40
+ }
41
+
42
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
43
+ let ruby = unsafe { Ruby::get_unchecked() };
44
+ let mut buffer = Vec::with_capacity(length);
45
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
46
+ reader.seek(SeekFrom::Start(start))?;
47
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
48
+
49
+ if read != length {
50
+ return Err(ParquetError::EOF(format!(
51
+ "Expected to read {} bytes, read only {}",
52
+ length, read
53
+ )));
54
+ }
55
+ Ok(buffer.into())
56
+ }
57
+ }
58
+
59
+ pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
60
+
61
+ impl Length for ForgottenFileHandle {
62
+ fn len(&self) -> u64 {
63
+ self.0.len()
64
+ }
65
+ }
66
+
67
+ impl ChunkReader for ForgottenFileHandle {
68
+ type T = BufReader<File>;
69
+
70
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
71
+ self.0.get_read(start)
72
+ }
73
+
74
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
75
+ self.0.get_bytes(start, length)
76
+ }
77
+ }
@@ -2,30 +2,24 @@ use magnus::{
2
2
  value::{Opaque, ReprValue},
3
3
  RClass, RString, Ruby, Value,
4
4
  };
5
- use std::io::{self, Read, Seek};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
6
6
  use std::sync::OnceLock;
7
7
 
8
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
9
 
10
- const READ_BUFFER_SIZE: usize = 16 * 1024;
11
-
12
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
13
11
  /// and provide a standard Read implementation for them.
14
12
  pub struct RubyReader<T> {
15
13
  inner: T,
16
- buffer: Option<Vec<u8>>,
17
14
  offset: usize,
18
- // Number of bytes that have been read into the buffer
19
- // Used as an upper bound for offset
20
- buffered_bytes: usize,
21
15
  }
22
16
 
23
17
  pub trait SeekableRead: std::io::Read + Seek {}
24
18
  impl SeekableRead for RubyReader<Value> {}
25
19
  impl SeekableRead for RubyReader<RString> {}
26
20
 
27
- pub fn build_ruby_reader<'a>(
28
- ruby: &'a Ruby,
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
29
23
  input: Value,
30
24
  ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
25
  if RubyReader::is_string_io(ruby, &input) {
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
39
33
 
40
34
  impl Seek for RubyReader<Value> {
41
35
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
- let seek_to = match pos {
43
- io::SeekFrom::Start(offset) => {
44
- // SEEK_SET - absolute position
45
- offset as i64
46
- }
47
- io::SeekFrom::End(offset) => {
48
- // SEEK_END - from end of stream
49
- offset
50
- }
51
- io::SeekFrom::Current(offset) => {
52
- // SEEK_CUR - relative to current
53
- offset
54
- }
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
55
40
  };
56
41
 
57
- let whence = match pos {
58
- io::SeekFrom::Start(_) => 0, // SEEK_SET
59
- io::SeekFrom::End(_) => 2, // SEEK_END
60
- io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
- };
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
62
50
 
63
- // Call Ruby's seek method
64
- let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
65
54
 
66
- // Get current position
67
- let pos: u64 = self.inner.funcall("pos", ()).unwrap();
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
68
59
 
69
- Ok(pos)
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
70
69
  }
71
70
  }
72
71
 
73
72
  impl Seek for RubyReader<RString> {
74
73
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
74
  match pos {
76
- io::SeekFrom::Start(offset) => {
77
- self.offset = offset as usize;
78
- }
79
- io::SeekFrom::End(offset) => {
80
- self.offset = (self.inner.len() - offset as usize) as usize;
81
- }
82
- io::SeekFrom::Current(offset) => {
83
- self.offset = (self.offset as i64 + offset) as usize;
84
- }
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
85
78
  }
86
79
  Ok(self.offset as u64)
87
80
  }
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
106
99
  fn from_io_like(input: Value) -> Self {
107
100
  Self {
108
101
  inner: input,
109
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
102
  offset: 0,
111
- buffered_bytes: 0,
112
103
  }
113
104
  }
114
-
115
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
- if let Some(from_buf) = &self.buffer {
117
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
- if self.offset < self.buffered_bytes {
119
- let remaining = self.buffered_bytes - self.offset;
120
- let copy_size = remaining.min(to_buf.len());
121
- to_buf[..copy_size]
122
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
- self.offset += copy_size;
124
- Some(Ok(copy_size))
125
- } else {
126
- None
127
- }
128
- } else {
129
- None
130
- }
131
- }
132
-
133
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
- let buffer = self.buffer.as_mut().unwrap();
135
- let result = self
136
- .inner
137
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
-
140
- if result.is_nil() {
141
- return Ok(0); // EOF
142
- }
143
-
144
- let bytes = unsafe { result.as_slice() };
145
-
146
- // Update internal buffer
147
- let bytes_len = bytes.len();
148
- if bytes_len == 0 {
149
- return Ok(0);
150
- }
151
-
152
- // Only copy what we actually read
153
- buffer[..bytes_len].copy_from_slice(bytes);
154
- self.buffered_bytes = bytes_len;
155
-
156
- // Copy to output buffer
157
- let copy_size = bytes_len.min(buf.len());
158
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
- self.offset = copy_size;
160
- Ok(copy_size)
161
- }
162
105
  }
163
106
 
164
107
  impl RubyReader<RString> {
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
176
119
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
120
  Ok(Box::new(Self {
178
121
  inner: string_content,
179
- buffer: None,
180
122
  offset: 0,
181
- buffered_bytes: 0,
182
123
  }))
183
124
  }
184
125
 
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
197
138
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
139
  Ok(Box::new(Self {
199
140
  inner: string_content,
200
- buffer: None,
201
141
  offset: 0,
202
- buffered_bytes: 0,
203
142
  }))
204
143
  }
205
144
  }
206
145
 
207
146
  impl Read for RubyReader<Value> {
208
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
- if let Some(result) = self.read_from_buffer(buf) {
210
- result
211
- } else {
212
- // If the buffer is empty, read from Ruby
213
- self.read_from_ruby(buf)
214
- }
147
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
148
+ let bytes = self
149
+ .inner
150
+ .funcall::<_, _, RString>("read", (buf.len(),))
151
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
152
+
153
+ buf.write_all(unsafe { bytes.as_slice() })?;
154
+
155
+ Ok(bytes.len())
215
156
  }
216
157
  }
217
158