parquet 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ use std::{
2
+ fs::File,
3
+ io::{BufReader, SeekFrom},
4
+ mem::ManuallyDrop,
5
+ };
6
+
7
+ use bytes::Bytes;
8
+ use magnus::{value::Opaque, Ruby, Value};
9
+ use parquet::{
10
+ errors::ParquetError,
11
+ file::reader::{ChunkReader, Length},
12
+ };
13
+ use std::io::Read;
14
+
15
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
16
+
17
/// Capacity, in bytes, handed to `BufReader::with_capacity` when a buffered
/// reader is created over a Ruby-backed stream (16 KiB).
const READ_BUFFER_SIZE: usize = 16 << 10;
18
+
19
+ pub struct SeekableRubyValue(pub Opaque<Value>);
20
+
21
+ impl Length for SeekableRubyValue {
22
+ fn len(&self) -> u64 {
23
+ let ruby = unsafe { Ruby::get_unchecked() };
24
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
25
+ let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
26
+ let file_len = reader.seek(SeekFrom::End(0)).unwrap();
27
+ reader.seek(SeekFrom::Start(current_pos)).unwrap();
28
+ file_len
29
+ }
30
+ }
31
+
32
+ impl ChunkReader for SeekableRubyValue {
33
+ type T = BufReader<Box<dyn SeekableRead>>;
34
+
35
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
36
+ let ruby = unsafe { Ruby::get_unchecked() };
37
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
38
+ reader.seek(SeekFrom::Start(start))?;
39
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
40
+ }
41
+
42
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
43
+ let ruby = unsafe { Ruby::get_unchecked() };
44
+ let mut buffer = Vec::with_capacity(length);
45
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
46
+ reader.seek(SeekFrom::Start(start))?;
47
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
48
+
49
+ if read != length {
50
+ return Err(ParquetError::EOF(format!(
51
+ "Expected to read {} bytes, read only {}",
52
+ length, read
53
+ )));
54
+ }
55
+ Ok(buffer.into())
56
+ }
57
+ }
58
+
59
/// Newtype over a `File` held in `ManuallyDrop`, so dropping this wrapper
/// does not run the `File` destructor.
///
/// NOTE(review): presumably the underlying descriptor is owned elsewhere
/// (e.g. by a Ruby `File` object) — confirm at the construction site.
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
60
+
61
+ impl Length for ForgottenFileHandle {
62
+ fn len(&self) -> u64 {
63
+ self.0.len()
64
+ }
65
+ }
66
+
67
+ impl ChunkReader for ForgottenFileHandle {
68
+ type T = BufReader<File>;
69
+
70
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
71
+ self.0.get_read(start)
72
+ }
73
+
74
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
75
+ self.0.get_bytes(start, length)
76
+ }
77
+ }
@@ -2,30 +2,24 @@ use magnus::{
2
2
  value::{Opaque, ReprValue},
3
3
  RClass, RString, Ruby, Value,
4
4
  };
5
- use std::io::{self, Read, Seek};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
6
6
  use std::sync::OnceLock;
7
7
 
8
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
9
 
10
- const READ_BUFFER_SIZE: usize = 16 * 1024;
11
-
12
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
13
11
  /// and provide a standard Read implementation for them.
14
12
  pub struct RubyReader<T> {
15
13
  inner: T,
16
- buffer: Option<Vec<u8>>,
17
14
  offset: usize,
18
- // Number of bytes that have been read into the buffer
19
- // Used as an upper bound for offset
20
- buffered_bytes: usize,
21
15
  }
22
16
 
23
17
  pub trait SeekableRead: std::io::Read + Seek {}
24
18
  impl SeekableRead for RubyReader<Value> {}
25
19
  impl SeekableRead for RubyReader<RString> {}
26
20
 
27
- pub fn build_ruby_reader<'a>(
28
- ruby: &'a Ruby,
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
29
23
  input: Value,
30
24
  ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
25
  if RubyReader::is_string_io(ruby, &input) {
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
39
33
 
40
34
  impl Seek for RubyReader<Value> {
41
35
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
- let seek_to = match pos {
43
- io::SeekFrom::Start(offset) => {
44
- // SEEK_SET - absolute position
45
- offset as i64
46
- }
47
- io::SeekFrom::End(offset) => {
48
- // SEEK_END - from end of stream
49
- offset
50
- }
51
- io::SeekFrom::Current(offset) => {
52
- // SEEK_CUR - relative to current
53
- offset
54
- }
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
55
40
  };
56
41
 
57
- let whence = match pos {
58
- io::SeekFrom::Start(_) => 0, // SEEK_SET
59
- io::SeekFrom::End(_) => 2, // SEEK_END
60
- io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
- };
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
62
50
 
63
- // Call Ruby's seek method
64
- let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
65
54
 
66
- // Get current position
67
- let pos: u64 = self.inner.funcall("pos", ()).unwrap();
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
68
59
 
69
- Ok(pos)
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
70
69
  }
71
70
  }
72
71
 
73
72
  impl Seek for RubyReader<RString> {
74
73
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
74
  match pos {
76
- io::SeekFrom::Start(offset) => {
77
- self.offset = offset as usize;
78
- }
79
- io::SeekFrom::End(offset) => {
80
- self.offset = (self.inner.len() - offset as usize) as usize;
81
- }
82
- io::SeekFrom::Current(offset) => {
83
- self.offset = (self.offset as i64 + offset) as usize;
84
- }
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
85
78
  }
86
79
  Ok(self.offset as u64)
87
80
  }
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
106
99
  fn from_io_like(input: Value) -> Self {
107
100
  Self {
108
101
  inner: input,
109
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
102
  offset: 0,
111
- buffered_bytes: 0,
112
103
  }
113
104
  }
114
-
115
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
- if let Some(from_buf) = &self.buffer {
117
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
- if self.offset < self.buffered_bytes {
119
- let remaining = self.buffered_bytes - self.offset;
120
- let copy_size = remaining.min(to_buf.len());
121
- to_buf[..copy_size]
122
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
- self.offset += copy_size;
124
- Some(Ok(copy_size))
125
- } else {
126
- None
127
- }
128
- } else {
129
- None
130
- }
131
- }
132
-
133
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
- let buffer = self.buffer.as_mut().unwrap();
135
- let result = self
136
- .inner
137
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
-
140
- if result.is_nil() {
141
- return Ok(0); // EOF
142
- }
143
-
144
- let bytes = unsafe { result.as_slice() };
145
-
146
- // Update internal buffer
147
- let bytes_len = bytes.len();
148
- if bytes_len == 0 {
149
- return Ok(0);
150
- }
151
-
152
- // Only copy what we actually read
153
- buffer[..bytes_len].copy_from_slice(bytes);
154
- self.buffered_bytes = bytes_len;
155
-
156
- // Copy to output buffer
157
- let copy_size = bytes_len.min(buf.len());
158
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
- self.offset = copy_size;
160
- Ok(copy_size)
161
- }
162
105
  }
163
106
 
164
107
  impl RubyReader<RString> {
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
176
119
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
120
  Ok(Box::new(Self {
178
121
  inner: string_content,
179
- buffer: None,
180
122
  offset: 0,
181
- buffered_bytes: 0,
182
123
  }))
183
124
  }
184
125
 
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
197
138
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
139
  Ok(Box::new(Self {
199
140
  inner: string_content,
200
- buffer: None,
201
141
  offset: 0,
202
- buffered_bytes: 0,
203
142
  }))
204
143
  }
205
144
  }
206
145
 
207
146
  impl Read for RubyReader<Value> {
208
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
- if let Some(result) = self.read_from_buffer(buf) {
210
- result
211
- } else {
212
- // If the buffer is empty, read from Ruby
213
- self.read_from_ruby(buf)
214
- }
147
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
148
+ let bytes = self
149
+ .inner
150
+ .funcall::<_, _, RString>("read", (buf.len(),))
151
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
152
+
153
+ buf.write_all(unsafe { bytes.as_slice() })?;
154
+
155
+ Ok(bytes.len())
215
156
  }
216
157
  }
217
158