parquet 0.2.13 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,18 @@
1
1
  use crate::header_cache::StringCache;
2
+ use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
2
3
  use crate::types::{ArrayWrapper, TryIntoValue};
3
4
  use crate::{
4
- create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
5
- ParquetValueVec, ParserResultType, SeekableRubyValue,
5
+ create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
6
+ ParserResultType,
6
7
  };
7
8
  use ahash::RandomState;
8
- use magnus::rb_sys::AsRawValue;
9
- use magnus::value::{Opaque, ReprValue};
9
+ use magnus::value::ReprValue;
10
10
  use magnus::IntoValue;
11
11
  use magnus::{Error as MagnusError, Ruby, Value};
12
12
  use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
13
13
  use parquet::arrow::ProjectionMask;
14
14
  use std::collections::HashMap;
15
15
  use std::fs::File;
16
- use std::mem::ManuallyDrop;
17
- use std::os::fd::FromRawFd;
18
16
  use std::sync::OnceLock;
19
17
 
20
18
  use super::ReaderError;
@@ -71,50 +69,12 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
71
69
 
72
70
  let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
73
71
 
74
- (reader, schema, num_rows)
75
- } else if to_read.is_kind_of(ruby.class_io()) {
76
- let raw_value = to_read.as_raw();
77
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
78
- .map_err(|_| {
79
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
80
- })?;
81
-
82
- if fd < 0 {
83
- return Err(ReaderError::InvalidFileDescriptor.into());
84
- }
85
-
86
- let file = unsafe { File::from_raw_fd(fd) };
87
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
88
-
89
- let mut builder =
90
- ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
91
- let schema = builder.schema().clone();
92
- let num_rows = builder.metadata().file_metadata().num_rows();
93
-
94
- // If columns are specified, project only those columns
95
- if let Some(cols) = &columns {
96
- // Get the parquet schema
97
- let parquet_schema = builder.parquet_schema();
98
-
99
- // Create a projection mask from column names
100
- let projection =
101
- ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
102
-
103
- builder = builder.with_projection(projection);
104
- }
105
-
106
- if let Some(batch_size) = batch_size {
107
- builder = builder.with_batch_size(batch_size);
108
- }
109
-
110
- let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
111
-
112
72
  (reader, schema, num_rows)
113
73
  } else {
114
- let readable = SeekableRubyValue(Opaque::from(to_read));
74
+ let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
115
75
 
116
- let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
117
- .map_err(|e| ReaderError::Parquet(e))?;
76
+ let mut builder =
77
+ ParquetRecordBatchReaderBuilder::try_new(readable).map_err(ReaderError::from)?;
118
78
  let schema = builder.schema().clone();
119
79
  let num_rows = builder.metadata().file_metadata().num_rows();
120
80
 
@@ -1,12 +1,12 @@
1
1
  use crate::header_cache::StringCache;
2
+ use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
2
3
  use crate::types::TryIntoValue;
3
4
  use crate::{
4
- create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
5
- ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
5
+ create_row_enumerator, utils::*, ParquetField, ParserResultType, ReaderError,
6
+ RowEnumeratorArgs, RowRecord,
6
7
  };
7
8
  use ahash::RandomState;
8
- use magnus::rb_sys::AsRawValue;
9
- use magnus::value::{Opaque, ReprValue};
9
+ use magnus::value::ReprValue;
10
10
  use magnus::IntoValue;
11
11
  use magnus::{Error as MagnusError, Ruby, Value};
12
12
  use parquet::file::reader::{FileReader, SerializedFileReader};
@@ -14,8 +14,6 @@ use parquet::record::reader::RowIter as ParquetRowIter;
14
14
  use parquet::schema::types::{Type as SchemaType, TypePtr};
15
15
  use std::collections::HashMap;
16
16
  use std::fs::File;
17
- use std::mem::ManuallyDrop;
18
- use std::os::fd::FromRawFd;
19
17
  use std::sync::OnceLock;
20
18
 
21
19
  #[inline]
@@ -40,39 +38,17 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
40
38
  .map(|yield_enum| yield_enum.into_value_with(&ruby));
41
39
  }
42
40
 
43
- let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
41
+ let reader: Box<dyn FileReader> = if to_read.is_kind_of(ruby.class_string()) {
44
42
  let path_string = to_read.to_r_string()?;
45
43
  let file_path = unsafe { path_string.as_str()? };
46
- let file = File::open(file_path).unwrap();
47
- let reader = SerializedFileReader::new(file).unwrap();
48
- let schema = reader.metadata().file_metadata().schema().clone();
49
-
50
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
51
- } else if to_read.is_kind_of(ruby.class_io()) {
52
- let raw_value = to_read.as_raw();
53
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
54
- .map_err(|_| {
55
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
56
- })?;
57
-
58
- if fd < 0 {
59
- return Err(ReaderError::InvalidFileDescriptor.into());
60
- }
61
-
62
- let file = unsafe { File::from_raw_fd(fd) };
63
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
64
- let reader = SerializedFileReader::new(file).unwrap();
65
- let schema = reader.metadata().file_metadata().schema().clone();
66
-
67
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
44
+ let file = File::open(file_path).map_err(ReaderError::from)?;
45
+ Box::new(SerializedFileReader::new(file).map_err(ReaderError::from)?)
68
46
  } else {
69
- let readable = SeekableRubyValue(Opaque::from(to_read));
70
- let reader = SerializedFileReader::new(readable).unwrap();
71
- let schema = reader.metadata().file_metadata().schema().clone();
72
-
73
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
47
+ let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
48
+ Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
74
49
  };
75
-
50
+ let schema = reader.metadata().file_metadata().schema().clone();
51
+ let mut iter = ParquetRowIter::from_file_into(reader);
76
52
  if let Some(cols) = columns {
77
53
  let projection = create_projection_schema(&schema, &cols);
78
54
  iter = iter.project(Some(projection.to_owned())).map_err(|e| {
@@ -97,7 +73,8 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
97
73
  header_string.push(k.to_owned());
98
74
  }
99
75
 
100
- let headers = StringCache::intern_many(&header_string).unwrap();
76
+ let headers = StringCache::intern_many(&header_string)
77
+ .expect("Failed to intern headers");
101
78
 
102
79
  headers
103
80
  });
@@ -110,7 +87,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
110
87
  Ok(map)
111
88
  })
112
89
  .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
113
- .map_err(|e| ReaderError::Parquet(e))
90
+ .map_err(|e| ReaderError::from(e))
114
91
  });
115
92
 
116
93
  for result in iter {
@@ -128,7 +105,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
128
105
  Ok(vec)
129
106
  })
130
107
  .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
131
- .map_err(|e| ReaderError::Parquet(e))
108
+ .map_err(|e| ReaderError::from(e))
132
109
  });
133
110
 
134
111
  for result in iter {
@@ -1,171 +1,294 @@
1
+ use bytes::Bytes;
1
2
  use magnus::{
2
3
  value::{Opaque, ReprValue},
3
4
  RClass, RString, Ruby, Value,
4
5
  };
5
- use std::io::{self, Read, Seek, SeekFrom, Write};
6
- use std::sync::OnceLock;
6
+ use parquet::{
7
+ errors::ParquetError,
8
+ file::reader::{ChunkReader, Length},
9
+ };
10
+ use std::{
11
+ fs::File,
12
+ sync::{Mutex, OnceLock},
13
+ };
14
+ use std::{
15
+ io::{self, BufReader, Read, Seek, SeekFrom, Write},
16
+ sync::Arc,
17
+ };
7
18
 
8
19
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
20
 
10
21
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
11
22
  /// and provide a standard Read implementation for them.
12
- pub struct RubyReader<T> {
13
- inner: T,
14
- offset: usize,
23
+ pub enum RubyReader {
24
+ String {
25
+ inner: Opaque<RString>,
26
+ offset: usize,
27
+ },
28
+ RubyIoLike {
29
+ inner: Opaque<Value>,
30
+ },
31
+ NativeProxyIoLike {
32
+ proxy_file: File,
33
+ },
15
34
  }
16
35
 
17
- pub trait SeekableRead: Read + Seek {}
18
- impl<T: Read + Seek> SeekableRead for T {}
19
-
20
- pub fn build_ruby_reader(
21
- ruby: &Ruby,
22
- input: Value,
23
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
24
- if RubyReader::is_string_io(ruby, &input) {
25
- RubyReader::from_string_io(ruby, input)
26
- } else if RubyReader::is_io_like(&input) {
27
- RubyReader::from_io(input)
28
- } else {
29
- RubyReader::from_string_like(input)
36
+ impl RubyReader {
37
+ fn is_string_io(ruby: &Ruby, value: &Value) -> bool {
38
+ let string_io_class = STRING_IO_CLASS.get_or_init(|| {
39
+ let class = RClass::from_value(ruby.eval("StringIO").expect("Failed to find StringIO"))
40
+ .expect("Failed to get StringIO class");
41
+ Opaque::from(class)
42
+ });
43
+ value.is_kind_of(ruby.get_inner(*string_io_class))
30
44
  }
31
- }
32
45
 
33
- impl Seek for RubyReader<Value> {
34
- fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
35
- let (whence, offset) = match pos {
36
- SeekFrom::Start(i) => (0, i as i64),
37
- SeekFrom::Current(i) => (1, i),
38
- SeekFrom::End(i) => (2, i),
39
- };
40
-
41
- let new_position = self
42
- .inner
43
- .funcall("seek", (offset, whence))
44
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+ fn is_io_like(value: &Value) -> bool {
47
+ value.respond_to("read", false).unwrap_or(false)
48
+ }
45
49
 
46
- Ok(new_position)
50
+ // For now, don't use this. Having to use seek in length is scary.
51
+ fn is_seekable_io_like(_value: &Value) -> bool {
52
+ // Self::is_io_like(value) && value.respond_to("seek", false).unwrap_or(false)
53
+ false
47
54
  }
48
55
  }
49
56
 
50
- impl Write for RubyReader<Value> {
51
- fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
52
- let ruby_bytes = RString::from_slice(buf);
57
+ impl TryFrom<Value> for RubyReader {
58
+ type Error = magnus::Error;
53
59
 
54
- let bytes_written = self
55
- .inner
56
- .funcall::<_, _, usize>("write", (ruby_bytes,))
57
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
58
-
59
- Ok(bytes_written)
60
- }
60
+ fn try_from(value: Value) -> Result<Self, Self::Error> {
61
+ let ruby = unsafe { Ruby::get_unchecked() };
62
+ if RubyReader::is_string_io(&ruby, &value) {
63
+ let string_content = value.funcall::<_, _, RString>("string", ())?;
64
+ Ok(RubyReader::String {
65
+ inner: Opaque::from(string_content),
66
+ offset: 0,
67
+ })
68
+ } else if RubyReader::is_seekable_io_like(&value) {
69
+ Ok(RubyReader::RubyIoLike {
70
+ inner: Opaque::from(value),
71
+ })
72
+ } else if RubyReader::is_io_like(&value) {
73
+ let mut temp_file = tempfile::tempfile()
74
+ .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
61
75
 
62
- fn flush(&mut self) -> Result<(), io::Error> {
63
- self.inner
64
- .funcall::<_, _, Value>("flush", ())
65
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
76
+ // This is safe, because we won't call seek
77
+ let inner_readable = RubyReader::RubyIoLike {
78
+ inner: Opaque::from(value),
79
+ };
80
+ let mut reader = BufReader::new(inner_readable);
81
+ io::copy(&mut reader, &mut temp_file)
82
+ .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
83
+ temp_file
84
+ .seek(SeekFrom::Start(0))
85
+ .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
66
86
 
67
- Ok(())
87
+ Ok(RubyReader::NativeProxyIoLike {
88
+ proxy_file: temp_file,
89
+ })
90
+ } else {
91
+ // Try calling `to_str`, and if that fails, try `to_s`
92
+ let string_content = value
93
+ .funcall::<_, _, RString>("to_str", ())
94
+ .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
95
+ Ok(RubyReader::String {
96
+ inner: Opaque::from(string_content),
97
+ offset: 0,
98
+ })
99
+ }
68
100
  }
69
101
  }
70
102
 
71
- impl Seek for RubyReader<RString> {
103
+ impl Seek for RubyReader {
72
104
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
73
- match pos {
74
- io::SeekFrom::Start(offset) => self.offset = offset as usize,
75
- io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
76
- io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
105
+ let ruby = unsafe { Ruby::get_unchecked() };
106
+ match self {
107
+ RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
108
+ RubyReader::String {
109
+ inner,
110
+ offset: original_offset,
111
+ } => {
112
+ let unwrapped_inner = ruby.get_inner(*inner);
113
+
114
+ let new_offset = match pos {
115
+ io::SeekFrom::Start(offset) => offset as usize,
116
+ io::SeekFrom::Current(offset) => (*original_offset as i64 + offset) as usize,
117
+ io::SeekFrom::End(offset) => {
118
+ unwrapped_inner.len().saturating_sub(offset as usize)
119
+ }
120
+ };
121
+
122
+ *original_offset = new_offset.min(unwrapped_inner.len());
123
+ Ok(*original_offset as u64)
124
+ }
125
+ RubyReader::RubyIoLike { inner } => {
126
+ let unwrapped_inner = ruby.get_inner(*inner);
127
+
128
+ let (whence, ruby_offset) = match pos {
129
+ SeekFrom::Start(i) => (0, i as i64),
130
+ SeekFrom::Current(i) => (1, i),
131
+ SeekFrom::End(i) => (2, i),
132
+ };
133
+
134
+ let new_position = unwrapped_inner
135
+ .funcall("seek", (ruby_offset, whence))
136
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
137
+
138
+ Ok(new_position)
139
+ }
77
140
  }
78
- Ok(self.offset as u64)
79
141
  }
80
142
  }
81
143
 
82
- impl RubyReader<Value> {
83
- fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
84
- if Self::is_io_like(&input) {
85
- Ok(Box::new(Self::from_io_like(input)))
86
- } else {
87
- Err(magnus::Error::new(
88
- magnus::exception::type_error(),
89
- "Input is not an IO-like object",
90
- ))
144
+ impl Read for RubyReader {
145
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
146
+ let ruby = unsafe { Ruby::get_unchecked() };
147
+ match self {
148
+ RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
149
+ RubyReader::String { inner, offset } => {
150
+ let unwrapped_inner = ruby.get_inner(*inner);
151
+
152
+ let string_buffer = unsafe { unwrapped_inner.as_slice() };
153
+ if *offset >= string_buffer.len() {
154
+ return Ok(0); // EOF
155
+ }
156
+
157
+ let remaining = string_buffer.len() - *offset;
158
+ let copy_size = remaining.min(buf.len());
159
+ buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]);
160
+
161
+ *offset += copy_size;
162
+
163
+ Ok(copy_size)
164
+ }
165
+ RubyReader::RubyIoLike { inner } => {
166
+ let unwrapped_inner = ruby.get_inner(*inner);
167
+
168
+ let bytes = unwrapped_inner
169
+ .funcall::<_, _, Option<RString>>("read", (buf.len(),))
170
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
171
+
172
+ match bytes {
173
+ Some(bytes) => {
174
+ let string_buffer = unsafe { bytes.as_slice() };
175
+ buf.write_all(string_buffer)?;
176
+ Ok(string_buffer.len())
177
+ }
178
+ None => {
179
+ return Ok(0);
180
+ }
181
+ }
182
+ }
91
183
  }
92
184
  }
185
+ }
93
186
 
94
- fn is_io_like(input: &Value) -> bool {
95
- input.respond_to("read", false).unwrap_or(false)
96
- }
187
+ impl Length for RubyReader {
188
+ fn len(&self) -> u64 {
189
+ let ruby = unsafe { Ruby::get_unchecked() };
190
+ match self {
191
+ RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
192
+ RubyReader::String { inner, offset: _ } => {
193
+ let unwrapped_inner = ruby.get_inner(*inner);
194
+ unwrapped_inner.len() as u64
195
+ }
196
+ RubyReader::RubyIoLike { inner } => {
197
+ let unwrapped_inner = ruby.get_inner(*inner);
198
+ let current_pos = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 1));
199
+
200
+ if let Err(e) = current_pos {
201
+ eprintln!("Error seeking: {}", e);
202
+ return 0;
203
+ }
204
+
205
+ if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 2)) {
206
+ eprintln!("Error seeking: {}", e);
207
+ return 0;
208
+ }
97
209
 
98
- fn from_io_like(input: Value) -> Self {
99
- Self {
100
- inner: input,
101
- offset: 0,
210
+ let size = unwrapped_inner.funcall::<_, _, u64>("pos", ());
211
+
212
+ match size {
213
+ Ok(size) => {
214
+ // Restore original position
215
+ if let Err(e) = unwrapped_inner.funcall::<_, _, u64>(
216
+ "seek",
217
+ (current_pos.expect("Current position is not set!"), 0),
218
+ ) {
219
+ eprintln!("Error seeking: {}", e);
220
+ return 0;
221
+ }
222
+ size
223
+ }
224
+ Err(e) => {
225
+ eprintln!("Error seeking: {}", e);
226
+ return 0;
227
+ }
228
+ }
229
+ }
102
230
  }
103
231
  }
104
232
  }
105
233
 
106
- impl RubyReader<RString> {
107
- pub fn from_string_io(
108
- ruby: &Ruby,
109
- input: Value,
110
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
111
- if !Self::is_string_io(ruby, &input) {
112
- return Err(magnus::Error::new(
113
- magnus::exception::type_error(),
114
- "Input is not a StringIO",
115
- ));
116
- }
234
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
235
+
236
+ #[derive(Clone)]
237
+ pub struct ThreadSafeRubyReader(Arc<Mutex<RubyReader>>);
117
238
 
118
- let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
119
- Ok(Box::new(Self {
120
- inner: string_content,
121
- offset: 0,
122
- }))
239
+ impl ThreadSafeRubyReader {
240
+ pub fn new(reader: RubyReader) -> Self {
241
+ Self(Arc::new(Mutex::new(reader)))
123
242
  }
243
+ }
124
244
 
125
- fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
126
- let string_io_class = STRING_IO_CLASS.get_or_init(|| {
127
- let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
128
- Opaque::from(class)
129
- });
130
- input.is_kind_of(ruby.get_inner(*string_io_class))
245
+ impl Length for ThreadSafeRubyReader {
246
+ fn len(&self) -> u64 {
247
+ self.0.lock().expect("Failed to lock mutex").len()
131
248
  }
249
+ }
132
250
 
133
- fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
134
- // Try calling `to_str`, and if that fails, try `to_s`
135
- let string_content = input
136
- .funcall::<_, _, RString>("to_str", ())
137
- .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
138
- Ok(Box::new(Self {
139
- inner: string_content,
140
- offset: 0,
141
- }))
251
+ impl Seek for ThreadSafeRubyReader {
252
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
253
+ let mut reader = self
254
+ .0
255
+ .lock()
256
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
257
+ reader.seek(pos)
142
258
  }
143
259
  }
144
260
 
145
- impl Read for RubyReader<Value> {
146
- fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
147
- let bytes = self
148
- .inner
149
- .funcall::<_, _, RString>("read", (buf.len(),))
261
+ impl Read for ThreadSafeRubyReader {
262
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
263
+ let mut reader = self
264
+ .0
265
+ .lock()
150
266
  .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
267
+ reader.read(buf)
268
+ }
269
+ }
151
270
 
152
- buf.write_all(unsafe { bytes.as_slice() })?;
271
+ impl ChunkReader for ThreadSafeRubyReader {
272
+ type T = BufReader<ThreadSafeRubyReader>;
153
273
 
154
- Ok(bytes.len())
274
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
275
+ let mut reader = self.clone();
276
+ reader.seek(SeekFrom::Start(start))?;
277
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
155
278
  }
156
- }
157
279
 
158
- impl Read for RubyReader<RString> {
159
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
160
- let string_buffer = unsafe { self.inner.as_slice() };
161
- if self.offset >= string_buffer.len() {
162
- return Ok(0); // EOF
163
- }
280
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
281
+ let mut buffer = Vec::with_capacity(length);
282
+ let mut reader = self.clone();
283
+ reader.seek(SeekFrom::Start(start))?;
284
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
164
285
 
165
- let remaining = string_buffer.len() - self.offset;
166
- let copy_size = remaining.min(buf.len());
167
- buf[..copy_size].copy_from_slice(&string_buffer[self.offset..self.offset + copy_size]);
168
- self.offset += copy_size;
169
- Ok(copy_size)
286
+ if read != length {
287
+ return Err(ParquetError::EOF(format!(
288
+ "Expected to read {} bytes, read only {}",
289
+ length, read
290
+ )));
291
+ }
292
+ Ok(buffer.into())
170
293
  }
171
294
  }
@@ -45,7 +45,7 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
45
45
  ts.to_zoned(tz).timestamp()
46
46
  } else {
47
47
  // Try IANA timezone
48
- match ts.intz(&tz) {
48
+ match ts.in_tz(&tz) {
49
49
  Ok(zoned) => zoned.timestamp(),
50
50
  Err(_) => ts, // Fall back to UTC if timezone is invalid
51
51
  }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.13"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/parquet.rb CHANGED
@@ -1,5 +1,10 @@
1
1
  require_relative "parquet/version"
2
- require_relative "parquet/parquet"
2
+
3
+ begin
4
+ require "parquet/#{RUBY_VERSION.to_f}/parquet"
5
+ rescue LoadError
6
+ require "parquet/parquet"
7
+ end
3
8
 
4
9
  module Parquet
5
10
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.13
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-03 00:00:00.000000000 Z
11
+ date: 2025-02-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -63,7 +63,6 @@ files:
63
63
  - ext/parquet/src/reader/mod.rs
64
64
  - ext/parquet/src/reader/parquet_column_reader.rs
65
65
  - ext/parquet/src/reader/parquet_row_reader.rs
66
- - ext/parquet/src/ruby_integration.rs
67
66
  - ext/parquet/src/ruby_reader.rs
68
67
  - ext/parquet/src/types/core_types.rs
69
68
  - ext/parquet/src/types/mod.rs
@@ -76,7 +75,6 @@ files:
76
75
  - ext/parquet/src/writer/mod.rs
77
76
  - lib/parquet.rb
78
77
  - lib/parquet.rbi
79
- - lib/parquet/parquet.so
80
78
  - lib/parquet/version.rb
81
79
  homepage: https://github.com/njaremko/parquet-ruby
82
80
  licenses: