parquet 0.2.13 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +105 -94
- data/Gemfile +5 -3
- data/README.md +1 -1
- data/Rakefile +16 -0
- data/ext/parquet/src/lib.rs +0 -2
- data/ext/parquet/src/reader/parquet_column_reader.rs +7 -47
- data/ext/parquet/src/reader/parquet_row_reader.rs +15 -38
- data/ext/parquet/src/ruby_reader.rs +242 -119
- data/ext/parquet/src/types/timestamp.rs +1 -1
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +6 -1
- metadata +2 -4
- data/ext/parquet/src/ruby_integration.rs +0 -77
- data/lib/parquet/parquet.so +0 -0
@@ -1,20 +1,18 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
+
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
2
3
|
use crate::types::{ArrayWrapper, TryIntoValue};
|
3
4
|
use crate::{
|
4
|
-
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
|
5
|
-
|
5
|
+
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
|
6
|
+
ParserResultType,
|
6
7
|
};
|
7
8
|
use ahash::RandomState;
|
8
|
-
use magnus::
|
9
|
-
use magnus::value::{Opaque, ReprValue};
|
9
|
+
use magnus::value::ReprValue;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
13
13
|
use parquet::arrow::ProjectionMask;
|
14
14
|
use std::collections::HashMap;
|
15
15
|
use std::fs::File;
|
16
|
-
use std::mem::ManuallyDrop;
|
17
|
-
use std::os::fd::FromRawFd;
|
18
16
|
use std::sync::OnceLock;
|
19
17
|
|
20
18
|
use super::ReaderError;
|
@@ -71,50 +69,12 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
71
69
|
|
72
70
|
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
73
71
|
|
74
|
-
(reader, schema, num_rows)
|
75
|
-
} else if to_read.is_kind_of(ruby.class_io()) {
|
76
|
-
let raw_value = to_read.as_raw();
|
77
|
-
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
78
|
-
.map_err(|_| {
|
79
|
-
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
80
|
-
})?;
|
81
|
-
|
82
|
-
if fd < 0 {
|
83
|
-
return Err(ReaderError::InvalidFileDescriptor.into());
|
84
|
-
}
|
85
|
-
|
86
|
-
let file = unsafe { File::from_raw_fd(fd) };
|
87
|
-
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
88
|
-
|
89
|
-
let mut builder =
|
90
|
-
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
91
|
-
let schema = builder.schema().clone();
|
92
|
-
let num_rows = builder.metadata().file_metadata().num_rows();
|
93
|
-
|
94
|
-
// If columns are specified, project only those columns
|
95
|
-
if let Some(cols) = &columns {
|
96
|
-
// Get the parquet schema
|
97
|
-
let parquet_schema = builder.parquet_schema();
|
98
|
-
|
99
|
-
// Create a projection mask from column names
|
100
|
-
let projection =
|
101
|
-
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
102
|
-
|
103
|
-
builder = builder.with_projection(projection);
|
104
|
-
}
|
105
|
-
|
106
|
-
if let Some(batch_size) = batch_size {
|
107
|
-
builder = builder.with_batch_size(batch_size);
|
108
|
-
}
|
109
|
-
|
110
|
-
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
111
|
-
|
112
72
|
(reader, schema, num_rows)
|
113
73
|
} else {
|
114
|
-
let readable =
|
74
|
+
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
115
75
|
|
116
|
-
let mut builder =
|
117
|
-
.map_err(
|
76
|
+
let mut builder =
|
77
|
+
ParquetRecordBatchReaderBuilder::try_new(readable).map_err(ReaderError::from)?;
|
118
78
|
let schema = builder.schema().clone();
|
119
79
|
let num_rows = builder.metadata().file_metadata().num_rows();
|
120
80
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
+
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
2
3
|
use crate::types::TryIntoValue;
|
3
4
|
use crate::{
|
4
|
-
create_row_enumerator, utils::*,
|
5
|
-
|
5
|
+
create_row_enumerator, utils::*, ParquetField, ParserResultType, ReaderError,
|
6
|
+
RowEnumeratorArgs, RowRecord,
|
6
7
|
};
|
7
8
|
use ahash::RandomState;
|
8
|
-
use magnus::
|
9
|
-
use magnus::value::{Opaque, ReprValue};
|
9
|
+
use magnus::value::ReprValue;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
use parquet::file::reader::{FileReader, SerializedFileReader};
|
@@ -14,8 +14,6 @@ use parquet::record::reader::RowIter as ParquetRowIter;
|
|
14
14
|
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
15
15
|
use std::collections::HashMap;
|
16
16
|
use std::fs::File;
|
17
|
-
use std::mem::ManuallyDrop;
|
18
|
-
use std::os::fd::FromRawFd;
|
19
17
|
use std::sync::OnceLock;
|
20
18
|
|
21
19
|
#[inline]
|
@@ -40,39 +38,17 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
40
38
|
.map(|yield_enum| yield_enum.into_value_with(&ruby));
|
41
39
|
}
|
42
40
|
|
43
|
-
let
|
41
|
+
let reader: Box<dyn FileReader> = if to_read.is_kind_of(ruby.class_string()) {
|
44
42
|
let path_string = to_read.to_r_string()?;
|
45
43
|
let file_path = unsafe { path_string.as_str()? };
|
46
|
-
let file = File::open(file_path).
|
47
|
-
|
48
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
49
|
-
|
50
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
51
|
-
} else if to_read.is_kind_of(ruby.class_io()) {
|
52
|
-
let raw_value = to_read.as_raw();
|
53
|
-
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
54
|
-
.map_err(|_| {
|
55
|
-
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
56
|
-
})?;
|
57
|
-
|
58
|
-
if fd < 0 {
|
59
|
-
return Err(ReaderError::InvalidFileDescriptor.into());
|
60
|
-
}
|
61
|
-
|
62
|
-
let file = unsafe { File::from_raw_fd(fd) };
|
63
|
-
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
64
|
-
let reader = SerializedFileReader::new(file).unwrap();
|
65
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
66
|
-
|
67
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
44
|
+
let file = File::open(file_path).map_err(ReaderError::from)?;
|
45
|
+
Box::new(SerializedFileReader::new(file).map_err(ReaderError::from)?)
|
68
46
|
} else {
|
69
|
-
let readable =
|
70
|
-
|
71
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
72
|
-
|
73
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
47
|
+
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
48
|
+
Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
|
74
49
|
};
|
75
|
-
|
50
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
51
|
+
let mut iter = ParquetRowIter::from_file_into(reader);
|
76
52
|
if let Some(cols) = columns {
|
77
53
|
let projection = create_projection_schema(&schema, &cols);
|
78
54
|
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
@@ -97,7 +73,8 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
97
73
|
header_string.push(k.to_owned());
|
98
74
|
}
|
99
75
|
|
100
|
-
let headers = StringCache::intern_many(&header_string)
|
76
|
+
let headers = StringCache::intern_many(&header_string)
|
77
|
+
.expect("Failed to intern headers");
|
101
78
|
|
102
79
|
headers
|
103
80
|
});
|
@@ -110,7 +87,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
110
87
|
Ok(map)
|
111
88
|
})
|
112
89
|
.and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
|
113
|
-
.map_err(|e| ReaderError::
|
90
|
+
.map_err(|e| ReaderError::from(e))
|
114
91
|
});
|
115
92
|
|
116
93
|
for result in iter {
|
@@ -128,7 +105,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
128
105
|
Ok(vec)
|
129
106
|
})
|
130
107
|
.and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
|
131
|
-
.map_err(|e| ReaderError::
|
108
|
+
.map_err(|e| ReaderError::from(e))
|
132
109
|
});
|
133
110
|
|
134
111
|
for result in iter {
|
@@ -1,171 +1,294 @@
|
|
1
|
+
use bytes::Bytes;
|
1
2
|
use magnus::{
|
2
3
|
value::{Opaque, ReprValue},
|
3
4
|
RClass, RString, Ruby, Value,
|
4
5
|
};
|
5
|
-
use
|
6
|
-
|
6
|
+
use parquet::{
|
7
|
+
errors::ParquetError,
|
8
|
+
file::reader::{ChunkReader, Length},
|
9
|
+
};
|
10
|
+
use std::{
|
11
|
+
fs::File,
|
12
|
+
sync::{Mutex, OnceLock},
|
13
|
+
};
|
14
|
+
use std::{
|
15
|
+
io::{self, BufReader, Read, Seek, SeekFrom, Write},
|
16
|
+
sync::Arc,
|
17
|
+
};
|
7
18
|
|
8
19
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
20
|
|
10
21
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
11
22
|
/// and provide a standard Read implementation for them.
|
12
|
-
pub
|
13
|
-
|
14
|
-
|
23
|
+
pub enum RubyReader {
|
24
|
+
String {
|
25
|
+
inner: Opaque<RString>,
|
26
|
+
offset: usize,
|
27
|
+
},
|
28
|
+
RubyIoLike {
|
29
|
+
inner: Opaque<Value>,
|
30
|
+
},
|
31
|
+
NativeProxyIoLike {
|
32
|
+
proxy_file: File,
|
33
|
+
},
|
15
34
|
}
|
16
35
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
)
|
24
|
-
|
25
|
-
RubyReader::from_string_io(ruby, input)
|
26
|
-
} else if RubyReader::is_io_like(&input) {
|
27
|
-
RubyReader::from_io(input)
|
28
|
-
} else {
|
29
|
-
RubyReader::from_string_like(input)
|
36
|
+
impl RubyReader {
|
37
|
+
fn is_string_io(ruby: &Ruby, value: &Value) -> bool {
|
38
|
+
let string_io_class = STRING_IO_CLASS.get_or_init(|| {
|
39
|
+
let class = RClass::from_value(ruby.eval("StringIO").expect("Failed to find StringIO"))
|
40
|
+
.expect("Failed to get StringIO class");
|
41
|
+
Opaque::from(class)
|
42
|
+
});
|
43
|
+
value.is_kind_of(ruby.get_inner(*string_io_class))
|
30
44
|
}
|
31
|
-
}
|
32
45
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
SeekFrom::Start(i) => (0, i as i64),
|
37
|
-
SeekFrom::Current(i) => (1, i),
|
38
|
-
SeekFrom::End(i) => (2, i),
|
39
|
-
};
|
40
|
-
|
41
|
-
let new_position = self
|
42
|
-
.inner
|
43
|
-
.funcall("seek", (offset, whence))
|
44
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
46
|
+
fn is_io_like(value: &Value) -> bool {
|
47
|
+
value.respond_to("read", false).unwrap_or(false)
|
48
|
+
}
|
45
49
|
|
46
|
-
|
50
|
+
// For now, don't use this. Having to use seek in length is scary.
|
51
|
+
fn is_seekable_io_like(_value: &Value) -> bool {
|
52
|
+
// Self::is_io_like(value) && value.respond_to("seek", false).unwrap_or(false)
|
53
|
+
false
|
47
54
|
}
|
48
55
|
}
|
49
56
|
|
50
|
-
impl
|
51
|
-
|
52
|
-
let ruby_bytes = RString::from_slice(buf);
|
57
|
+
impl TryFrom<Value> for RubyReader {
|
58
|
+
type Error = magnus::Error;
|
53
59
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
.
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
fn try_from(value: Value) -> Result<Self, Self::Error> {
|
61
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
62
|
+
if RubyReader::is_string_io(&ruby, &value) {
|
63
|
+
let string_content = value.funcall::<_, _, RString>("string", ())?;
|
64
|
+
Ok(RubyReader::String {
|
65
|
+
inner: Opaque::from(string_content),
|
66
|
+
offset: 0,
|
67
|
+
})
|
68
|
+
} else if RubyReader::is_seekable_io_like(&value) {
|
69
|
+
Ok(RubyReader::RubyIoLike {
|
70
|
+
inner: Opaque::from(value),
|
71
|
+
})
|
72
|
+
} else if RubyReader::is_io_like(&value) {
|
73
|
+
let mut temp_file = tempfile::tempfile()
|
74
|
+
.map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
61
75
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
76
|
+
// This is safe, because we won't call seek
|
77
|
+
let inner_readable = RubyReader::RubyIoLike {
|
78
|
+
inner: Opaque::from(value),
|
79
|
+
};
|
80
|
+
let mut reader = BufReader::new(inner_readable);
|
81
|
+
io::copy(&mut reader, &mut temp_file)
|
82
|
+
.map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
83
|
+
temp_file
|
84
|
+
.seek(SeekFrom::Start(0))
|
85
|
+
.map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
66
86
|
|
67
|
-
|
87
|
+
Ok(RubyReader::NativeProxyIoLike {
|
88
|
+
proxy_file: temp_file,
|
89
|
+
})
|
90
|
+
} else {
|
91
|
+
// Try calling `to_str`, and if that fails, try `to_s`
|
92
|
+
let string_content = value
|
93
|
+
.funcall::<_, _, RString>("to_str", ())
|
94
|
+
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
|
95
|
+
Ok(RubyReader::String {
|
96
|
+
inner: Opaque::from(string_content),
|
97
|
+
offset: 0,
|
98
|
+
})
|
99
|
+
}
|
68
100
|
}
|
69
101
|
}
|
70
102
|
|
71
|
-
impl Seek for RubyReader
|
103
|
+
impl Seek for RubyReader {
|
72
104
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
105
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
106
|
+
match self {
|
107
|
+
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
|
108
|
+
RubyReader::String {
|
109
|
+
inner,
|
110
|
+
offset: original_offset,
|
111
|
+
} => {
|
112
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
113
|
+
|
114
|
+
let new_offset = match pos {
|
115
|
+
io::SeekFrom::Start(offset) => offset as usize,
|
116
|
+
io::SeekFrom::Current(offset) => (*original_offset as i64 + offset) as usize,
|
117
|
+
io::SeekFrom::End(offset) => {
|
118
|
+
unwrapped_inner.len().saturating_sub(offset as usize)
|
119
|
+
}
|
120
|
+
};
|
121
|
+
|
122
|
+
*original_offset = new_offset.min(unwrapped_inner.len());
|
123
|
+
Ok(*original_offset as u64)
|
124
|
+
}
|
125
|
+
RubyReader::RubyIoLike { inner } => {
|
126
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
127
|
+
|
128
|
+
let (whence, ruby_offset) = match pos {
|
129
|
+
SeekFrom::Start(i) => (0, i as i64),
|
130
|
+
SeekFrom::Current(i) => (1, i),
|
131
|
+
SeekFrom::End(i) => (2, i),
|
132
|
+
};
|
133
|
+
|
134
|
+
let new_position = unwrapped_inner
|
135
|
+
.funcall("seek", (ruby_offset, whence))
|
136
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
137
|
+
|
138
|
+
Ok(new_position)
|
139
|
+
}
|
77
140
|
}
|
78
|
-
Ok(self.offset as u64)
|
79
141
|
}
|
80
142
|
}
|
81
143
|
|
82
|
-
impl RubyReader
|
83
|
-
fn
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
144
|
+
impl Read for RubyReader {
|
145
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
146
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
147
|
+
match self {
|
148
|
+
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
|
149
|
+
RubyReader::String { inner, offset } => {
|
150
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
151
|
+
|
152
|
+
let string_buffer = unsafe { unwrapped_inner.as_slice() };
|
153
|
+
if *offset >= string_buffer.len() {
|
154
|
+
return Ok(0); // EOF
|
155
|
+
}
|
156
|
+
|
157
|
+
let remaining = string_buffer.len() - *offset;
|
158
|
+
let copy_size = remaining.min(buf.len());
|
159
|
+
buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]);
|
160
|
+
|
161
|
+
*offset += copy_size;
|
162
|
+
|
163
|
+
Ok(copy_size)
|
164
|
+
}
|
165
|
+
RubyReader::RubyIoLike { inner } => {
|
166
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
167
|
+
|
168
|
+
let bytes = unwrapped_inner
|
169
|
+
.funcall::<_, _, Option<RString>>("read", (buf.len(),))
|
170
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
171
|
+
|
172
|
+
match bytes {
|
173
|
+
Some(bytes) => {
|
174
|
+
let string_buffer = unsafe { bytes.as_slice() };
|
175
|
+
buf.write_all(string_buffer)?;
|
176
|
+
Ok(string_buffer.len())
|
177
|
+
}
|
178
|
+
None => {
|
179
|
+
return Ok(0);
|
180
|
+
}
|
181
|
+
}
|
182
|
+
}
|
91
183
|
}
|
92
184
|
}
|
185
|
+
}
|
93
186
|
|
94
|
-
|
95
|
-
|
96
|
-
|
187
|
+
impl Length for RubyReader {
|
188
|
+
fn len(&self) -> u64 {
|
189
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
190
|
+
match self {
|
191
|
+
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
|
192
|
+
RubyReader::String { inner, offset: _ } => {
|
193
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
194
|
+
unwrapped_inner.len() as u64
|
195
|
+
}
|
196
|
+
RubyReader::RubyIoLike { inner } => {
|
197
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
198
|
+
let current_pos = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 1));
|
199
|
+
|
200
|
+
if let Err(e) = current_pos {
|
201
|
+
eprintln!("Error seeking: {}", e);
|
202
|
+
return 0;
|
203
|
+
}
|
204
|
+
|
205
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 2)) {
|
206
|
+
eprintln!("Error seeking: {}", e);
|
207
|
+
return 0;
|
208
|
+
}
|
97
209
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
210
|
+
let size = unwrapped_inner.funcall::<_, _, u64>("pos", ());
|
211
|
+
|
212
|
+
match size {
|
213
|
+
Ok(size) => {
|
214
|
+
// Restore original position
|
215
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>(
|
216
|
+
"seek",
|
217
|
+
(current_pos.expect("Current position is not set!"), 0),
|
218
|
+
) {
|
219
|
+
eprintln!("Error seeking: {}", e);
|
220
|
+
return 0;
|
221
|
+
}
|
222
|
+
size
|
223
|
+
}
|
224
|
+
Err(e) => {
|
225
|
+
eprintln!("Error seeking: {}", e);
|
226
|
+
return 0;
|
227
|
+
}
|
228
|
+
}
|
229
|
+
}
|
102
230
|
}
|
103
231
|
}
|
104
232
|
}
|
105
233
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
111
|
-
if !Self::is_string_io(ruby, &input) {
|
112
|
-
return Err(magnus::Error::new(
|
113
|
-
magnus::exception::type_error(),
|
114
|
-
"Input is not a StringIO",
|
115
|
-
));
|
116
|
-
}
|
234
|
+
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
235
|
+
|
236
|
+
#[derive(Clone)]
|
237
|
+
pub struct ThreadSafeRubyReader(Arc<Mutex<RubyReader>>);
|
117
238
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
offset: 0,
|
122
|
-
}))
|
239
|
+
impl ThreadSafeRubyReader {
|
240
|
+
pub fn new(reader: RubyReader) -> Self {
|
241
|
+
Self(Arc::new(Mutex::new(reader)))
|
123
242
|
}
|
243
|
+
}
|
124
244
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
Opaque::from(class)
|
129
|
-
});
|
130
|
-
input.is_kind_of(ruby.get_inner(*string_io_class))
|
245
|
+
impl Length for ThreadSafeRubyReader {
|
246
|
+
fn len(&self) -> u64 {
|
247
|
+
self.0.lock().expect("Failed to lock mutex").len()
|
131
248
|
}
|
249
|
+
}
|
132
250
|
|
133
|
-
|
134
|
-
|
135
|
-
let
|
136
|
-
.
|
137
|
-
.
|
138
|
-
|
139
|
-
|
140
|
-
offset: 0,
|
141
|
-
}))
|
251
|
+
impl Seek for ThreadSafeRubyReader {
|
252
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
253
|
+
let mut reader = self
|
254
|
+
.0
|
255
|
+
.lock()
|
256
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
257
|
+
reader.seek(pos)
|
142
258
|
}
|
143
259
|
}
|
144
260
|
|
145
|
-
impl Read for
|
146
|
-
fn read(&mut self,
|
147
|
-
let
|
148
|
-
.
|
149
|
-
.
|
261
|
+
impl Read for ThreadSafeRubyReader {
|
262
|
+
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
263
|
+
let mut reader = self
|
264
|
+
.0
|
265
|
+
.lock()
|
150
266
|
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
267
|
+
reader.read(buf)
|
268
|
+
}
|
269
|
+
}
|
151
270
|
|
152
|
-
|
271
|
+
impl ChunkReader for ThreadSafeRubyReader {
|
272
|
+
type T = BufReader<ThreadSafeRubyReader>;
|
153
273
|
|
154
|
-
|
274
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
275
|
+
let mut reader = self.clone();
|
276
|
+
reader.seek(SeekFrom::Start(start))?;
|
277
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
155
278
|
}
|
156
|
-
}
|
157
279
|
|
158
|
-
|
159
|
-
|
160
|
-
let
|
161
|
-
|
162
|
-
|
163
|
-
}
|
280
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
281
|
+
let mut buffer = Vec::with_capacity(length);
|
282
|
+
let mut reader = self.clone();
|
283
|
+
reader.seek(SeekFrom::Start(start))?;
|
284
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
164
285
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
286
|
+
if read != length {
|
287
|
+
return Err(ParquetError::EOF(format!(
|
288
|
+
"Expected to read {} bytes, read only {}",
|
289
|
+
length, read
|
290
|
+
)));
|
291
|
+
}
|
292
|
+
Ok(buffer.into())
|
170
293
|
}
|
171
294
|
}
|
@@ -45,7 +45,7 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> jiff::Timestamp {
|
|
45
45
|
ts.to_zoned(tz).timestamp()
|
46
46
|
} else {
|
47
47
|
// Try IANA timezone
|
48
|
-
match ts.
|
48
|
+
match ts.in_tz(&tz) {
|
49
49
|
Ok(zoned) => zoned.timestamp(),
|
50
50
|
Err(_) => ts, // Fall back to UTC if timezone is invalid
|
51
51
|
}
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -63,7 +63,6 @@ files:
|
|
63
63
|
- ext/parquet/src/reader/mod.rs
|
64
64
|
- ext/parquet/src/reader/parquet_column_reader.rs
|
65
65
|
- ext/parquet/src/reader/parquet_row_reader.rs
|
66
|
-
- ext/parquet/src/ruby_integration.rs
|
67
66
|
- ext/parquet/src/ruby_reader.rs
|
68
67
|
- ext/parquet/src/types/core_types.rs
|
69
68
|
- ext/parquet/src/types/mod.rs
|
@@ -76,7 +75,6 @@ files:
|
|
76
75
|
- ext/parquet/src/writer/mod.rs
|
77
76
|
- lib/parquet.rb
|
78
77
|
- lib/parquet.rbi
|
79
|
-
- lib/parquet/parquet.so
|
80
78
|
- lib/parquet/version.rb
|
81
79
|
homepage: https://github.com/njaremko/parquet-ruby
|
82
80
|
licenses:
|