parquet 0.2.12-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1449 -0
- data/Cargo.toml +3 -0
- data/Gemfile +17 -0
- data/LICENSE +21 -0
- data/README.md +197 -0
- data/Rakefile +27 -0
- data/ext/parquet/Cargo.toml +28 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +52 -0
- data/ext/parquet/src/header_cache.rs +100 -0
- data/ext/parquet/src/lib.rs +29 -0
- data/ext/parquet/src/reader/mod.rs +44 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +214 -0
- data/ext/parquet/src/reader/parquet_row_reader.rs +157 -0
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +171 -0
- data/ext/parquet/src/types/core_types.rs +75 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/types/parquet_value.rs +462 -0
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +809 -0
- data/ext/parquet/src/types/writer_types.rs +283 -0
- data/ext/parquet/src/utils.rs +148 -0
- data/ext/parquet/src/writer/mod.rs +575 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +5 -0
- data/lib/parquet.rbi +113 -0
- metadata +109 -0
data/ext/parquet/src/reader/parquet_column_reader.rs
@@ -0,0 +1,214 @@
+use crate::header_cache::StringCache;
+use crate::{
+    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
+    ParquetValueVec, ParserResultType, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::IntoValue;
+use magnus::{Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+use super::ReaderError;
+
+#[inline]
+pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        })
+        .map(|yield_enum| yield_enum.into_value_with(&ruby));
+    }
+
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
+
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
+            .map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        let headers: Vec<String> = schema
+            .fields()
+            .iter()
+            .map(|field| field.name().to_string())
+            .collect();
+        let interned_headers =
+            StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
+        for field in interned_headers.iter() {
+            map.insert(*field, vec![]);
+        }
+        let record = ColumnRecord::Map(map);
+        let _: Value = ruby.yield_value(record)?;
+        return Ok(ruby.qnil().into_value_with(&ruby));
+    }
+
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader.map(move |batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let headers = headers_clone.get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string).unwrap()
+                    });
+
+                    let mut map =
+                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+
+                    batch.columns().iter().enumerate().for_each(|(i, column)| {
+                        let header = headers[i];
+                        let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                        map.insert(header, values.into_inner());
+                    });
+
+                    Ok(ColumnRecord::Map::<RandomState>(map))
+                })
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
+        ParserResultType::Array => {
+            let iter = batch_reader.map(|batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let vec = batch
+                        .columns()
+                        .into_iter()
+                        .map(|column| {
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            values.into_inner()
+                        })
+                        .collect();
+                    Ok(ColumnRecord::Vec::<RandomState>(vec))
+                })
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
+    }
+
+    Ok(ruby.qnil().into_value_with(&ruby))
+}
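
Note: the three branches above differ only in how the byte source is obtained (a path, a raw file descriptor, or an arbitrary IO object); the projection and batch-size handling is the same ParquetRecordBatchReaderBuilder flow each time. A minimal standalone sketch of that flow, assuming only the parquet crate; the file name and column names here are hypothetical:

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn read_selected_columns() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.parquet")?; // hypothetical path
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // Project only the named leaf columns before building the reader.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);
    let reader = builder.with_projection(mask).with_batch_size(1024).build()?;
    for batch in reader {
        // Each item is an Arrow RecordBatch holding up to batch_size rows.
        println!("rows in batch: {}", batch?.num_rows());
    }
    Ok(())
}
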
data/ext/parquet/src/reader/parquet_row_reader.rs
@@ -0,0 +1,157 @@
+use crate::header_cache::StringCache;
+use crate::{
+    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::IntoValue;
+use magnus::{Error as MagnusError, Ruby, Value};
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+#[inline]
+pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetRowsArgs {
+        to_read,
+        result_type,
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+        })
+        .map(|yield_enum| yield_enum.into_value_with(&ruby));
+    }
+
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    };
+
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter.map(move |row| {
+                row.and_then(|row| {
+                    let headers = headers_clone.get_or_init(|| {
+                        let column_count = row.get_column_iter().count();
+
+                        let mut header_string = Vec::with_capacity(column_count);
+                        for (k, _) in row.get_column_iter() {
+                            header_string.push(k.to_owned());
+                        }
+
+                        let headers = StringCache::intern_many(&header_string).unwrap();
+
+                        headers
+                    });
+
+                    let mut map =
+                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+                    row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                        map.insert(headers[i], ParquetField(v.clone()));
+                    });
+                    Ok(map)
+                })
+                .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
+                .map_err(|e| ReaderError::Parquet(e))
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
+        ParserResultType::Array => {
+            let iter = iter.map(|row| {
+                row.and_then(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
+                    row.get_column_iter()
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    Ok(vec)
+                })
+                .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
+                .map_err(|e| ReaderError::Parquet(e))
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
+    }
+
+    Ok(ruby.qnil().into_value_with(&ruby))
+}
+
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
+    }
+}
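
Note: for comparison with the column reader above, the row-oriented path reduces to SerializedFileReader plus RowIter, which assembles one complete record at a time. A minimal self-contained sketch, again with a hypothetical file name:

use parquet::file::reader::SerializedFileReader;
use parquet::record::reader::RowIter;
use std::fs::File;

fn iterate_rows() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.parquet")?; // hypothetical path
    let reader = SerializedFileReader::new(file)?;
    for row in RowIter::from_file_into(Box::new(reader)) {
        let row = row?; // each item is a fully assembled record
        // (name, value) pairs correspond to the row's columns in schema order.
        for (name, value) in row.get_column_iter() {
            println!("{} = {:?}", name, value);
        }
    }
    Ok(())
}
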
data/ext/parquet/src/ruby_integration.rs
@@ -0,0 +1,77 @@
+use std::{
+    fs::File,
+    io::{BufReader, SeekFrom},
+    mem::ManuallyDrop,
+};
+
+use bytes::Bytes;
+use magnus::{value::Opaque, Ruby, Value};
+use parquet::{
+    errors::ParquetError,
+    file::reader::{ChunkReader, Length},
+};
+use std::io::Read;
+
+use crate::ruby_reader::{build_ruby_reader, SeekableRead};
+
+const READ_BUFFER_SIZE: usize = 16 * 1024;
+
+pub struct SeekableRubyValue(pub Opaque<Value>);
+
+impl Length for SeekableRubyValue {
+    fn len(&self) -> u64 {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
+        let file_len = reader.seek(SeekFrom::End(0)).unwrap();
+        reader.seek(SeekFrom::Start(current_pos)).unwrap();
+        file_len
+    }
+}
+
+impl ChunkReader for SeekableRubyValue {
+    type T = BufReader<Box<dyn SeekableRead>>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut buffer = Vec::with_capacity(length);
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        let read = reader.take(length as _).read_to_end(&mut buffer)?;
+
+        if read != length {
+            return Err(ParquetError::EOF(format!(
+                "Expected to read {} bytes, read only {}",
+                length, read
+            )));
+        }
+        Ok(buffer.into())
+    }
+}
+
+pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
+
+impl Length for ForgottenFileHandle {
+    fn len(&self) -> u64 {
+        self.0.len()
+    }
+}
+
+impl ChunkReader for ForgottenFileHandle {
+    type T = BufReader<File>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        self.0.get_read(start)
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        self.0.get_bytes(start, length)
+    }
+}
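
Note: ChunkReader plus Length is the entire contract the parquet crate needs from a byte source, which is what makes wrapping a Ruby IO possible at all. A minimal in-memory implementation, shown here only as a sketch of the same trait shape (the parquet crate already ships a ChunkReader for Bytes, so this is purely illustrative):

use bytes::{Buf, Bytes};
use parquet::errors::ParquetError;
use parquet::file::reader::{ChunkReader, Length};

struct InMemory(Bytes);

impl Length for InMemory {
    fn len(&self) -> u64 {
        self.0.len() as u64
    }
}

impl ChunkReader for InMemory {
    type T = bytes::buf::Reader<Bytes>;

    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
        // Hand back a Read positioned at `start`.
        Ok(self.0.slice(start as usize..).reader())
    }

    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
        let end = start as usize + length;
        if end > self.0.len() {
            return Err(ParquetError::EOF("read past end of buffer".to_string()));
        }
        Ok(self.0.slice(start as usize..end))
    }
}
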
data/ext/parquet/src/ruby_reader.rs
@@ -0,0 +1,171 @@
+use magnus::{
+    value::{Opaque, ReprValue},
+    RClass, RString, Ruby, Value,
+};
+use std::io::{self, Read, Seek, SeekFrom, Write};
+use std::sync::OnceLock;
+
+static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
+
+/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
+/// and provide a standard Read implementation for them.
+pub struct RubyReader<T> {
+    inner: T,
+    offset: usize,
+}
+
+pub trait SeekableRead: Read + Seek {}
+impl<T: Read + Seek> SeekableRead for T {}
+
+pub fn build_ruby_reader(
+    ruby: &Ruby,
+    input: Value,
+) -> Result<Box<dyn SeekableRead>, magnus::Error> {
+    if RubyReader::is_string_io(ruby, &input) {
+        RubyReader::from_string_io(ruby, input)
+    } else if RubyReader::is_io_like(&input) {
+        RubyReader::from_io(input)
+    } else {
+        RubyReader::from_string_like(input)
+    }
+}
+
+impl Seek for RubyReader<Value> {
+    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
+        let (whence, offset) = match pos {
+            SeekFrom::Start(i) => (0, i as i64),
+            SeekFrom::Current(i) => (1, i),
+            SeekFrom::End(i) => (2, i),
+        };
+
+        let new_position = self
+            .inner
+            .funcall("seek", (offset, whence))
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        Ok(new_position)
+    }
+}
+
+impl Write for RubyReader<Value> {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
+        let ruby_bytes = RString::from_slice(buf);
+
+        let bytes_written = self
+            .inner
+            .funcall::<_, _, usize>("write", (ruby_bytes,))
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        Ok(bytes_written)
+    }
+
+    fn flush(&mut self) -> Result<(), io::Error> {
+        self.inner
+            .funcall::<_, _, Value>("flush", ())
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        Ok(())
+    }
+}
+
+impl Seek for RubyReader<RString> {
+    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
+        match pos {
+            io::SeekFrom::Start(offset) => self.offset = offset as usize,
+            io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
+            io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
+        }
+        Ok(self.offset as u64)
+    }
+}
+
+impl RubyReader<Value> {
+    fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
+        if Self::is_io_like(&input) {
+            Ok(Box::new(Self::from_io_like(input)))
+        } else {
+            Err(magnus::Error::new(
+                magnus::exception::type_error(),
+                "Input is not an IO-like object",
+            ))
+        }
+    }
+
+    fn is_io_like(input: &Value) -> bool {
+        input.respond_to("read", false).unwrap_or(false)
+    }
+
+    fn from_io_like(input: Value) -> Self {
+        Self {
+            inner: input,
+            offset: 0,
+        }
+    }
+}
+
+impl RubyReader<RString> {
+    pub fn from_string_io(
+        ruby: &Ruby,
+        input: Value,
+    ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
+        if !Self::is_string_io(ruby, &input) {
+            return Err(magnus::Error::new(
+                magnus::exception::type_error(),
+                "Input is not a StringIO",
+            ));
+        }
+
+        let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
+        Ok(Box::new(Self {
+            inner: string_content,
+            offset: 0,
+        }))
+    }
+
+    fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
+        let string_io_class = STRING_IO_CLASS.get_or_init(|| {
+            let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
+            Opaque::from(class)
+        });
+        input.is_kind_of(ruby.get_inner(*string_io_class))
+    }
+
+    fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
+        // Try calling `to_str`, and if that fails, try `to_s`
+        let string_content = input
+            .funcall::<_, _, RString>("to_str", ())
+            .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
+        Ok(Box::new(Self {
+            inner: string_content,
+            offset: 0,
+        }))
+    }
+}
+
+impl Read for RubyReader<Value> {
+    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        let bytes = self
+            .inner
+            .funcall::<_, _, RString>("read", (buf.len(),))
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        buf.write_all(unsafe { bytes.as_slice() })?;
+
+        Ok(bytes.len())
+    }
+}
+
+impl Read for RubyReader<RString> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        let string_buffer = unsafe { self.inner.as_slice() };
+        if self.offset >= string_buffer.len() {
+            return Ok(0); // EOF
+        }
+
+        let remaining = string_buffer.len() - self.offset;
+        let copy_size = remaining.min(buf.len());
+        buf[..copy_size].copy_from_slice(&string_buffer[self.offset..self.offset + copy_size]);
+        self.offset += copy_size;
+        Ok(copy_size)
+    }
+}
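
Note: the RubyReader<RString> half above is essentially a hand-rolled std::io::Cursor over the string's bytes. The standard-library equivalent below shows the same Read + Seek contract the boxed trait object must satisfy; the byte string is illustrative only:

use std::io::{Cursor, Read, Seek, SeekFrom};

fn cursor_demo() -> std::io::Result<()> {
    let mut reader = Cursor::new(b"PAR1...file bytes...PAR1".to_vec());
    reader.seek(SeekFrom::Start(4))?; // skip past the 4-byte "PAR1" magic
    let mut buf = [0u8; 8];
    let n = reader.read(&mut buf)?; // reads at most 8 bytes from that offset
    println!("read {} bytes: {:?}", n, &buf[..n]);
    Ok(())
}
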
data/ext/parquet/src/types/core_types.rs
@@ -0,0 +1,75 @@
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ListField<'a> {
+    pub item_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
+}
+
+#[derive(Debug, Clone)]
+pub struct MapField<'a> {
+    pub key_type: ParquetSchemaType<'a>,
+    pub value_type: ParquetSchemaType<'a>,
+    pub format: Option<&'a str>,
+}
+
+#[derive(Debug, Clone)]
+pub enum ParquetSchemaType<'a> {
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Float,
+    Double,
+    String,
+    Binary,
+    Boolean,
+    Date32,
+    TimestampMillis,
+    TimestampMicros,
+    List(Box<ListField<'a>>),
+    Map(Box<MapField<'a>>),
+}