parquet 0.2.12-arm64-darwin → 0.3.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -3
- data/README.md +1 -1
- data/Rakefile +16 -0
- data/lib/parquet/3.2/parquet.bundle +0 -0
- data/lib/parquet/3.3/parquet.bundle +0 -0
- data/lib/parquet/3.4/parquet.bundle +0 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +6 -1
- metadata +14 -45
- data/Cargo.lock +0 -1449
- data/Cargo.toml +0 -3
- data/ext/parquet/Cargo.toml +0 -28
- data/ext/parquet/extconf.rb +0 -4
- data/ext/parquet/src/allocator.rs +0 -13
- data/ext/parquet/src/enumerator.rs +0 -52
- data/ext/parquet/src/header_cache.rs +0 -100
- data/ext/parquet/src/lib.rs +0 -29
- data/ext/parquet/src/reader/mod.rs +0 -44
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -214
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -157
- data/ext/parquet/src/ruby_integration.rs +0 -77
- data/ext/parquet/src/ruby_reader.rs +0 -171
- data/ext/parquet/src/types/core_types.rs +0 -75
- data/ext/parquet/src/types/mod.rs +0 -30
- data/ext/parquet/src/types/parquet_value.rs +0 -462
- data/ext/parquet/src/types/record_types.rs +0 -204
- data/ext/parquet/src/types/timestamp.rs +0 -85
- data/ext/parquet/src/types/type_conversion.rs +0 -809
- data/ext/parquet/src/types/writer_types.rs +0 -283
- data/ext/parquet/src/utils.rs +0 -148
- data/ext/parquet/src/writer/mod.rs +0 -575
@@ -1,283 +0,0 @@
|
|
1
|
-
use std::{
|
2
|
-
io::{self, Write},
|
3
|
-
str::FromStr,
|
4
|
-
sync::Arc,
|
5
|
-
};
|
6
|
-
|
7
|
-
use arrow_array::{Array, RecordBatch};
|
8
|
-
use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
|
9
|
-
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
10
|
-
use tempfile::NamedTempFile;
|
11
|
-
|
12
|
-
use crate::types::{ListField, MapField, ParquetSchemaType};
|
13
|
-
|
14
|
-
#[derive(Debug)]
|
15
|
-
pub struct SchemaField<'a> {
|
16
|
-
pub name: String,
|
17
|
-
pub type_: ParquetSchemaType<'a>,
|
18
|
-
pub format: Option<String>,
|
19
|
-
}
|
20
|
-
|
21
|
-
#[derive(Debug)]
|
22
|
-
pub struct ParquetWriteArgs<'a> {
|
23
|
-
pub read_from: Value,
|
24
|
-
pub write_to: Value,
|
25
|
-
pub schema: Vec<SchemaField<'a>>,
|
26
|
-
pub batch_size: Option<usize>,
|
27
|
-
pub flush_threshold: Option<usize>,
|
28
|
-
pub compression: Option<String>,
|
29
|
-
pub sample_size: Option<usize>,
|
30
|
-
}
|
31
|
-
|
32
|
-
/// Marker trait combining [`Write`] and [`Send`], so that writers can be
/// stored behind a single trait object (`Box<dyn SendableWrite>`).
pub trait SendableWrite: Send + Write {}

/// Blanket implementation: every type that is both `Send` and `Write`
/// automatically is a `SendableWrite`.
impl<T> SendableWrite for T where T: Send + Write {}
|
34
|
-
|
35
|
-
pub struct IoLikeValue(pub(crate) Value);
|
36
|
-
|
37
|
-
impl Write for IoLikeValue {
|
38
|
-
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
39
|
-
let ruby_bytes = RString::from_slice(buf);
|
40
|
-
|
41
|
-
let bytes_written = self
|
42
|
-
.0
|
43
|
-
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
44
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
45
|
-
|
46
|
-
Ok(bytes_written)
|
47
|
-
}
|
48
|
-
|
49
|
-
fn flush(&mut self) -> Result<(), io::Error> {
|
50
|
-
self.0
|
51
|
-
.funcall::<_, _, Value>("flush", ())
|
52
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
53
|
-
|
54
|
-
Ok(())
|
55
|
-
}
|
56
|
-
}
|
57
|
-
|
58
|
-
impl<'a> FromStr for ParquetSchemaType<'a> {
|
59
|
-
type Err = MagnusError;
|
60
|
-
|
61
|
-
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
62
|
-
match s {
|
63
|
-
"int8" => Ok(ParquetSchemaType::Int8),
|
64
|
-
"int16" => Ok(ParquetSchemaType::Int16),
|
65
|
-
"int32" => Ok(ParquetSchemaType::Int32),
|
66
|
-
"int64" => Ok(ParquetSchemaType::Int64),
|
67
|
-
"uint8" => Ok(ParquetSchemaType::UInt8),
|
68
|
-
"uint16" => Ok(ParquetSchemaType::UInt16),
|
69
|
-
"uint32" => Ok(ParquetSchemaType::UInt32),
|
70
|
-
"uint64" => Ok(ParquetSchemaType::UInt64),
|
71
|
-
"float" | "float32" => Ok(ParquetSchemaType::Float),
|
72
|
-
"double" | "float64" => Ok(ParquetSchemaType::Double),
|
73
|
-
"string" | "utf8" => Ok(ParquetSchemaType::String),
|
74
|
-
"binary" => Ok(ParquetSchemaType::Binary),
|
75
|
-
"boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
|
76
|
-
"date32" => Ok(ParquetSchemaType::Date32),
|
77
|
-
"timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
|
78
|
-
"timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
|
79
|
-
"list" => Ok(ParquetSchemaType::List(Box::new(ListField {
|
80
|
-
item_type: ParquetSchemaType::Int8,
|
81
|
-
format: None,
|
82
|
-
}))),
|
83
|
-
"map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
|
84
|
-
key_type: ParquetSchemaType::String,
|
85
|
-
value_type: ParquetSchemaType::Int8,
|
86
|
-
format: None,
|
87
|
-
}))),
|
88
|
-
_ => Err(MagnusError::new(
|
89
|
-
magnus::exception::runtime_error(),
|
90
|
-
format!("Invalid schema type: {}", s),
|
91
|
-
)),
|
92
|
-
}
|
93
|
-
}
|
94
|
-
}
|
95
|
-
|
96
|
-
impl<'a> TryConvert for ParquetSchemaType<'a> {
|
97
|
-
fn try_convert(value: Value) -> Result<Self, MagnusError> {
|
98
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
99
|
-
let schema_type = parse_string_or_symbol(&ruby, value)?;
|
100
|
-
|
101
|
-
schema_type.unwrap().parse()
|
102
|
-
}
|
103
|
-
}
|
104
|
-
|
105
|
-
// We know this type is safe to move between threads because it's just an enum
|
106
|
-
// with simple primitive types and strings
|
107
|
-
unsafe impl<'a> Send for ParquetSchemaType<'a> {}
|
108
|
-
|
109
|
-
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
|
110
|
-
if value.is_nil() {
|
111
|
-
Ok(None)
|
112
|
-
} else if value.is_kind_of(ruby.class_string()) {
|
113
|
-
RString::from_value(value)
|
114
|
-
.ok_or_else(|| {
|
115
|
-
MagnusError::new(magnus::exception::type_error(), "Invalid string value")
|
116
|
-
})?
|
117
|
-
.to_string()
|
118
|
-
.map(|s| Some(s))
|
119
|
-
} else if value.is_kind_of(ruby.class_symbol()) {
|
120
|
-
Symbol::from_value(value)
|
121
|
-
.ok_or_else(|| {
|
122
|
-
MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
|
123
|
-
})?
|
124
|
-
.funcall("to_s", ())
|
125
|
-
.map(|s| Some(s))
|
126
|
-
} else {
|
127
|
-
Err(MagnusError::new(
|
128
|
-
magnus::exception::type_error(),
|
129
|
-
"Value must be a String or Symbol",
|
130
|
-
))
|
131
|
-
}
|
132
|
-
}
|
133
|
-
|
134
|
-
pub enum WriterOutput {
|
135
|
-
File(ArrowWriter<Box<dyn SendableWrite>>),
|
136
|
-
TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
|
137
|
-
}
|
138
|
-
|
139
|
-
impl WriterOutput {
|
140
|
-
pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ParquetError> {
|
141
|
-
match self {
|
142
|
-
WriterOutput::File(writer) | WriterOutput::TempFile(writer, _) => writer.write(batch),
|
143
|
-
}
|
144
|
-
}
|
145
|
-
|
146
|
-
pub fn close(self) -> Result<Option<NamedTempFile>, ParquetError> {
|
147
|
-
match self {
|
148
|
-
WriterOutput::File(writer) => {
|
149
|
-
writer.close()?;
|
150
|
-
Ok(None)
|
151
|
-
}
|
152
|
-
WriterOutput::TempFile(writer, temp_file) => {
|
153
|
-
writer.close()?;
|
154
|
-
Ok(Some(temp_file))
|
155
|
-
}
|
156
|
-
}
|
157
|
-
}
|
158
|
-
}
|
159
|
-
|
160
|
-
pub struct ParquetErrorWrapper(pub ParquetError);
|
161
|
-
|
162
|
-
impl From<ParquetErrorWrapper> for MagnusError {
|
163
|
-
fn from(err: ParquetErrorWrapper) -> Self {
|
164
|
-
MagnusError::new(
|
165
|
-
magnus::exception::runtime_error(),
|
166
|
-
format!("Parquet error: {}", err.0),
|
167
|
-
)
|
168
|
-
}
|
169
|
-
}
|
170
|
-
|
171
|
-
pub struct ColumnCollector<'a> {
|
172
|
-
pub name: String,
|
173
|
-
pub type_: ParquetSchemaType<'a>,
|
174
|
-
pub format: Option<String>,
|
175
|
-
pub values: Vec<crate::types::ParquetValue>,
|
176
|
-
}
|
177
|
-
|
178
|
-
impl<'a> ColumnCollector<'a> {
|
179
|
-
pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
|
180
|
-
Self {
|
181
|
-
name,
|
182
|
-
type_,
|
183
|
-
format,
|
184
|
-
values: Vec::new(),
|
185
|
-
}
|
186
|
-
}
|
187
|
-
|
188
|
-
pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
|
189
|
-
use crate::types::ParquetValue;
|
190
|
-
use crate::{
|
191
|
-
convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
|
192
|
-
convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
|
193
|
-
NumericConverter,
|
194
|
-
};
|
195
|
-
|
196
|
-
if value.is_nil() {
|
197
|
-
self.values.push(ParquetValue::Null);
|
198
|
-
return Ok(());
|
199
|
-
}
|
200
|
-
|
201
|
-
let parquet_value = match &self.type_ {
|
202
|
-
ParquetSchemaType::Int8 => {
|
203
|
-
let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
|
204
|
-
ParquetValue::Int8(v)
|
205
|
-
}
|
206
|
-
ParquetSchemaType::Int16 => {
|
207
|
-
let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
|
208
|
-
ParquetValue::Int16(v)
|
209
|
-
}
|
210
|
-
ParquetSchemaType::Int32 => {
|
211
|
-
let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
|
212
|
-
ParquetValue::Int32(v)
|
213
|
-
}
|
214
|
-
ParquetSchemaType::Int64 => {
|
215
|
-
let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
|
216
|
-
ParquetValue::Int64(v)
|
217
|
-
}
|
218
|
-
ParquetSchemaType::UInt8 => {
|
219
|
-
let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
|
220
|
-
ParquetValue::UInt8(v)
|
221
|
-
}
|
222
|
-
ParquetSchemaType::UInt16 => {
|
223
|
-
let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
|
224
|
-
ParquetValue::UInt16(v)
|
225
|
-
}
|
226
|
-
ParquetSchemaType::UInt32 => {
|
227
|
-
let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
|
228
|
-
ParquetValue::UInt32(v)
|
229
|
-
}
|
230
|
-
ParquetSchemaType::UInt64 => {
|
231
|
-
let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
|
232
|
-
ParquetValue::UInt64(v)
|
233
|
-
}
|
234
|
-
ParquetSchemaType::Float => {
|
235
|
-
let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
|
236
|
-
ParquetValue::Float32(v)
|
237
|
-
}
|
238
|
-
ParquetSchemaType::Double => {
|
239
|
-
let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
|
240
|
-
ParquetValue::Float64(v)
|
241
|
-
}
|
242
|
-
ParquetSchemaType::String => {
|
243
|
-
let v = String::try_convert(value)?;
|
244
|
-
ParquetValue::String(v)
|
245
|
-
}
|
246
|
-
ParquetSchemaType::Binary => {
|
247
|
-
let v = convert_to_binary(value)?;
|
248
|
-
ParquetValue::Bytes(v)
|
249
|
-
}
|
250
|
-
ParquetSchemaType::Boolean => {
|
251
|
-
let v = convert_to_boolean(value)?;
|
252
|
-
ParquetValue::Boolean(v)
|
253
|
-
}
|
254
|
-
ParquetSchemaType::Date32 => {
|
255
|
-
let v = convert_to_date32(value, self.format.as_deref())?;
|
256
|
-
ParquetValue::Date32(v)
|
257
|
-
}
|
258
|
-
ParquetSchemaType::TimestampMillis => {
|
259
|
-
let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
|
260
|
-
ParquetValue::TimestampMillis(v, None)
|
261
|
-
}
|
262
|
-
ParquetSchemaType::TimestampMicros => {
|
263
|
-
let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
|
264
|
-
ParquetValue::TimestampMicros(v, None)
|
265
|
-
}
|
266
|
-
ParquetSchemaType::List(list_field) => {
|
267
|
-
let values = convert_to_list(value, list_field)?;
|
268
|
-
ParquetValue::List(values)
|
269
|
-
}
|
270
|
-
ParquetSchemaType::Map(map_field) => {
|
271
|
-
let map = convert_to_map(value, map_field)?;
|
272
|
-
ParquetValue::Map(map)
|
273
|
-
}
|
274
|
-
};
|
275
|
-
self.values.push(parquet_value);
|
276
|
-
Ok(())
|
277
|
-
}
|
278
|
-
|
279
|
-
pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
|
280
|
-
let values = std::mem::take(&mut self.values);
|
281
|
-
crate::convert_parquet_values_to_arrow(values, &self.type_)
|
282
|
-
}
|
283
|
-
}
|
data/ext/parquet/src/utils.rs
DELETED
@@ -1,148 +0,0 @@
|
|
1
|
-
use magnus::{
|
2
|
-
scan_args::{get_kwargs, scan_args},
|
3
|
-
value::ReprValue,
|
4
|
-
Error, RString, Ruby, Symbol, Value,
|
5
|
-
};
|
6
|
-
|
7
|
-
use crate::ParserResultType;
|
8
|
-
|
9
|
-
fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
|
10
|
-
if value.is_nil() {
|
11
|
-
Ok(None)
|
12
|
-
} else if value.is_kind_of(ruby.class_string()) {
|
13
|
-
RString::from_value(value)
|
14
|
-
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid string value"))?
|
15
|
-
.to_string()
|
16
|
-
.map(|s| Some(s))
|
17
|
-
} else if value.is_kind_of(ruby.class_symbol()) {
|
18
|
-
Symbol::from_value(value)
|
19
|
-
.ok_or_else(|| Error::new(magnus::exception::type_error(), "Invalid symbol value"))?
|
20
|
-
.funcall("to_s", ())
|
21
|
-
.map(|s| Some(s))
|
22
|
-
} else {
|
23
|
-
Err(Error::new(
|
24
|
-
magnus::exception::type_error(),
|
25
|
-
"Value must be a String or Symbol",
|
26
|
-
))
|
27
|
-
}
|
28
|
-
}
|
29
|
-
|
30
|
-
#[derive(Debug)]
|
31
|
-
pub struct ParquetRowsArgs {
|
32
|
-
pub to_read: Value,
|
33
|
-
pub result_type: ParserResultType,
|
34
|
-
pub columns: Option<Vec<String>>,
|
35
|
-
}
|
36
|
-
|
37
|
-
/// Parse common arguments for CSV parsing
|
38
|
-
pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
|
39
|
-
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
40
|
-
let (to_read,) = parsed_args.required;
|
41
|
-
|
42
|
-
let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
|
43
|
-
parsed_args.keywords,
|
44
|
-
&[],
|
45
|
-
&["result_type", "columns"],
|
46
|
-
)?;
|
47
|
-
|
48
|
-
let result_type: ParserResultType = match kwargs
|
49
|
-
.optional
|
50
|
-
.0
|
51
|
-
.flatten()
|
52
|
-
.map(|value| parse_string_or_symbol(ruby, value))
|
53
|
-
{
|
54
|
-
Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
|
55
|
-
Error::new(
|
56
|
-
magnus::exception::runtime_error(),
|
57
|
-
format!(
|
58
|
-
"Invalid result type: {e}. Must be one of {}",
|
59
|
-
ParserResultType::iter()
|
60
|
-
.map(|v| v.to_string())
|
61
|
-
.collect::<Vec<_>>()
|
62
|
-
.join(", ")
|
63
|
-
),
|
64
|
-
)
|
65
|
-
})?,
|
66
|
-
Some(Ok(None)) => ParserResultType::Hash,
|
67
|
-
Some(Err(_)) => {
|
68
|
-
return Err(Error::new(
|
69
|
-
magnus::exception::type_error(),
|
70
|
-
"result_type must be a String or Symbol",
|
71
|
-
))
|
72
|
-
}
|
73
|
-
None => ParserResultType::Hash,
|
74
|
-
};
|
75
|
-
|
76
|
-
Ok(ParquetRowsArgs {
|
77
|
-
to_read,
|
78
|
-
result_type,
|
79
|
-
columns: kwargs.optional.1.flatten(),
|
80
|
-
})
|
81
|
-
}
|
82
|
-
|
83
|
-
#[derive(Debug)]
|
84
|
-
pub struct ParquetColumnsArgs {
|
85
|
-
pub to_read: Value,
|
86
|
-
pub result_type: ParserResultType,
|
87
|
-
pub columns: Option<Vec<String>>,
|
88
|
-
pub batch_size: Option<usize>,
|
89
|
-
}
|
90
|
-
|
91
|
-
/// Parse common arguments for CSV parsing
|
92
|
-
pub fn parse_parquet_columns_args(
|
93
|
-
ruby: &Ruby,
|
94
|
-
args: &[Value],
|
95
|
-
) -> Result<ParquetColumnsArgs, Error> {
|
96
|
-
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
97
|
-
let (to_read,) = parsed_args.required;
|
98
|
-
|
99
|
-
let kwargs = get_kwargs::<
|
100
|
-
_,
|
101
|
-
(),
|
102
|
-
(
|
103
|
-
Option<Option<Value>>,
|
104
|
-
Option<Option<Vec<String>>>,
|
105
|
-
Option<Option<usize>>,
|
106
|
-
),
|
107
|
-
(),
|
108
|
-
>(
|
109
|
-
parsed_args.keywords,
|
110
|
-
&[],
|
111
|
-
&["result_type", "columns", "batch_size"],
|
112
|
-
)?;
|
113
|
-
|
114
|
-
let result_type: ParserResultType = match kwargs
|
115
|
-
.optional
|
116
|
-
.0
|
117
|
-
.flatten()
|
118
|
-
.map(|value| parse_string_or_symbol(ruby, value))
|
119
|
-
{
|
120
|
-
Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
|
121
|
-
Error::new(
|
122
|
-
magnus::exception::runtime_error(),
|
123
|
-
format!(
|
124
|
-
"Invalid result type: {e}. Must be one of {}",
|
125
|
-
ParserResultType::iter()
|
126
|
-
.map(|v| v.to_string())
|
127
|
-
.collect::<Vec<_>>()
|
128
|
-
.join(", ")
|
129
|
-
),
|
130
|
-
)
|
131
|
-
})?,
|
132
|
-
Some(Ok(None)) => ParserResultType::Hash,
|
133
|
-
Some(Err(_)) => {
|
134
|
-
return Err(Error::new(
|
135
|
-
magnus::exception::type_error(),
|
136
|
-
"result_type must be a String or Symbol",
|
137
|
-
))
|
138
|
-
}
|
139
|
-
None => ParserResultType::Hash,
|
140
|
-
};
|
141
|
-
|
142
|
-
Ok(ParquetColumnsArgs {
|
143
|
-
to_read,
|
144
|
-
result_type,
|
145
|
-
columns: kwargs.optional.1.flatten(),
|
146
|
-
batch_size: kwargs.optional.2.flatten(),
|
147
|
-
})
|
148
|
-
}
|