parquet 0.2.12 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +1 -1
- data/ext/parquet/src/reader/mod.rs +4 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +43 -24
- data/ext/parquet/src/reader/parquet_row_reader.rs +7 -4
- data/ext/parquet/src/types/parquet_value.rs +157 -118
- data/ext/parquet/src/types/record_types.rs +91 -77
- data/ext/parquet/src/types/timestamp.rs +4 -5
- data/ext/parquet/src/types/type_conversion.rs +2 -2
- data/ext/parquet/src/utils.rs +19 -3
- data/lib/parquet/parquet.so +0 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0cf24938c23cee5bc8ed4049e2b3fee7794cb619755e26cf83d4bb8826ebccd7
+  data.tar.gz: 85f55738e3503729535de7854d7438bca69f0b82e648471c285a3eefdb51a69b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 17eaa053e7c05605d63c84786958f2980817509a6ba165654bfe50459cc30a37553671cc8b57a70831c255e463b26fc2768afcee3621664b443f9e0e67dc4460
+  data.tar.gz: e1a90f2683fce4a10b489eba3b0d98754ebeed9c418bd274e1af38c8fd9b5ad50f1e4ce72568f9c98001db408367c9610723f5c24796c801a1eaed4c23377d42
data/ext/parquet/src/enumerator.rs
CHANGED
@@ -6,6 +6,7 @@ pub struct RowEnumeratorArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Creates an enumerator for lazy Parquet row parsing
@@ -18,6 +19,9 @@ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerat
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
         .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
@@ -29,6 +33,7 @@ pub struct ColumnEnumeratorArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 #[inline]
@@ -46,6 +51,9 @@ pub fn create_column_enumerator(
     if let Some(batch_size) = args.batch_size {
        kwargs.aset(Symbol::new("batch_size"), batch_size)?;
     }
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
         .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
data/ext/parquet/src/reader/mod.rs
CHANGED
@@ -26,6 +26,10 @@ pub enum ReaderError {
     Parquet(#[from] parquet::errors::ParquetError),
     #[error("Arrow error: {0}")]
     Arrow(#[from] arrow_schema::ArrowError),
+    #[error("UTF-8 error: {0}")]
+    Utf8Error(#[from] simdutf8::basic::Utf8Error),
+    #[error("Jiff error: {0}")]
+    Jiff(#[from] jiff::Error),
 }
 
 impl From<MagnusError> for ReaderError {
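The `#[from]` attributes are what make the rest of this release's `?` operators work: thiserror generates `From` impls, so errors from `simdutf8::basic::from_utf8(...)` and the `jiff::Timestamp` constructors convert into `ReaderError` automatically. A self-contained sketch with a stand-in error type, not the gem's real enum:

    use thiserror::Error;

    #[derive(Debug, Error)]
    enum DemoError {
        #[error("UTF-8 error: {0}")]
        Utf8(#[from] std::str::Utf8Error),
    }

    // `?` compiles because thiserror derives From<std::str::Utf8Error> for DemoError.
    fn decode(bytes: &[u8]) -> Result<&str, DemoError> {
        Ok(std::str::from_utf8(bytes)?)
    }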
data/ext/parquet/src/reader/parquet_column_reader.rs
CHANGED
@@ -1,4 +1,5 @@
 use crate::header_cache::StringCache;
+use crate::types::{ArrayWrapper, TryIntoValue};
 use crate::{
     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
     ParquetValueVec, ParserResultType, SeekableRubyValue,
@@ -27,6 +28,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
         result_type,
         columns,
         batch_size,
+        strict,
     } = parse_parquet_columns_args(&ruby, args)?;
 
     if !ruby.block_given() {
@@ -36,6 +38,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
             result_type,
             columns,
             batch_size,
+            strict,
         })
         .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
@@ -150,7 +153,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
             map.insert(*field, vec![]);
         }
         let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record)?;
+        let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
         return Ok(ruby.qnil().into_value_with(&ruby));
     }
 
@@ -160,24 +163,37 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
             let headers_clone = headers.clone();
             let iter = batch_reader.map(move |batch| {
                 batch.map_err(ReaderError::Arrow).and_then(|batch| {
-                    let
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    let local_headers = headers_clone
+                        .get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string)
+                        })
+                        .as_ref()
+                        .map_err(|e| ReaderError::HeaderIntern(e.clone()))?;
+
+                    let mut map = HashMap::with_capacity_and_hasher(
+                        local_headers.len(),
+                        RandomState::default(),
+                    );
+
+                    batch
+                        .columns()
+                        .iter()
+                        .enumerate()
+                        .try_for_each(|(i, column)| {
+                            let header = local_headers[i];
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            map.insert(header, values.into_inner());
+                            Ok::<_, ReaderError>(())
+                        })?;
 
                     Ok(ColumnRecord::Map::<RandomState>(map))
                 })
@@ -185,7 +201,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
 
             for result in iter {
                 let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
             }
         }
         ParserResultType::Array => {
@@ -195,17 +211,20 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                         .columns()
                         .into_iter()
                         .map(|column| {
-                            let values = ParquetValueVec::try_from(
-
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            Ok::<_, ReaderError>(values.into_inner())
                         })
-                        .collect()
+                        .collect::<Result<Vec<_>, _>>()?;
                     Ok(ColumnRecord::Vec::<RandomState>(vec))
                 })
             });
 
            for result in iter {
                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
            }
        }
    }
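Two patterns recur above: conversions that used to be infallible now return `Result`, and the Array branch swaps a bare `.collect()` for `.collect::<Result<Vec<_>, _>>()?`, which short-circuits on the first conversion error instead of dropping it. A standalone sketch of that collect behavior:

    // Collecting an iterator of Results into Result<Vec<_>, _> stops at the first Err.
    fn parse_all(inputs: &[&str]) -> Result<Vec<i64>, std::num::ParseIntError> {
        inputs.iter().map(|s| s.parse::<i64>()).collect::<Result<Vec<_>, _>>()
    }

    fn main() {
        assert_eq!(parse_all(&["1", "2"]), Ok(vec![1, 2]));
        assert!(parse_all(&["1", "x"]).is_err()); // one bad element aborts the whole collect
    }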
data/ext/parquet/src/reader/parquet_row_reader.rs
CHANGED
@@ -1,4 +1,5 @@
 use crate::header_cache::StringCache;
+use crate::types::TryIntoValue;
 use crate::{
     create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
     ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
@@ -25,6 +26,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
         to_read,
         result_type,
         columns,
+        strict,
     } = parse_parquet_rows_args(&ruby, args)?;
 
     if !ruby.block_given() {
@@ -33,6 +35,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
             to_read,
             result_type,
             columns,
+            strict,
         })
         .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
@@ -102,7 +105,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
                     let mut map =
                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
                     row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                        map.insert(headers[i], ParquetField(v.clone()));
+                        map.insert(headers[i], ParquetField(v.clone(), strict));
                     });
                     Ok(map)
                 })
@@ -112,7 +115,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
 
                 for result in iter {
                     let record = result?;
-                    let _: Value = ruby.yield_value(record)?;
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
                 }
             }
             ParserResultType::Array => {
@@ -121,7 +124,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
                     let column_count = row.get_column_iter().count();
                     let mut vec = Vec::with_capacity(column_count);
                     row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone(), strict)));
                     Ok(vec)
                 })
                 .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
@@ -130,7 +133,7 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
 
                 for result in iter {
                     let record = result?;
-                    let _: Value = ruby.yield_value(record)?;
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
                 }
             }
         }
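`ParquetField` grows from a one-field tuple struct to `ParquetField(Field, bool)`, so the strict flag travels with each cloned value until conversion happens at yield time. In miniature, with a stand-in `Field` type rather than the parquet crate's:

    #[derive(Clone, Debug)]
    enum Field {
        Str(String),
        Int(i64),
    }

    // (value, strict) — the flag rides along with every field.
    struct ParquetField(Field, bool);

    fn wrap_row(row: Vec<Field>, strict: bool) -> Vec<ParquetField> {
        row.into_iter().map(|v| ParquetField(v, strict)).collect()
    }

    fn main() {
        let wrapped = wrap_row(vec![Field::Int(1), Field::Str("a".into())], true);
        assert!(wrapped.iter().all(|f| f.1)); // every field remembers strict mode
    }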
data/ext/parquet/src/types/parquet_value.rs
CHANGED
@@ -1,4 +1,7 @@
-use crate::{
+use crate::{
+    impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion,
+    reader::ReaderError,
+};
 
 use super::*;
 
@@ -103,23 +106,23 @@ impl std::hash::Hash for ParquetValue {
     }
 }
 
-impl
-    fn
+impl TryIntoValue for ParquetValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
-            ParquetValue::Int8(i) => i.into_value_with(handle),
-            ParquetValue::Int16(i) => i.into_value_with(handle),
-            ParquetValue::Int32(i) => i.into_value_with(handle),
-            ParquetValue::Int64(i) => i.into_value_with(handle),
-            ParquetValue::UInt8(i) => i.into_value_with(handle),
-            ParquetValue::UInt16(i) => i.into_value_with(handle),
-            ParquetValue::UInt32(i) => i.into_value_with(handle),
-            ParquetValue::UInt64(i) => i.into_value_with(handle),
-            ParquetValue::Float16(f) => f.into_value_with(handle),
-            ParquetValue::Float32(f) => f.into_value_with(handle),
-            ParquetValue::Float64(f) => f.into_value_with(handle),
-            ParquetValue::Boolean(b) => b.into_value_with(handle),
-            ParquetValue::String(s) => s.into_value_with(handle),
-            ParquetValue::Bytes(b) => handle.str_from_slice(&b).as_value(),
+            ParquetValue::Int8(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int16(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int32(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Int64(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt8(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt16(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt32(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::UInt64(i) => Ok(i.into_value_with(handle)),
+            ParquetValue::Float16(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Float32(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Float64(f) => Ok(f.into_value_with(handle)),
+            ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
+            ParquetValue::String(s) => Ok(s.into_value_with(handle)),
+            ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
             ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
             ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
             timestamp @ ParquetValue::TimestampSecond(_, _) => {
@@ -136,21 +139,23 @@ impl IntoValue for ParquetValue {
             }
             ParquetValue::List(l) => {
                 let ary = handle.ary_new_capa(l.len());
-                l.into_iter()
-
-
-
+                l.into_iter().try_for_each(|v| {
+                    ary.push(v.try_into_value_with(handle)?)?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ParquetValue::Map(m) => {
                 let hash = handle.hash_new_capa(m.len());
-                m.into_iter()
-                    .
-
-
-
-
-
+                m.into_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.try_into_value_with(handle)?,
+                        v.try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
+            }
+            ParquetValue::Null => Ok(handle.qnil().as_value()),
         }
     }
 }
@@ -260,18 +265,10 @@ impl std::cmp::PartialEq for ParquetValueVec {
 
 impl std::cmp::Eq for ParquetValueVec {}
 
-impl TryFrom<Arc<dyn Array>> for ParquetValueVec {
-    type Error = String;
-
-    fn try_from(column: Arc<dyn Array>) -> Result<Self, Self::Error> {
-        ParquetValueVec::try_from(&*column)
-    }
-}
-
 macro_rules! impl_numeric_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -290,13 +287,13 @@ macro_rules! impl_numeric_array_conversion {
             .iter()
             .map(|x| ParquetValue::$variant(*x))
             .collect()
-        }
+        }))
     }};
 }
 macro_rules! impl_boolean_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -315,34 +312,50 @@ macro_rules! impl_boolean_array_conversion {
             .iter()
             .map(|x| ParquetValue::$variant(x))
             .collect()
-        }
+        }))
     }};
 }
 
-
-
+pub struct ArrayWrapper<'a> {
+    pub array: &'a dyn Array,
+    pub strict: bool,
+}
+
+impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
+    type Error = ReaderError;
 
-    fn try_from(column:
-
-            DataType::Boolean =>
-
-
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-            DataType::
-
+    fn try_from(column: ArrayWrapper<'a>) -> Result<Self, Self::Error> {
+        match column.array.data_type() {
+            DataType::Boolean => {
+                impl_boolean_array_conversion!(column.array, BooleanArray, Boolean)
+            }
+            DataType::Int8 => impl_numeric_array_conversion!(column.array, Int8Array, Int8),
+            DataType::Int16 => impl_numeric_array_conversion!(column.array, Int16Array, Int16),
+            DataType::Int32 => impl_numeric_array_conversion!(column.array, Int32Array, Int32),
+            DataType::Int64 => impl_numeric_array_conversion!(column.array, Int64Array, Int64),
+            DataType::UInt8 => impl_numeric_array_conversion!(column.array, UInt8Array, UInt8),
+            DataType::UInt16 => impl_numeric_array_conversion!(column.array, UInt16Array, UInt16),
+            DataType::UInt32 => impl_numeric_array_conversion!(column.array, UInt32Array, UInt32),
+            DataType::UInt64 => impl_numeric_array_conversion!(column.array, UInt64Array, UInt64),
+            DataType::Float32 => {
+                impl_numeric_array_conversion!(column.array, Float32Array, Float32)
+            }
+            DataType::Float64 => {
+                impl_numeric_array_conversion!(column.array, Float64Array, Float64)
+            }
+            DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
+            DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
             DataType::Timestamp(TimeUnit::Second, tz) => {
-                impl_timestamp_array_conversion!(
+                impl_timestamp_array_conversion!(
+                    column.array,
+                    TimestampSecondArray,
+                    TimestampSecond,
+                    tz
+                )
             }
             DataType::Timestamp(TimeUnit::Millisecond, tz) => {
                 impl_timestamp_array_conversion!(
-                    column,
+                    column.array,
                     TimestampMillisecondArray,
                     TimestampMillis,
                     tz
@@ -350,7 +363,7 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
             }
             DataType::Timestamp(TimeUnit::Microsecond, tz) => {
                 impl_timestamp_array_conversion!(
-                    column,
+                    column.array,
                     TimestampMicrosecondArray,
                     TimestampMicros,
                     tz
@@ -358,72 +371,93 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
             }
             DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
                 impl_timestamp_array_conversion!(
-                    column,
+                    column.array,
                     TimestampNanosecondArray,
                    TimestampNanos,
                    tz
                )
             }
             DataType::Float16 => {
-                let array = downcast_array::<Float16Array>(column);
+                let array = downcast_array::<Float16Array>(column.array);
                 if array.is_nullable() {
-
-
-
-
-
-
-
-
-
-
-
-
+                    Ok(ParquetValueVec(
+                        array
+                            .values()
+                            .iter()
+                            .enumerate()
+                            .map(|(i, x)| {
+                                if array.is_null(i) {
+                                    ParquetValue::Null
+                                } else {
+                                    ParquetValue::Float16(f32::from(*x))
+                                }
+                            })
+                            .collect(),
+                    ))
                 } else {
-
-
-
-
-
+                    Ok(ParquetValueVec(
+                        array
+                            .values()
+                            .iter()
+                            .map(|x| ParquetValue::Float16(f32::from(*x)))
+                            .collect(),
+                    ))
                 }
             }
             DataType::Utf8 => {
-                let array = downcast_array::<StringArray>(column);
-                array
-
-
-
-
-
-
+                let array = downcast_array::<StringArray>(column.array);
+                let mut tmp_vec = Vec::with_capacity(array.len());
+                let iter = array.iter().map(|opt_x| match opt_x {
+                    Some(x) => {
+                        if column.strict {
+                            Ok::<_, ReaderError>(ParquetValue::String(
+                                simdutf8::basic::from_utf8(x.as_bytes())?.to_string(),
+                            ))
+                        } else {
+                            Ok::<_, ReaderError>(ParquetValue::String(x.to_string()))
+                        }
+                    }
+                    None => Ok(ParquetValue::Null),
+                });
+                for x in iter {
+                    tmp_vec.push(x?);
+                }
+                Ok(ParquetValueVec(tmp_vec))
             }
             DataType::Binary => {
-                let array = downcast_array::<BinaryArray>(column);
-
-
-
-
-
-
-
+                let array = downcast_array::<BinaryArray>(column.array);
+                Ok(ParquetValueVec(
+                    array
+                        .iter()
+                        .map(|opt_x| match opt_x {
+                            Some(x) => ParquetValue::Bytes(x.to_vec()),
+                            None => ParquetValue::Null,
+                        })
+                        .collect(),
+                ))
             }
             DataType::List(_field) => {
-                let list_array = downcast_array::<ListArray>(column);
-
-
-
-
-
-
-
-            }
-
-
-
-
+                let list_array = downcast_array::<ListArray>(column.array);
+                Ok(ParquetValueVec(
+                    list_array
+                        .iter()
+                        .map(|x| match x {
+                            Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*values,
+                                strict: column.strict,
+                            }) {
+                                Ok(vec) => ParquetValue::List(vec.into_inner()),
+                                Err(e) => {
+                                    panic!("Error converting list array to ParquetValueVec: {}", e)
+                                }
+                            },
+                            None => ParquetValue::Null,
+                        })
+                        .collect(),
+                ))
             }
             DataType::Struct(_) => {
-                let struct_array = downcast_array::<StructArray>(column);
+                let struct_array = downcast_array::<StructArray>(column.array);
                 let mut values = Vec::with_capacity(struct_array.len());
                 for i in 0..struct_array.len() {
                     if struct_array.is_null(i) {
@@ -433,8 +467,11 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
 
                     let mut map = std::collections::HashMap::new();
                     for (field_idx, field) in struct_array.fields().iter().enumerate() {
-                        let
-                        let field_values = match ParquetValueVec::try_from(
+                        let c = struct_array.column(field_idx);
+                        let field_values = match ParquetValueVec::try_from(ArrayWrapper {
+                            array: &*c.slice(i, 1),
+                            strict: column.strict,
+                        }) {
                             Ok(vec) => vec.into_inner(),
                             Err(e) => {
                                 panic!("Error converting struct field to ParquetValueVec: {}", e)
@@ -447,16 +484,18 @@ impl TryFrom<&dyn Array> for ParquetValueVec {
                     }
                     values.push(ParquetValue::Map(map));
                 }
-                values
+                Ok(ParquetValueVec(values))
             }
             DataType::Null => {
-                let x = downcast_array::<NullArray>(column);
-                vec![ParquetValue::Null; x.len()]
+                let x = downcast_array::<NullArray>(column.array);
+                Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
             }
             _ => {
-                return Err(format!(
+                return Err(ReaderError::Ruby(format!(
+                    "Unsupported data type: {:?}",
+                    column.array.data_type()
+                )));
             }
-            }
-        Ok(ParquetValueVec(tmp_vec))
+        }
     }
 }
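The Utf8 branch is where the new flag actually bites: with `strict: true`, the bytes are validated with simdutf8 and invalid UTF-8 surfaces as a `ReaderError` instead of being passed through. A minimal sketch of the two paths, assuming the simdutf8 crate (the lossy fallback here is illustrative only; the gem's non-strict path hands the string to Ruby without this substitution):

    fn read_string(bytes: &[u8], strict: bool) -> Result<String, simdutf8::basic::Utf8Error> {
        if strict {
            // Validate eagerly; invalid UTF-8 becomes an error the caller can raise.
            Ok(simdutf8::basic::from_utf8(bytes)?.to_string())
        } else {
            // Illustrative non-strict path: never fails, substitutes U+FFFD.
            Ok(String::from_utf8_lossy(bytes).into_owned())
        }
    }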
data/ext/parquet/src/types/record_types.rs
CHANGED
@@ -1,5 +1,7 @@
 use itertools::Itertools;
 
+use crate::reader::ReaderError;
+
 use super::*;
 
 #[derive(Debug)]
@@ -15,15 +17,16 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
 }
 
 #[derive(Debug)]
-pub struct ParquetField(pub Field);
+pub struct ParquetField(pub Field, pub bool);
 
-impl<S: BuildHasher + Default>
-    fn
+impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             RowRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-
+                vec.into_iter()
+                    .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                Ok(handle.into_value(ary))
             }
             RowRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -36,41 +39,41 @@ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i])
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
-                        values[i + 1] =
+                        values[i + 1] = v.try_into_value_with(handle)?;
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i])
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-impl<S: BuildHasher + Default>
-    fn
+impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             ColumnRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .
-
-
-
-
-
-                ary.into_value_with(handle)
+                vec.into_iter().try_for_each(|v| {
+                    let nested_ary = handle.ary_new_capa(v.len());
+                    v.into_iter()
+                        .try_for_each(|v| nested_ary.push(v.try_into_value_with(handle)?))?;
+                    ary.push(nested_ary.into_value_with(handle))?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ColumnRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -83,91 +86,98 @@ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i])
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
                         let ary = handle.ary_new_capa(v.len());
-                        v.into_iter()
+                        v.into_iter()
+                            .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
                         values[i + 1] = handle.into_value(ary);
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i])
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-
-    fn
+pub trait TryIntoValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError>;
+}
+
+impl TryIntoValue for ParquetField {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self.0 {
-            Field::Null => handle.qnil().as_value(),
-            Field::Bool(b) => b.into_value_with(handle),
-            Field::Short(s) => s.into_value_with(handle),
-            Field::Int(i) => i.into_value_with(handle),
-            Field::Long(l) => l.into_value_with(handle),
-            Field::UByte(ub) => ub.into_value_with(handle),
-            Field::UShort(us) => us.into_value_with(handle),
-            Field::UInt(ui) => ui.into_value_with(handle),
-            Field::ULong(ul) => ul.into_value_with(handle),
-            Field::Float16(f) => f32::from(f).into_value_with(handle),
-            Field::Float(f) => f.into_value_with(handle),
-            Field::Double(d) => d.into_value_with(handle),
-            Field::Str(s) =>
-
-
+            Field::Null => Ok(handle.qnil().as_value()),
+            Field::Bool(b) => Ok(b.into_value_with(handle)),
+            Field::Short(s) => Ok(s.into_value_with(handle)),
+            Field::Int(i) => Ok(i.into_value_with(handle)),
+            Field::Long(l) => Ok(l.into_value_with(handle)),
+            Field::UByte(ub) => Ok(ub.into_value_with(handle)),
+            Field::UShort(us) => Ok(us.into_value_with(handle)),
+            Field::UInt(ui) => Ok(ui.into_value_with(handle)),
+            Field::ULong(ul) => Ok(ul.into_value_with(handle)),
+            Field::Float16(f) => Ok(f32::from(f).into_value_with(handle)),
+            Field::Float(f) => Ok(f.into_value_with(handle)),
+            Field::Double(d) => Ok(d.into_value_with(handle)),
+            Field::Str(s) => {
+                if self.1 {
+                    Ok(simdutf8::basic::from_utf8(s.as_bytes())
+                        .map_err(|e| ReaderError::Utf8Error(e))
+                        .and_then(|s| Ok(s.into_value_with(handle)))?)
+                } else {
+                    Ok(handle.str_from_slice(s.as_bytes()).as_value())
+                }
+            }
+            Field::Byte(b) => Ok(b.into_value_with(handle)),
+            Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
             Field::Date(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400)
+                let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
                 let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
+                Ok(formatted.into_value_with(handle))
             }
             Field::TimestampMillis(ts) => {
-                let ts = jiff::Timestamp::from_millisecond(ts)
+                let ts = jiff::Timestamp::from_millisecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::TimestampMicros(ts) => {
-                let ts = jiff::Timestamp::from_microsecond(ts)
+                let ts = jiff::Timestamp::from_microsecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::ListInternal(list) => {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
-                elements
-                    .
-
-
-                ary.into_value_with(handle)
+                elements.iter().try_for_each(|e| {
+                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             Field::MapInternal(map) => {
                 let entries = map.entries();
                 let hash = handle.hash_new_capa(entries.len());
-                entries
-                    .
-
-
-
-
-
-                })
-                .unwrap();
-                hash.into_value_with(handle)
+                entries.iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
             Field::Decimal(d) => {
                 let value = match d {
@@ -185,20 +195,24 @@ impl IntoValue for ParquetField {
                         format!("{}e-{}", unscaled, scale)
                     }
                 };
-                handle.eval(&format!("BigDecimal(\"{value}\")"))
+                Ok(handle.eval(&format!("BigDecimal(\"{value}\")"))?)
             }
             Field::Group(row) => {
                 let hash = handle.hash_new();
-                row.get_column_iter()
-                    .
-
-
-
-
-                .unwrap();
-                hash.into_value_with(handle)
+                row.get_column_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.clone().into_value_with(handle),
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
+
+// impl IntoValue for ParquetField {
+//     fn into_value_with(self, handle: &Ruby) -> Value {
+//         self.try_into_value_with(handle).unwrap()
+//     }
+// }
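The new `TryIntoValue` trait is the core of this release: conversion to a Ruby `Value` is now fallible, so the `unwrap()` calls in the old `IntoValue` impls (left commented out above) become propagated `ReaderError`s. The migration in miniature, with stand-in types rather than magnus's real ones:

    struct Ruby; // stand-in for magnus::Ruby
    #[derive(Debug)]
    struct Value(String); // stand-in for magnus::Value
    #[derive(Debug)]
    struct ReaderError(String); // stand-in for the gem's error enum

    trait TryIntoValue {
        fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError>;
    }

    impl TryIntoValue for Vec<u8> {
        fn try_into_value_with(self, _handle: &Ruby) -> Result<Value, ReaderError> {
            // What used to be an unwrap() now propagates as an error.
            let s = String::from_utf8(self).map_err(|e| ReaderError(e.to_string()))?;
            Ok(Value(s))
        }
    }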
data/ext/parquet/src/types/timestamp.rs
CHANGED
@@ -64,10 +64,9 @@ macro_rules! impl_timestamp_conversion {
         ParquetValue::$unit(ts, tz) => {
             let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
             let time_class = $handle.class_time();
-            time_class
-                .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                .
-                .into_value_with($handle)
+            Ok(time_class
+                .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                .into_value_with($handle))
         }
         _ => panic!("Invalid timestamp type"),
     }
@@ -80,6 +79,6 @@ macro_rules! impl_date_conversion {
     ($value:expr, $handle:expr) => {{
         let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
         let formatted = ts.strftime("%Y-%m-%d").to_string();
-        formatted.into_value_with($handle)
+        Ok(formatted.into_value_with($handle))
     }};
 }
data/ext/parquet/src/types/type_conversion.rs
CHANGED
@@ -419,7 +419,7 @@ macro_rules! impl_timestamp_to_arrow_conversion {
 macro_rules! impl_timestamp_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -438,7 +438,7 @@ macro_rules! impl_timestamp_array_conversion {
             .iter()
             .map(|x| ParquetValue::$variant(*x, $tz.clone().map(|s| s.into())))
             .collect()
-        }
+        }))
     }};
 }
 
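Wrapping each macro expansion in `Ok(ParquetValueVec(...))` keeps every arm of the `TryFrom` match returning the same `Result` type. A tiny sketch of a macro whose expansion is a `Result` expression:

    macro_rules! convert {
        ($v:expr) => {{
            Ok::<_, String>($v * 2)
        }};
    }

    fn run(x: i32) -> Result<i32, String> {
        match x {
            0 => Err("zero not supported".to_string()),
            n => convert!(n), // expands to an Ok(...) expression, matching the Err arm's type
        }
    }

    fn main() {
        assert_eq!(run(3), Ok(6));
        assert!(run(0).is_err());
    }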
data/ext/parquet/src/utils.rs
CHANGED
@@ -32,6 +32,7 @@ pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -39,10 +40,19 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<bool>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns"],
+        &["result_type", "columns", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -73,10 +83,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         None => ParserResultType::Hash,
     };
 
+    let strict = kwargs.optional.2.flatten().unwrap_or(false);
+
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
         columns: kwargs.optional.1.flatten(),
+        strict,
     })
 }
 
@@ -86,6 +99,7 @@ pub struct ParquetColumnsArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -103,12 +117,13 @@ pub fn parse_parquet_columns_args(
             Option<Option<Value>>,
             Option<Option<Vec<String>>>,
             Option<Option<usize>>,
+            Option<Option<bool>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns", "batch_size"],
+        &["result_type", "columns", "batch_size", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -144,5 +159,6 @@ pub fn parse_parquet_columns_args(
         result_type,
         columns: kwargs.optional.1.flatten(),
         batch_size: kwargs.optional.2.flatten(),
+        strict: kwargs.optional.3.flatten().unwrap_or(false),
     })
 }
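The `Option<Option<T>>` shape in `get_kwargs` distinguishes an omitted kwarg (outer `None`) from one passed explicitly as nil (inner `None`); `.flatten().unwrap_or(false)` collapses both to the default. Behavior sketch in plain Rust:

    fn resolve_strict(kwarg: Option<Option<bool>>) -> bool {
        // Absent and nil both default to false; only `strict: true` enables it.
        kwarg.flatten().unwrap_or(false)
    }

    fn main() {
        assert!(!resolve_strict(None));            // kwarg not passed
        assert!(!resolve_strict(Some(None)));      // passed as nil
        assert!(resolve_strict(Some(Some(true))));
    }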
data/lib/parquet/parquet.so
CHANGED
Binary file
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.12
+  version: 0.2.13
 platform: ruby
 authors:
 - Nathan Jaremko
@@ -76,6 +76,7 @@ files:
 - ext/parquet/src/writer/mod.rs
 - lib/parquet.rb
 - lib/parquet.rbi
+- lib/parquet/parquet.so
 - lib/parquet/version.rb
 homepage: https://github.com/njaremko/parquet-ruby
 licenses: