parquet 0.2.10 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/enumerator.rs +17 -17
- data/ext/parquet/src/header_cache.rs +21 -81
- data/ext/parquet/src/reader/mod.rs +6 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +93 -98
- data/ext/parquet/src/reader/parquet_row_reader.rs +55 -47
- data/ext/parquet/src/types/parquet_value.rs +157 -118
- data/ext/parquet/src/types/record_types.rs +91 -77
- data/ext/parquet/src/types/timestamp.rs +4 -5
- data/ext/parquet/src/types/type_conversion.rs +2 -2
- data/ext/parquet/src/utils.rs +19 -3
- data/lib/parquet/parquet.so +0 -0
- data/lib/parquet/version.rb +1 -1
- metadata +3 -2
@@ -1,5 +1,7 @@
 use itertools::Itertools;
 
+use crate::reader::ReaderError;
+
 use super::*;
 
 #[derive(Debug)]
@@ -15,15 +17,16 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
 }
 
 #[derive(Debug)]
-pub struct ParquetField(pub Field);
+pub struct ParquetField(pub Field, pub bool);
 
-impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             RowRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-
+                vec.into_iter()
+                    .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                Ok(handle.into_value(ary))
             }
             RowRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -36,41 +39,41 @@ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i])
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
-                        values[i + 1] =
+                        values[i + 1] = v.try_into_value_with(handle)?;
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i])
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             ColumnRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .
-
-
-
-
-
-                ary.into_value_with(handle)
+                vec.into_iter().try_for_each(|v| {
+                    let nested_ary = handle.ary_new_capa(v.len());
+                    v.into_iter()
+                        .try_for_each(|v| nested_ary.push(v.try_into_value_with(handle)?))?;
+                    ary.push(nested_ary.into_value_with(handle))?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ColumnRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -83,91 +86,98 @@ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i])
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
                         let ary = handle.ary_new_capa(v.len());
-                        v.into_iter()
+                        v.into_iter()
+                            .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
                         values[i + 1] = handle.into_value(ary);
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i])
+                        hash.bulk_insert(&values[..i])?;
                         values[..i].fill(handle.qnil().as_value());
                         i = 0;
                     }
                 }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
 
-impl IntoValue for ParquetField {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+pub trait TryIntoValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError>;
+}
+
+impl TryIntoValue for ParquetField {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self.0 {
-            Field::Null => handle.qnil().as_value(),
-            Field::Bool(b) => b.into_value_with(handle),
-            Field::Short(s) => s.into_value_with(handle),
-            Field::Int(i) => i.into_value_with(handle),
-            Field::Long(l) => l.into_value_with(handle),
-            Field::UByte(ub) => ub.into_value_with(handle),
-            Field::UShort(us) => us.into_value_with(handle),
-            Field::UInt(ui) => ui.into_value_with(handle),
-            Field::ULong(ul) => ul.into_value_with(handle),
-            Field::Float16(f) => f32::from(f).into_value_with(handle),
-            Field::Float(f) => f.into_value_with(handle),
-            Field::Double(d) => d.into_value_with(handle),
-            Field::Str(s) =>
-
-
+            Field::Null => Ok(handle.qnil().as_value()),
+            Field::Bool(b) => Ok(b.into_value_with(handle)),
+            Field::Short(s) => Ok(s.into_value_with(handle)),
+            Field::Int(i) => Ok(i.into_value_with(handle)),
+            Field::Long(l) => Ok(l.into_value_with(handle)),
+            Field::UByte(ub) => Ok(ub.into_value_with(handle)),
+            Field::UShort(us) => Ok(us.into_value_with(handle)),
+            Field::UInt(ui) => Ok(ui.into_value_with(handle)),
+            Field::ULong(ul) => Ok(ul.into_value_with(handle)),
+            Field::Float16(f) => Ok(f32::from(f).into_value_with(handle)),
+            Field::Float(f) => Ok(f.into_value_with(handle)),
+            Field::Double(d) => Ok(d.into_value_with(handle)),
+            Field::Str(s) => {
+                if self.1 {
+                    Ok(simdutf8::basic::from_utf8(s.as_bytes())
+                        .map_err(|e| ReaderError::Utf8Error(e))
+                        .and_then(|s| Ok(s.into_value_with(handle)))?)
+                } else {
+                    Ok(handle.str_from_slice(s.as_bytes()).as_value())
+                }
+            }
+            Field::Byte(b) => Ok(b.into_value_with(handle)),
+            Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
             Field::Date(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400)
+                let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
                 let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
+                Ok(formatted.into_value_with(handle))
             }
             Field::TimestampMillis(ts) => {
-                let ts = jiff::Timestamp::from_millisecond(ts)
+                let ts = jiff::Timestamp::from_millisecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::TimestampMicros(ts) => {
-                let ts = jiff::Timestamp::from_microsecond(ts)
+                let ts = jiff::Timestamp::from_microsecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::ListInternal(list) => {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
-                elements
-                    .
-
-
-                ary.into_value_with(handle)
+                elements.iter().try_for_each(|e| {
+                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             Field::MapInternal(map) => {
                 let entries = map.entries();
                 let hash = handle.hash_new_capa(entries.len());
-                entries
-                    .
-
-
-
-
-
-                })
-                    .unwrap();
-                hash.into_value_with(handle)
+                entries.iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
             Field::Decimal(d) => {
                 let value = match d {
@@ -185,20 +195,24 @@ impl IntoValue for ParquetField {
                         format!("{}e-{}", unscaled, scale)
                     }
                 };
-                handle.eval(&format!("BigDecimal(\"{value}\")"))
+                Ok(handle.eval(&format!("BigDecimal(\"{value}\")"))?)
            }
             Field::Group(row) => {
                 let hash = handle.hash_new();
-                row.get_column_iter()
-                    .
-
-
-
-
-
-                    .unwrap();
-                hash.into_value_with(handle)
+                row.get_column_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.clone().into_value_with(handle),
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
+
+// impl IntoValue for ParquetField {
+//     fn into_value_with(self, handle: &Ruby) -> Value {
+//         self.try_into_value_with(handle).unwrap()
+//     }
+// }
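Taken together, the hunks above swap the infallible IntoValue conversions, which had to unwrap() every fallible step, for a crate-local TryIntoValue trait that returns Result<Value, ReaderError>, and they give ParquetField a second bool field that is threaded through nested lists, maps, and groups to decide whether string bytes must be valid UTF-8. Below is a minimal, dependency-free sketch of that shape; Field, ConvError, ParquetField, and the String return type here are illustrative stand-ins, not the gem's real Ruby-binding types (the real code converts into Ruby Values through a handle and validates with simdutf8).

```rust
use std::str::Utf8Error;

// Stand-in error type; the gem's code uses ReaderError (which gained a
// Utf8Error variant in this release).
#[derive(Debug)]
enum ConvError {
    Utf8(Utf8Error),
}

// Stand-in for parquet's Field enum; only a few variants for illustration.
#[derive(Debug)]
enum Field {
    Null,
    Long(i64),
    Str(Vec<u8>),
}

// Analogue of the new TryIntoValue trait: conversion can fail instead of
// panicking the way the old IntoValue implementations did.
trait TryIntoValue {
    fn try_into_value(self) -> Result<String, ConvError>;
}

// Analogue of `ParquetField(pub Field, pub bool)`: the second member is the
// new strict flag.
struct ParquetField(Field, bool);

impl TryIntoValue for ParquetField {
    fn try_into_value(self) -> Result<String, ConvError> {
        match self.0 {
            Field::Null => Ok("nil".to_string()),
            Field::Long(l) => Ok(l.to_string()),
            Field::Str(bytes) => {
                if self.1 {
                    // strict: surface invalid UTF-8 as an error the caller can raise
                    std::str::from_utf8(&bytes)
                        .map(|s| s.to_string())
                        .map_err(ConvError::Utf8)
                } else {
                    // lenient stand-in: pass the bytes through without failing
                    Ok(String::from_utf8_lossy(&bytes).into_owned())
                }
            }
        }
    }
}

fn main() {
    let invalid = vec![0xff, 0xfe];
    assert!(ParquetField(Field::Str(invalid.clone()), true)
        .try_into_value()
        .is_err());
    assert!(ParquetField(Field::Str(invalid), false)
        .try_into_value()
        .is_ok());
    println!("{:?}", ParquetField(Field::Long(42), false).try_into_value());
}
```

In lenient mode the real implementation hands back the raw bytes via str_from_slice rather than the lossy re-encoding used here; the sketch only mirrors the control flow.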
@@ -64,10 +64,9 @@ macro_rules! impl_timestamp_conversion {
         ParquetValue::$unit(ts, tz) => {
             let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
             let time_class = $handle.class_time();
-            time_class
-                .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                .
-                .into_value_with($handle)
+            Ok(time_class
+                .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                .into_value_with($handle))
         }
         _ => panic!("Invalid timestamp type"),
     }
@@ -80,6 +79,6 @@ macro_rules! impl_date_conversion {
     ($value:expr, $handle:expr) => {{
         let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
         let formatted = ts.strftime("%Y-%m-%d").to_string();
-        formatted.into_value_with($handle)
+        Ok(formatted.into_value_with($handle))
     }};
 }
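The impl_timestamp_conversion and impl_date_conversion hunks above make the same move inside macros: the expansion now evaluates to a Result (Ok(...) with ? on the fallible calls) rather than unwrapping, so the call site decides how to surface a failure. A toy, self-contained illustration of a macro whose block expansion yields a Result; fallible_format and its String error type are made up for the example, not the gem's macros:

```rust
// Illustrative only: a macro whose `{{ ... }}` expansion evaluates to a
// Result, mirroring how impl_date_conversion now yields
// Ok(formatted.into_value_with($handle)) instead of unwrapping internally.
macro_rules! fallible_format {
    ($days:expr) => {{
        // pretend this step could fail; here it always succeeds
        Ok::<String, String>(format!("epoch day {}", $days))
    }};
}

fn describe(days: i64) -> Result<String, String> {
    // the caller propagates the error with `?` rather than the macro panicking
    let text = fallible_format!(days)?;
    Ok(text)
}

fn main() {
    println!("{:?}", describe(20_000));
}
```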
@@ -419,7 +419,7 @@ macro_rules! impl_timestamp_to_arrow_conversion {
 macro_rules! impl_timestamp_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -438,7 +438,7 @@ macro_rules! impl_timestamp_array_conversion {
                 .iter()
                 .map(|x| ParquetValue::$variant(*x, $tz.clone().map(|s| s.into())))
                 .collect()
-        }
+        }))
     }};
 }
 
data/ext/parquet/src/utils.rs
CHANGED
@@ -32,6 +32,7 @@ pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -39,10 +40,19 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<bool>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns"],
+        &["result_type", "columns", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -73,10 +83,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         None => ParserResultType::Hash,
     };
 
+    let strict = kwargs.optional.2.flatten().unwrap_or(false);
+
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
         columns: kwargs.optional.1.flatten(),
+        strict,
     })
 }
 
@@ -86,6 +99,7 @@ pub struct ParquetColumnsArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -103,12 +117,13 @@ pub fn parse_parquet_columns_args(
             Option<Option<Value>>,
            Option<Option<Vec<String>>>,
             Option<Option<usize>>,
+            Option<Option<bool>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns", "batch_size"],
+        &["result_type", "columns", "batch_size", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -144,5 +159,6 @@ pub fn parse_parquet_columns_args(
         result_type,
         columns: kwargs.optional.1.flatten(),
         batch_size: kwargs.optional.2.flatten(),
+        strict: kwargs.optional.3.flatten().unwrap_or(false),
     })
 }
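The utils.rs hunks add an optional strict keyword to both the row and column argument parsers, alongside result_type, columns, and batch_size, defaulting to false when absent. The keyword is declared as Option<Option<bool>>, which, in the get_kwargs convention used here, lets "not passed" and "passed as nil" be told apart; the diff collapses both to false via flatten().unwrap_or(false). A stand-alone sketch of that collapse, plain Rust with no Ruby involved:

```rust
// Stand-alone illustration of how the new `strict` kwarg collapses to a bool.
// The outer Option models "keyword not passed at all", the inner Option models
// "keyword passed explicitly as nil" (that is how the Option<Option<bool>>
// slot appears to be read in the diff); both fall back to false.
fn resolve_strict(raw: Option<Option<bool>>) -> bool {
    raw.flatten().unwrap_or(false)
}

fn main() {
    assert!(!resolve_strict(None)); // strict not given
    assert!(!resolve_strict(Some(None))); // strict: nil
    assert!(resolve_strict(Some(Some(true)))); // strict: true
    println!("defaults verified");
}
```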
data/lib/parquet/parquet.so
CHANGED
Binary file

data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.2.13
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -76,6 +76,7 @@ files:
 - ext/parquet/src/writer/mod.rs
 - lib/parquet.rb
 - lib/parquet.rbi
+- lib/parquet/parquet.so
 - lib/parquet/version.rb
 homepage: https://github.com/njaremko/parquet-ruby
 licenses: