parquet 0.2.10 → 0.2.13

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
@@ -1,5 +1,7 @@
 use itertools::Itertools;
 
+use crate::reader::ReaderError;
+
 use super::*;
 
 #[derive(Debug)]
@@ -15,15 +17,16 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
 }
 
 #[derive(Debug)]
-pub struct ParquetField(pub Field);
+pub struct ParquetField(pub Field, pub bool);
 
-impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             RowRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
-                handle.into_value(ary)
+                vec.into_iter()
+                    .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
+                Ok(handle.into_value(ary))
             }
             RowRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -36,41 +39,41 @@ impl<S: BuildHasher + Default> IntoValue for RowRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
-                        values[i + 1] = handle.into_value(v);
+                        values[i + 1] = v.try_into_value_with(handle)?;
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
+                        hash.bulk_insert(&values[..i])?;
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
            }
        }
    }
}
 
-impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+impl<S: BuildHasher + Default> TryIntoValue for ColumnRecord<S> {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self {
             ColumnRecord::Vec(vec) => {
                 let ary = handle.ary_new_capa(vec.len());
-                vec.into_iter()
-                    .try_for_each(|v| {
-                        let nested_ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| nested_ary.push(v)).unwrap();
-                        ary.push(nested_ary.into_value_with(handle))
-                    })
-                    .unwrap();
-                ary.into_value_with(handle)
+                vec.into_iter().try_for_each(|v| {
+                    let nested_ary = handle.ary_new_capa(v.len());
+                    v.into_iter()
+                        .try_for_each(|v| nested_ary.push(v.try_into_value_with(handle)?))?;
+                    ary.push(nested_ary.into_value_with(handle))?;
+                    Ok::<_, ReaderError>(())
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             ColumnRecord::Map(map) => {
                 let hash = handle.hash_new_capa(map.len());
@@ -83,91 +86,98 @@ impl<S: BuildHasher + Default> IntoValue for ColumnRecord<S> {
                     for (k, v) in chunk {
                         if i + 1 >= values.len() {
                             // Bulk insert current batch if array is full
-                            hash.bulk_insert(&values[..i]).unwrap();
+                            hash.bulk_insert(&values[..i])?;
                             values[..i].fill(handle.qnil().as_value());
                             i = 0;
                         }
                         values[i] = handle.into_value(k);
                         let ary = handle.ary_new_capa(v.len());
-                        v.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                        v.into_iter()
+                            .try_for_each(|v| ary.push(v.try_into_value_with(handle)?))?;
                         values[i + 1] = handle.into_value(ary);
                         i += 2;
                     }
                     // Insert any remaining pairs
                     if i > 0 {
-                        hash.bulk_insert(&values[..i]).unwrap();
+                        hash.bulk_insert(&values[..i])?;
                        values[..i].fill(handle.qnil().as_value());
                        i = 0;
                    }
                }
 
-                hash.into_value_with(handle)
+                Ok(hash.into_value_with(handle))
            }
        }
    }
}
 
-impl IntoValue for ParquetField {
-    fn into_value_with(self, handle: &Ruby) -> Value {
+pub trait TryIntoValue {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError>;
+}
+
+impl TryIntoValue for ParquetField {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ReaderError> {
         match self.0 {
-            Field::Null => handle.qnil().as_value(),
-            Field::Bool(b) => b.into_value_with(handle),
-            Field::Short(s) => s.into_value_with(handle),
-            Field::Int(i) => i.into_value_with(handle),
-            Field::Long(l) => l.into_value_with(handle),
-            Field::UByte(ub) => ub.into_value_with(handle),
-            Field::UShort(us) => us.into_value_with(handle),
-            Field::UInt(ui) => ui.into_value_with(handle),
-            Field::ULong(ul) => ul.into_value_with(handle),
-            Field::Float16(f) => f32::from(f).into_value_with(handle),
-            Field::Float(f) => f.into_value_with(handle),
-            Field::Double(d) => d.into_value_with(handle),
-            Field::Str(s) => s.into_value_with(handle),
-            Field::Byte(b) => b.into_value_with(handle),
-            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+            Field::Null => Ok(handle.qnil().as_value()),
+            Field::Bool(b) => Ok(b.into_value_with(handle)),
+            Field::Short(s) => Ok(s.into_value_with(handle)),
+            Field::Int(i) => Ok(i.into_value_with(handle)),
+            Field::Long(l) => Ok(l.into_value_with(handle)),
+            Field::UByte(ub) => Ok(ub.into_value_with(handle)),
+            Field::UShort(us) => Ok(us.into_value_with(handle)),
+            Field::UInt(ui) => Ok(ui.into_value_with(handle)),
+            Field::ULong(ul) => Ok(ul.into_value_with(handle)),
+            Field::Float16(f) => Ok(f32::from(f).into_value_with(handle)),
+            Field::Float(f) => Ok(f.into_value_with(handle)),
+            Field::Double(d) => Ok(d.into_value_with(handle)),
+            Field::Str(s) => {
+                if self.1 {
+                    Ok(simdutf8::basic::from_utf8(s.as_bytes())
+                        .map_err(|e| ReaderError::Utf8Error(e))
+                        .and_then(|s| Ok(s.into_value_with(handle)))?)
+                } else {
+                    Ok(handle.str_from_slice(s.as_bytes()).as_value())
+                }
+            }
+            Field::Byte(b) => Ok(b.into_value_with(handle)),
+            Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
             Field::Date(d) => {
-                let ts = jiff::Timestamp::from_second((d as i64) * 86400).unwrap();
+                let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
                 let formatted = ts.strftime("%Y-%m-%d").to_string();
-                formatted.into_value_with(handle)
+                Ok(formatted.into_value_with(handle))
             }
             Field::TimestampMillis(ts) => {
-                let ts = jiff::Timestamp::from_millisecond(ts).unwrap();
+                let ts = jiff::Timestamp::from_millisecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::TimestampMicros(ts) => {
-                let ts = jiff::Timestamp::from_microsecond(ts).unwrap();
+                let ts = jiff::Timestamp::from_microsecond(ts)?;
                 let time_class = handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with(handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with(handle))
             }
             Field::ListInternal(list) => {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
-                elements
-                    .iter()
-                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
-                    .unwrap();
-                ary.into_value_with(handle)
+                elements.iter().try_for_each(|e| {
+                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)
+                })?;
+                Ok(ary.into_value_with(handle))
             }
             Field::MapInternal(map) => {
                 let entries = map.entries();
                 let hash = handle.hash_new_capa(entries.len());
-                entries
-                    .iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            ParquetField(k.clone()).into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
+                entries.iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
             Field::Decimal(d) => {
                 let value = match d {
@@ -185,20 +195,24 @@ impl IntoValue for ParquetField {
                         format!("{}e-{}", unscaled, scale)
                     }
                 };
-                handle.eval(&format!("BigDecimal(\"{value}\")")).unwrap()
+                Ok(handle.eval(&format!("BigDecimal(\"{value}\")"))?)
             }
             Field::Group(row) => {
                 let hash = handle.hash_new();
-                row.get_column_iter()
-                    .try_for_each(|(k, v)| {
-                        hash.aset(
-                            k.clone().into_value_with(handle),
-                            ParquetField(v.clone()).into_value_with(handle),
-                        )
-                    })
-                    .unwrap();
-                hash.into_value_with(handle)
+                row.get_column_iter().try_for_each(|(k, v)| {
+                    hash.aset(
+                        k.clone().into_value_with(handle),
+                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                    )
+                })?;
+                Ok(hash.into_value_with(handle))
             }
         }
     }
 }
+
+// impl IntoValue for ParquetField {
+//     fn into_value_with(self, handle: &Ruby) -> Value {
+//         self.try_into_value_with(handle).unwrap()
+//     }
+// }
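
The thrust of this file's changes: conversion from parquet Fields to Ruby Values moves from magnus's infallible IntoValue to a new fallible TryIntoValue trait, so failures propagate as ReaderError via `?` instead of panicking through `.unwrap()` inside the Ruby VM, and the new second field on ParquetField threads a strict flag down to string handling. Below is a minimal standalone sketch of that string path, assuming only the simdutf8 crate; convert_str is a hypothetical helper, and its non-strict branch copies lossily where the real extension hands raw bytes to a Ruby string:

    use simdutf8::basic::{from_utf8, Utf8Error};

    // Hypothetical helper mirroring the Field::Str branch in the diff.
    fn convert_str(bytes: &[u8], strict: bool) -> Result<String, Utf8Error> {
        if strict {
            // Strict mode: validate UTF-8 eagerly and surface the error.
            from_utf8(bytes).map(str::to_owned)
        } else {
            // Non-strict sketch: accept any bytes (the extension instead
            // builds a Ruby string straight from the slice, unvalidated).
            Ok(String::from_utf8_lossy(bytes).into_owned())
        }
    }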
@@ -64,10 +64,9 @@ macro_rules! impl_timestamp_conversion {
             ParquetValue::$unit(ts, tz) => {
                 let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz));
                 let time_class = $handle.class_time();
-                time_class
-                    .funcall::<_, _, Value>("parse", (ts.to_string(),))
-                    .unwrap()
-                    .into_value_with($handle)
+                Ok(time_class
+                    .funcall::<_, _, Value>("parse", (ts.to_string(),))?
+                    .into_value_with($handle))
             }
             _ => panic!("Invalid timestamp type"),
         }
@@ -80,6 +79,6 @@ macro_rules! impl_date_conversion {
     ($value:expr, $handle:expr) => {{
         let ts = jiff::Timestamp::from_second(($value as i64) * 86400).unwrap();
         let formatted = ts.strftime("%Y-%m-%d").to_string();
-        formatted.into_value_with($handle)
+        Ok(formatted.into_value_with($handle))
     }};
 }
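
The date arm above treats a parquet DATE as days since the Unix epoch and formats it with jiff. A self-contained sketch of the same arithmetic; date_to_string is illustrative, not part of the crate:

    // Parquet stores DATE as days since 1970-01-01; jiff formats it.
    fn date_to_string(days: i32) -> Result<String, jiff::Error> {
        let ts = jiff::Timestamp::from_second((days as i64) * 86400)?;
        Ok(ts.strftime("%Y-%m-%d").to_string())
    }

    fn main() {
        // 19723 days after the epoch is 2024-01-01.
        assert_eq!(date_to_string(19723).unwrap(), "2024-01-01");
    }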
@@ -419,7 +419,7 @@ macro_rules! impl_timestamp_to_arrow_conversion {
 macro_rules! impl_timestamp_array_conversion {
     ($column:expr, $array_type:ty, $variant:ident, $tz:expr) => {{
         let array = downcast_array::<$array_type>($column);
-        if array.is_nullable() {
+        Ok(ParquetValueVec(if array.is_nullable() {
             array
                 .values()
                 .iter()
@@ -438,7 +438,7 @@ macro_rules! impl_timestamp_array_conversion {
                 .iter()
                 .map(|x| ParquetValue::$variant(*x, $tz.clone().map(|s| s.into())))
                 .collect()
-        }
+        }))
     }};
 }
 
@@ -32,6 +32,7 @@ pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -39,10 +40,19 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
+    let kwargs = get_kwargs::<
+        _,
+        (),
+        (
+            Option<Option<Value>>,
+            Option<Option<Vec<String>>>,
+            Option<Option<bool>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns"],
+        &["result_type", "columns", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -73,10 +83,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         None => ParserResultType::Hash,
     };
 
+    let strict = kwargs.optional.2.flatten().unwrap_or(false);
+
     Ok(ParquetRowsArgs {
         to_read,
         result_type,
         columns: kwargs.optional.1.flatten(),
+        strict,
     })
 }
 
@@ -86,6 +99,7 @@ pub struct ParquetColumnsArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 /// Parse common arguments for CSV parsing
@@ -103,12 +117,13 @@ pub fn parse_parquet_columns_args(
             Option<Option<Value>>,
             Option<Option<Vec<String>>>,
             Option<Option<usize>>,
+            Option<Option<bool>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &[],
-        &["result_type", "columns", "batch_size"],
+        &["result_type", "columns", "batch_size", "strict"],
     )?;
 
     let result_type: ParserResultType = match kwargs
@@ -144,5 +159,6 @@ pub fn parse_parquet_columns_args(
         result_type,
         columns: kwargs.optional.1.flatten(),
         batch_size: kwargs.optional.2.flatten(),
+        strict: kwargs.optional.3.flatten().unwrap_or(false),
     })
 }
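
In the kwargs signatures above, Option<Option<bool>> is the double-Option pattern this codebase uses with magnus's get_kwargs: the outer Option records whether the keyword was supplied at all, the inner one whether it was supplied as nil. The `.flatten().unwrap_or(false)` default therefore treats both cases as non-strict. A standalone sketch with a hypothetical resolve_strict helper:

    // Outer None: keyword omitted. Inner None: passed as nil.
    fn resolve_strict(kwarg: Option<Option<bool>>) -> bool {
        kwarg.flatten().unwrap_or(false)
    }

    fn main() {
        assert!(!resolve_strict(None));            // strict not passed
        assert!(!resolve_strict(Some(None)));      // strict: nil
        assert!(resolve_strict(Some(Some(true)))); // strict: true
    }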
Binary file (contents not shown)
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.10"
+  VERSION = "0.2.13"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.2.13
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-30 00:00:00.000000000 Z
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -76,6 +76,7 @@ files:
 - ext/parquet/src/writer/mod.rs
 - lib/parquet.rb
 - lib/parquet.rbi
+- lib/parquet/parquet.so
 - lib/parquet/version.rb
 homepage: https://github.com/njaremko/parquet-ruby
 licenses: