parquet 0.4.1 → 0.5.0

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
@@ -1,32 +1,36 @@
1
+ use super::core_types::SchemaNode;
2
+ use crate::{
3
+ reader::ReaderError,
4
+ types::{ListField, MapField, ParquetSchemaType},
5
+ };
6
+ use arrow_array::{Array, RecordBatch};
7
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
8
+ use parquet::{arrow::ArrowWriter, errors::ParquetError};
1
9
  use std::{
2
10
  io::{self, Write},
3
11
  str::FromStr,
4
12
  sync::Arc,
5
13
  };
6
-
7
- use arrow_array::{Array, RecordBatch};
8
- use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
9
- use parquet::{arrow::ArrowWriter, errors::ParquetError};
10
14
  use tempfile::NamedTempFile;
11
15
 
12
- use crate::types::{ListField, MapField, ParquetSchemaType};
13
-
14
- #[derive(Debug)]
16
+ #[derive(Debug, Clone)]
15
17
  pub struct SchemaField<'a> {
16
18
  pub name: String,
17
19
  pub type_: ParquetSchemaType<'a>,
18
20
  pub format: Option<String>,
21
+ pub nullable: bool,
19
22
  }
20
23
 
21
24
  #[derive(Debug)]
22
- pub struct ParquetWriteArgs<'a> {
25
+ pub struct ParquetWriteArgs {
23
26
  pub read_from: Value,
24
27
  pub write_to: Value,
25
- pub schema: Vec<SchemaField<'a>>,
28
+ pub schema: SchemaNode,
26
29
  pub batch_size: Option<usize>,
27
30
  pub flush_threshold: Option<usize>,
28
31
  pub compression: Option<String>,
29
32
  pub sample_size: Option<usize>,
33
+ pub logger: Option<Value>,
30
34
  }
31
35
 
32
36
  pub trait SendableWrite: Send + Write {}
@@ -59,6 +63,42 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
59
63
  type Err = MagnusError;
60
64
 
61
65
  fn from_str(s: &str) -> Result<Self, Self::Err> {
66
+ // Check if it's a list type
67
+ if let Some(inner_type_str) = s.strip_prefix("list<").and_then(|s| s.strip_suffix(">")) {
68
+ let inner_type = inner_type_str.parse::<ParquetSchemaType>()?;
69
+ return Ok(ParquetSchemaType::List(Box::new(ListField {
70
+ item_type: inner_type,
71
+ format: None,
72
+ nullable: true,
73
+ })));
74
+ }
75
+
76
+ // Check if it's a map type
77
+ if let Some(kv_types_str) = s.strip_prefix("map<").and_then(|s| s.strip_suffix(">")) {
78
+ let parts: Vec<&str> = kv_types_str.splitn(2, ',').collect();
79
+ if parts.len() != 2 {
80
+ return Err(MagnusError::new(
81
+ magnus::exception::runtime_error(),
82
+ format!(
83
+ "Invalid map format. Expected 'map<keyType,valueType>', got '{}'",
84
+ s
85
+ ),
86
+ ));
87
+ }
88
+
89
+ let key_type = parts[0].trim().parse::<ParquetSchemaType>()?;
90
+ let value_type = parts[1].trim().parse::<ParquetSchemaType>()?;
91
+
92
+ return Ok(ParquetSchemaType::Map(Box::new(MapField {
93
+ key_type,
94
+ value_type,
95
+ key_format: None,
96
+ value_format: None,
97
+ value_nullable: true,
98
+ })));
99
+ }
100
+
101
+ // Handle primitive types
62
102
  match s {
63
103
  "int8" => Ok(ParquetSchemaType::Int8),
64
104
  "int16" => Ok(ParquetSchemaType::Int16),
@@ -77,13 +117,16 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
77
117
  "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
78
118
  "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
79
119
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
80
- item_type: ParquetSchemaType::Int8,
120
+ item_type: ParquetSchemaType::String,
81
121
  format: None,
122
+ nullable: true,
82
123
  }))),
83
124
  "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
84
125
  key_type: ParquetSchemaType::String,
85
- value_type: ParquetSchemaType::Int8,
86
- format: None,
126
+ value_type: ParquetSchemaType::String,
127
+ key_format: None,
128
+ value_format: None,
129
+ value_nullable: true,
87
130
  }))),
88
131
  _ => Err(MagnusError::new(
89
132
  magnus::exception::runtime_error(),
@@ -98,7 +141,11 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
98
141
  let ruby = unsafe { Ruby::get_unchecked() };
99
142
  let schema_type = parse_string_or_symbol(&ruby, value)?;
100
143
 
101
- schema_type.unwrap().parse()
144
+ schema_type
145
+ .ok_or_else(|| {
146
+ MagnusError::new(magnus::exception::type_error(), "Invalid schema type")
147
+ })?
148
+ .parse()
102
149
  }
103
150
  }
104
151
 
@@ -157,126 +204,50 @@ impl WriterOutput {
157
204
  }
158
205
  }
159
206
 
160
- pub struct ParquetErrorWrapper(pub ParquetError);
161
-
162
- impl From<ParquetErrorWrapper> for MagnusError {
163
- fn from(err: ParquetErrorWrapper) -> Self {
164
- MagnusError::new(
165
- magnus::exception::runtime_error(),
166
- format!("Parquet error: {}", err.0),
167
- )
168
- }
169
- }
170
-
171
207
  pub struct ColumnCollector<'a> {
172
208
  pub name: String,
173
209
  pub type_: ParquetSchemaType<'a>,
174
210
  pub format: Option<String>,
211
+ pub nullable: bool,
175
212
  pub values: Vec<crate::types::ParquetValue>,
176
213
  }
177
214
 
178
215
  impl<'a> ColumnCollector<'a> {
179
- pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
216
+ pub fn new(
217
+ name: String,
218
+ type_: ParquetSchemaType<'a>,
219
+ format: Option<String>,
220
+ nullable: bool,
221
+ ) -> Self {
180
222
  Self {
181
223
  name,
182
224
  type_,
183
225
  format,
226
+ nullable,
184
227
  values: Vec::new(),
185
228
  }
186
229
  }
187
230
 
188
231
  pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
189
232
  use crate::types::ParquetValue;
190
- use crate::{
191
- convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
192
- convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
193
- NumericConverter,
194
- };
195
233
 
196
234
  if value.is_nil() {
197
- self.values.push(ParquetValue::Null);
198
- return Ok(());
235
+ if !self.nullable {
236
+ // For non-nullable fields, raise an error
237
+ return Err(MagnusError::new(
238
+ magnus::exception::runtime_error(),
239
+ "Cannot write nil value for non-nullable field",
240
+ ));
241
+ }
199
242
  }
200
243
 
201
- let parquet_value = match &self.type_ {
202
- ParquetSchemaType::Int8 => {
203
- let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
204
- ParquetValue::Int8(v)
205
- }
206
- ParquetSchemaType::Int16 => {
207
- let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
208
- ParquetValue::Int16(v)
209
- }
210
- ParquetSchemaType::Int32 => {
211
- let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
212
- ParquetValue::Int32(v)
213
- }
214
- ParquetSchemaType::Int64 => {
215
- let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
216
- ParquetValue::Int64(v)
217
- }
218
- ParquetSchemaType::UInt8 => {
219
- let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
220
- ParquetValue::UInt8(v)
221
- }
222
- ParquetSchemaType::UInt16 => {
223
- let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
224
- ParquetValue::UInt16(v)
225
- }
226
- ParquetSchemaType::UInt32 => {
227
- let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
228
- ParquetValue::UInt32(v)
229
- }
230
- ParquetSchemaType::UInt64 => {
231
- let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
232
- ParquetValue::UInt64(v)
233
- }
234
- ParquetSchemaType::Float => {
235
- let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
236
- ParquetValue::Float32(v)
237
- }
238
- ParquetSchemaType::Double => {
239
- let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
240
- ParquetValue::Float64(v)
241
- }
242
- ParquetSchemaType::String => {
243
- let v = String::try_convert(value)?;
244
- ParquetValue::String(v)
245
- }
246
- ParquetSchemaType::Binary => {
247
- let v = convert_to_binary(value)?;
248
- ParquetValue::Bytes(v)
249
- }
250
- ParquetSchemaType::Boolean => {
251
- let v = convert_to_boolean(value)?;
252
- ParquetValue::Boolean(v)
253
- }
254
- ParquetSchemaType::Date32 => {
255
- let v = convert_to_date32(value, self.format.as_deref())?;
256
- ParquetValue::Date32(v)
257
- }
258
- ParquetSchemaType::TimestampMillis => {
259
- let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
260
- ParquetValue::TimestampMillis(v, None)
261
- }
262
- ParquetSchemaType::TimestampMicros => {
263
- let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
264
- ParquetValue::TimestampMicros(v, None)
265
- }
266
- ParquetSchemaType::List(list_field) => {
267
- let values = convert_to_list(value, list_field)?;
268
- ParquetValue::List(values)
269
- }
270
- ParquetSchemaType::Map(map_field) => {
271
- let map = convert_to_map(value, map_field)?;
272
- ParquetValue::Map(map)
273
- }
274
- };
244
+ // For all other types, proceed as normal
245
+ let parquet_value = ParquetValue::from_value(value, &self.type_, self.format.as_deref())?;
275
246
  self.values.push(parquet_value);
276
247
  Ok(())
277
248
  }
278
249
 
279
- pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
250
+ pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ReaderError> {
280
251
  let values = std::mem::take(&mut self.values);
281
252
  crate::convert_parquet_values_to_arrow(values, &self.type_)
282
253
  }
@@ -1,12 +1,12 @@
1
+ use crate::ParserResultType;
1
2
  use magnus::{
2
3
  scan_args::{get_kwargs, scan_args},
3
4
  value::ReprValue,
4
5
  Error, RString, Ruby, Symbol, Value,
5
6
  };
6
7
 
7
- use crate::ParserResultType;
8
-
9
- fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
+ /// Convert a Ruby Value to a String, handling both String and Symbol types
9
+ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
10
10
  if value.is_nil() {
11
11
  Ok(None)
12
12
  } else if value.is_kind_of(ruby.class_string()) {
@@ -33,9 +33,10 @@ pub struct ParquetRowsArgs {
33
33
  pub result_type: ParserResultType,
34
34
  pub columns: Option<Vec<String>>,
35
35
  pub strict: bool,
36
+ pub logger: Option<Value>,
36
37
  }
37
38
 
38
- /// Parse common arguments for CSV parsing
39
+ /// Parse common arguments for parquet row iteration
39
40
  pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
40
41
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
41
42
  let (to_read,) = parsed_args.required;
@@ -47,12 +48,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
47
48
  Option<Option<Value>>,
48
49
  Option<Option<Vec<String>>>,
49
50
  Option<Option<bool>>,
51
+ Option<Option<Value>>,
50
52
  ),
51
53
  (),
52
54
  >(
53
55
  parsed_args.keywords,
54
56
  &[],
55
- &["result_type", "columns", "strict"],
57
+ &["result_type", "columns", "strict", "logger"],
56
58
  )?;
57
59
 
58
60
  let result_type: ParserResultType = match kwargs
@@ -84,12 +86,14 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
84
86
  };
85
87
 
86
88
  let strict = kwargs.optional.2.flatten().unwrap_or(true);
89
+ let logger = kwargs.optional.3.flatten();
87
90
 
88
91
  Ok(ParquetRowsArgs {
89
92
  to_read,
90
93
  result_type,
91
94
  columns: kwargs.optional.1.flatten(),
92
95
  strict,
96
+ logger,
93
97
  })
94
98
  }
95
99
 
@@ -100,9 +104,10 @@ pub struct ParquetColumnsArgs {
100
104
  pub columns: Option<Vec<String>>,
101
105
  pub batch_size: Option<usize>,
102
106
  pub strict: bool,
107
+ pub logger: Option<Value>,
103
108
  }
104
109
 
105
- /// Parse common arguments for CSV parsing
110
+ /// Parse common arguments for parquet column iteration
106
111
  pub fn parse_parquet_columns_args(
107
112
  ruby: &Ruby,
108
113
  args: &[Value],
@@ -118,12 +123,13 @@ pub fn parse_parquet_columns_args(
118
123
  Option<Option<Vec<String>>>,
119
124
  Option<Option<usize>>,
120
125
  Option<Option<bool>>,
126
+ Option<Option<Value>>,
121
127
  ),
122
128
  (),
123
129
  >(
124
130
  parsed_args.keywords,
125
131
  &[],
126
- &["result_type", "columns", "batch_size", "strict"],
132
+ &["result_type", "columns", "batch_size", "strict", "logger"],
127
133
  )?;
128
134
 
129
135
  let result_type: ParserResultType = match kwargs
@@ -154,11 +160,25 @@ pub fn parse_parquet_columns_args(
154
160
  None => ParserResultType::Hash,
155
161
  };
156
162
 
163
+ let batch_size = kwargs.optional.2.flatten();
164
+ if let Some(sz) = batch_size {
165
+ if sz <= 0 {
166
+ return Err(Error::new(
167
+ ruby.exception_arg_error(),
168
+ format!("batch_size must be > 0, got {}", sz),
169
+ ));
170
+ }
171
+ }
172
+
173
+ let strict = kwargs.optional.3.flatten().unwrap_or(true);
174
+ let logger = kwargs.optional.4.flatten();
175
+
157
176
  Ok(ParquetColumnsArgs {
158
177
  to_read,
159
178
  result_type,
160
179
  columns: kwargs.optional.1.flatten(),
161
- batch_size: kwargs.optional.2.flatten(),
162
- strict: kwargs.optional.3.flatten().unwrap_or(true),
180
+ batch_size,
181
+ strict,
182
+ logger,
163
183
  })
164
184
  }