parquet 0.4.2 → 0.5.1

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,32 +1,36 @@
1
+ use super::{core_types::SchemaNode, ParquetGemError, PrimitiveType};
2
+ use crate::{
3
+ types::{ListField, MapField, ParquetSchemaType},
4
+ utils::parse_string_or_symbol,
5
+ };
6
+ use arrow_array::{Array, RecordBatch};
7
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, TryConvert, Value};
8
+ use parquet::{arrow::ArrowWriter, errors::ParquetError};
1
9
  use std::{
2
10
  io::{self, Write},
3
11
  str::FromStr,
4
12
  sync::Arc,
5
13
  };
6
-
7
- use arrow_array::{Array, RecordBatch};
8
- use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
9
- use parquet::{arrow::ArrowWriter, errors::ParquetError};
10
14
  use tempfile::NamedTempFile;
11
15
 
12
- use crate::types::{convert_to_string, ListField, MapField, ParquetSchemaType};
13
-
14
- #[derive(Debug)]
16
+ #[derive(Debug, Clone)]
15
17
  pub struct SchemaField<'a> {
16
18
  pub name: String,
17
19
  pub type_: ParquetSchemaType<'a>,
18
20
  pub format: Option<String>,
21
+ pub nullable: bool,
19
22
  }
20
23
 
21
24
  #[derive(Debug)]
22
- pub struct ParquetWriteArgs<'a> {
25
+ pub struct ParquetWriteArgs {
23
26
  pub read_from: Value,
24
27
  pub write_to: Value,
25
- pub schema: Vec<SchemaField<'a>>,
28
+ pub schema: SchemaNode,
26
29
  pub batch_size: Option<usize>,
27
30
  pub flush_threshold: Option<usize>,
28
31
  pub compression: Option<String>,
29
32
  pub sample_size: Option<usize>,
33
+ pub logger: Option<Value>,
30
34
  }
31
35
 
32
36
  pub trait SendableWrite: Send + Write {}
@@ -59,31 +63,63 @@ impl<'a> FromStr for ParquetSchemaType<'a> {
59
63
  type Err = MagnusError;
60
64
 
61
65
  fn from_str(s: &str) -> Result<Self, Self::Err> {
66
+ // Check if it's a list type
67
+ if let Some(inner_type_str) = s.strip_prefix("list<").and_then(|s| s.strip_suffix(">")) {
68
+ let inner_type = inner_type_str.parse::<ParquetSchemaType>()?;
69
+ return Ok(ParquetSchemaType::List(Box::new(ListField {
70
+ item_type: inner_type,
71
+ format: None,
72
+ nullable: true,
73
+ })));
74
+ }
75
+
76
+ // Check if it's a map type
77
+ if let Some(kv_types_str) = s.strip_prefix("map<").and_then(|s| s.strip_suffix(">")) {
78
+ let parts: Vec<&str> = kv_types_str.splitn(2, ',').collect();
79
+ if parts.len() != 2 {
80
+ return Err(MagnusError::new(
81
+ magnus::exception::runtime_error(),
82
+ format!(
83
+ "Invalid map format. Expected 'map<keyType,valueType>', got '{}'",
84
+ s
85
+ ),
86
+ ));
87
+ }
88
+
89
+ let key_type = parts[0].trim().parse::<ParquetSchemaType>()?;
90
+ let value_type = parts[1].trim().parse::<ParquetSchemaType>()?;
91
+
92
+ return Ok(ParquetSchemaType::Map(Box::new(MapField {
93
+ key_type,
94
+ value_type,
95
+ key_format: None,
96
+ value_format: None,
97
+ value_nullable: true,
98
+ })));
99
+ }
100
+
101
+ // Handle primitive types
62
102
  match s {
63
- "int8" => Ok(ParquetSchemaType::Int8),
64
- "int16" => Ok(ParquetSchemaType::Int16),
65
- "int32" => Ok(ParquetSchemaType::Int32),
66
- "int64" => Ok(ParquetSchemaType::Int64),
67
- "uint8" => Ok(ParquetSchemaType::UInt8),
68
- "uint16" => Ok(ParquetSchemaType::UInt16),
69
- "uint32" => Ok(ParquetSchemaType::UInt32),
70
- "uint64" => Ok(ParquetSchemaType::UInt64),
71
- "float" | "float32" => Ok(ParquetSchemaType::Float),
72
- "double" | "float64" => Ok(ParquetSchemaType::Double),
73
- "string" | "utf8" => Ok(ParquetSchemaType::String),
74
- "binary" => Ok(ParquetSchemaType::Binary),
75
- "boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
76
- "date32" => Ok(ParquetSchemaType::Date32),
77
- "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
78
- "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
103
+ "int8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int8)),
104
+ "int16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int16)),
105
+ "int32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int32)),
106
+ "int64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Int64)),
107
+ "uint8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt8)),
108
+ "uint16" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt16)),
109
+ "uint32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt32)),
110
+ "uint64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::UInt64)),
111
+ "float" | "float32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float32)),
112
+ "double" | "float64" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Float64)),
113
+ "string" | "utf8" => Ok(ParquetSchemaType::Primitive(PrimitiveType::String)),
114
+ "binary" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Binary)),
115
+ "boolean" | "bool" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Boolean)),
116
+ "date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
117
+ "timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
118
+ "timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
79
119
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
80
- item_type: ParquetSchemaType::Int8,
81
- format: None,
82
- }))),
83
- "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
84
- key_type: ParquetSchemaType::String,
85
- value_type: ParquetSchemaType::Int8,
120
+ item_type: ParquetSchemaType::Primitive(PrimitiveType::String),
86
121
  format: None,
122
+ nullable: true,
87
123
  }))),
88
124
  _ => Err(MagnusError::new(
89
125
  magnus::exception::runtime_error(),
@@ -98,7 +134,11 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
98
134
  let ruby = unsafe { Ruby::get_unchecked() };
99
135
  let schema_type = parse_string_or_symbol(&ruby, value)?;
100
136
 
101
- schema_type.unwrap().parse()
137
+ schema_type
138
+ .ok_or_else(|| {
139
+ MagnusError::new(magnus::exception::type_error(), "Invalid schema type")
140
+ })?
141
+ .parse()
102
142
  }
103
143
  }
104
144
 
@@ -106,31 +146,6 @@ impl<'a> TryConvert for ParquetSchemaType<'a> {
106
146
  // with simple primitive types and strings
107
147
  unsafe impl<'a> Send for ParquetSchemaType<'a> {}
108
148
 
109
- fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
110
- if value.is_nil() {
111
- Ok(None)
112
- } else if value.is_kind_of(ruby.class_string()) {
113
- RString::from_value(value)
114
- .ok_or_else(|| {
115
- MagnusError::new(magnus::exception::type_error(), "Invalid string value")
116
- })?
117
- .to_string()
118
- .map(|s| Some(s))
119
- } else if value.is_kind_of(ruby.class_symbol()) {
120
- Symbol::from_value(value)
121
- .ok_or_else(|| {
122
- MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
123
- })?
124
- .funcall("to_s", ())
125
- .map(|s| Some(s))
126
- } else {
127
- Err(MagnusError::new(
128
- magnus::exception::type_error(),
129
- "Value must be a String or Symbol",
130
- ))
131
- }
132
- }
133
-
134
149
  pub enum WriterOutput {
135
150
  File(ArrowWriter<Box<dyn SendableWrite>>),
136
151
  TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
@@ -157,126 +172,54 @@ impl WriterOutput {
157
172
  }
158
173
  }
159
174
 
160
- pub struct ParquetErrorWrapper(pub ParquetError);
161
-
162
- impl From<ParquetErrorWrapper> for MagnusError {
163
- fn from(err: ParquetErrorWrapper) -> Self {
164
- MagnusError::new(
165
- magnus::exception::runtime_error(),
166
- format!("Parquet error: {}", err.0),
167
- )
168
- }
169
- }
170
-
171
175
  pub struct ColumnCollector<'a> {
176
+ pub ruby: &'a Ruby,
172
177
  pub name: String,
173
178
  pub type_: ParquetSchemaType<'a>,
174
179
  pub format: Option<String>,
180
+ pub nullable: bool,
175
181
  pub values: Vec<crate::types::ParquetValue>,
176
182
  }
177
183
 
178
184
  impl<'a> ColumnCollector<'a> {
179
- pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
185
+ pub fn new(
186
+ ruby: &'a Ruby,
187
+ name: String,
188
+ type_: ParquetSchemaType<'a>,
189
+ format: Option<String>,
190
+ nullable: bool,
191
+ ) -> Self {
180
192
  Self {
193
+ ruby,
181
194
  name,
182
195
  type_,
183
196
  format,
197
+ nullable,
184
198
  values: Vec::new(),
185
199
  }
186
200
  }
187
201
 
188
202
  pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
189
203
  use crate::types::ParquetValue;
190
- use crate::{
191
- convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
192
- convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
193
- NumericConverter,
194
- };
195
204
 
196
205
  if value.is_nil() {
197
- self.values.push(ParquetValue::Null);
198
- return Ok(());
206
+ if !self.nullable {
207
+ // For non-nullable fields, raise an error
208
+ return Err(MagnusError::new(
209
+ magnus::exception::runtime_error(),
210
+ "Cannot write nil value for non-nullable field",
211
+ ));
212
+ }
199
213
  }
200
214
 
201
- let parquet_value = match &self.type_ {
202
- ParquetSchemaType::Int8 => {
203
- let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
204
- ParquetValue::Int8(v)
205
- }
206
- ParquetSchemaType::Int16 => {
207
- let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
208
- ParquetValue::Int16(v)
209
- }
210
- ParquetSchemaType::Int32 => {
211
- let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
212
- ParquetValue::Int32(v)
213
- }
214
- ParquetSchemaType::Int64 => {
215
- let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
216
- ParquetValue::Int64(v)
217
- }
218
- ParquetSchemaType::UInt8 => {
219
- let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
220
- ParquetValue::UInt8(v)
221
- }
222
- ParquetSchemaType::UInt16 => {
223
- let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
224
- ParquetValue::UInt16(v)
225
- }
226
- ParquetSchemaType::UInt32 => {
227
- let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
228
- ParquetValue::UInt32(v)
229
- }
230
- ParquetSchemaType::UInt64 => {
231
- let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
232
- ParquetValue::UInt64(v)
233
- }
234
- ParquetSchemaType::Float => {
235
- let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
236
- ParquetValue::Float32(v)
237
- }
238
- ParquetSchemaType::Double => {
239
- let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
240
- ParquetValue::Float64(v)
241
- }
242
- ParquetSchemaType::String => {
243
- let v = convert_to_string(value)?;
244
- ParquetValue::String(v)
245
- }
246
- ParquetSchemaType::Binary => {
247
- let v = convert_to_binary(value)?;
248
- ParquetValue::Bytes(v)
249
- }
250
- ParquetSchemaType::Boolean => {
251
- let v = convert_to_boolean(value)?;
252
- ParquetValue::Boolean(v)
253
- }
254
- ParquetSchemaType::Date32 => {
255
- let v = convert_to_date32(value, self.format.as_deref())?;
256
- ParquetValue::Date32(v)
257
- }
258
- ParquetSchemaType::TimestampMillis => {
259
- let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
260
- ParquetValue::TimestampMillis(v, None)
261
- }
262
- ParquetSchemaType::TimestampMicros => {
263
- let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
264
- ParquetValue::TimestampMicros(v, None)
265
- }
266
- ParquetSchemaType::List(list_field) => {
267
- let values = convert_to_list(value, list_field)?;
268
- ParquetValue::List(values)
269
- }
270
- ParquetSchemaType::Map(map_field) => {
271
- let map = convert_to_map(value, map_field)?;
272
- ParquetValue::Map(map)
273
- }
274
- };
215
+ // For all other types, proceed as normal
216
+ let parquet_value =
217
+ ParquetValue::from_value(self.ruby, value, &self.type_, self.format.as_deref())?;
275
218
  self.values.push(parquet_value);
276
219
  Ok(())
277
220
  }
278
221
 
279
- pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
222
+ pub fn take_array(&mut self) -> Result<Arc<dyn Array>, ParquetGemError> {
280
223
  let values = std::mem::take(&mut self.values);
281
224
  crate::convert_parquet_values_to_arrow(values, &self.type_)
282
225
  }
@@ -1,12 +1,12 @@
1
+ use crate::ParserResultType;
1
2
  use magnus::{
2
3
  scan_args::{get_kwargs, scan_args},
3
4
  value::ReprValue,
4
5
  Error, RString, Ruby, Symbol, Value,
5
6
  };
6
7
 
7
- use crate::ParserResultType;
8
-
9
- fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
+ /// Convert a Ruby Value to a String, handling both String and Symbol types
9
+ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
10
10
  if value.is_nil() {
11
11
  Ok(None)
12
12
  } else if value.is_kind_of(ruby.class_string()) {
@@ -33,9 +33,10 @@ pub struct ParquetRowsArgs {
33
33
  pub result_type: ParserResultType,
34
34
  pub columns: Option<Vec<String>>,
35
35
  pub strict: bool,
36
+ pub logger: Option<Value>,
36
37
  }
37
38
 
38
- /// Parse common arguments for CSV parsing
39
+ /// Parse common arguments for parquet row iteration
39
40
  pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
40
41
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
41
42
  let (to_read,) = parsed_args.required;
@@ -47,12 +48,13 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
47
48
  Option<Option<Value>>,
48
49
  Option<Option<Vec<String>>>,
49
50
  Option<Option<bool>>,
51
+ Option<Option<Value>>,
50
52
  ),
51
53
  (),
52
54
  >(
53
55
  parsed_args.keywords,
54
56
  &[],
55
- &["result_type", "columns", "strict"],
57
+ &["result_type", "columns", "strict", "logger"],
56
58
  )?;
57
59
 
58
60
  let result_type: ParserResultType = match kwargs
@@ -84,12 +86,14 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
84
86
  };
85
87
 
86
88
  let strict = kwargs.optional.2.flatten().unwrap_or(true);
89
+ let logger = kwargs.optional.3.flatten();
87
90
 
88
91
  Ok(ParquetRowsArgs {
89
92
  to_read,
90
93
  result_type,
91
94
  columns: kwargs.optional.1.flatten(),
92
95
  strict,
96
+ logger,
93
97
  })
94
98
  }
95
99
 
@@ -100,9 +104,10 @@ pub struct ParquetColumnsArgs {
100
104
  pub columns: Option<Vec<String>>,
101
105
  pub batch_size: Option<usize>,
102
106
  pub strict: bool,
107
+ pub logger: Option<Value>,
103
108
  }
104
109
 
105
- /// Parse common arguments for CSV parsing
110
+ /// Parse common arguments for parquet column iteration
106
111
  pub fn parse_parquet_columns_args(
107
112
  ruby: &Ruby,
108
113
  args: &[Value],
@@ -118,12 +123,13 @@ pub fn parse_parquet_columns_args(
118
123
  Option<Option<Vec<String>>>,
119
124
  Option<Option<usize>>,
120
125
  Option<Option<bool>>,
126
+ Option<Option<Value>>,
121
127
  ),
122
128
  (),
123
129
  >(
124
130
  parsed_args.keywords,
125
131
  &[],
126
- &["result_type", "columns", "batch_size", "strict"],
132
+ &["result_type", "columns", "batch_size", "strict", "logger"],
127
133
  )?;
128
134
 
129
135
  let result_type: ParserResultType = match kwargs
@@ -154,11 +160,25 @@ pub fn parse_parquet_columns_args(
154
160
  None => ParserResultType::Hash,
155
161
  };
156
162
 
163
+ let batch_size = kwargs.optional.2.flatten();
164
+ if let Some(sz) = batch_size {
165
+ if sz <= 0 {
166
+ return Err(Error::new(
167
+ ruby.exception_arg_error(),
168
+ format!("batch_size must be > 0, got {}", sz),
169
+ ));
170
+ }
171
+ }
172
+
173
+ let strict = kwargs.optional.3.flatten().unwrap_or(true);
174
+ let logger = kwargs.optional.4.flatten();
175
+
157
176
  Ok(ParquetColumnsArgs {
158
177
  to_read,
159
178
  result_type,
160
179
  columns: kwargs.optional.1.flatten(),
161
- batch_size: kwargs.optional.2.flatten(),
162
- strict: kwargs.optional.3.flatten().unwrap_or(true),
180
+ batch_size,
181
+ strict,
182
+ logger,
163
183
  })
164
184
  }