parquet 0.0.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,270 @@
1
+ use std::{
2
+ io::{self, Write},
3
+ str::FromStr,
4
+ sync::Arc,
5
+ };
6
+
7
+ use arrow_array::{Array, RecordBatch};
8
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
9
+ use parquet::{arrow::ArrowWriter, errors::ParquetError};
10
+ use tempfile::NamedTempFile;
11
+
12
+ use crate::types::{ListField, MapField, ParquetSchemaType};
13
+
14
+ #[derive(Debug)]
15
+ pub struct SchemaField {
16
+ pub name: String,
17
+ pub type_: ParquetSchemaType,
18
+ }
19
+
20
+ #[derive(Debug)]
21
+ pub struct ParquetWriteArgs {
22
+ pub read_from: Value,
23
+ pub write_to: Value,
24
+ pub schema: Vec<SchemaField>,
25
+ pub batch_size: Option<usize>,
26
+ }
27
+
28
/// Combines [`Write`] and [`Send`] into one trait so the pair can be named as a
/// single trait object (`Box<dyn SendableWrite>`).
pub trait SendableWrite: Send + Write {}

// Blanket implementation: every writer that is `Send` is automatically a `SendableWrite`.
impl<T> SendableWrite for T where T: Send + Write {}
30
+
31
+ pub struct IoLikeValue(pub(crate) Value);
32
+
33
+ impl Write for IoLikeValue {
34
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
35
+ let ruby_bytes = RString::from_slice(buf);
36
+
37
+ let bytes_written = self
38
+ .0
39
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
40
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
41
+
42
+ Ok(bytes_written)
43
+ }
44
+
45
+ fn flush(&mut self) -> Result<(), io::Error> {
46
+ self.0
47
+ .funcall::<_, _, Value>("flush", ())
48
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
49
+
50
+ Ok(())
51
+ }
52
+ }
53
+
54
+ impl FromStr for ParquetSchemaType {
55
+ type Err = MagnusError;
56
+
57
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
58
+ match s {
59
+ "int8" => Ok(ParquetSchemaType::Int8),
60
+ "int16" => Ok(ParquetSchemaType::Int16),
61
+ "int32" => Ok(ParquetSchemaType::Int32),
62
+ "int64" => Ok(ParquetSchemaType::Int64),
63
+ "uint8" => Ok(ParquetSchemaType::UInt8),
64
+ "uint16" => Ok(ParquetSchemaType::UInt16),
65
+ "uint32" => Ok(ParquetSchemaType::UInt32),
66
+ "uint64" => Ok(ParquetSchemaType::UInt64),
67
+ "float" | "float32" => Ok(ParquetSchemaType::Float),
68
+ "double" | "float64" => Ok(ParquetSchemaType::Double),
69
+ "string" | "utf8" => Ok(ParquetSchemaType::String),
70
+ "binary" => Ok(ParquetSchemaType::Binary),
71
+ "boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
72
+ "date32" => Ok(ParquetSchemaType::Date32),
73
+ "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
74
+ "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
75
+ "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
76
+ item_type: ParquetSchemaType::Int8,
77
+ }))),
78
+ "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
79
+ key_type: ParquetSchemaType::String,
80
+ value_type: ParquetSchemaType::Int8,
81
+ }))),
82
+ _ => Err(MagnusError::new(
83
+ magnus::exception::runtime_error(),
84
+ format!("Invalid schema type: {}", s),
85
+ )),
86
+ }
87
+ }
88
+ }
89
+
90
+ impl TryConvert for ParquetSchemaType {
91
+ fn try_convert(value: Value) -> Result<Self, MagnusError> {
92
+ let ruby = unsafe { Ruby::get_unchecked() };
93
+ let schema_type = parse_string_or_symbol(&ruby, value)?;
94
+
95
+ schema_type.unwrap().parse()
96
+ }
97
+ }
98
+
99
+ // We know this type is safe to move between threads because it's just an enum
100
+ // with simple primitive types and strings
101
+ unsafe impl Send for ParquetSchemaType {}
102
+
103
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
104
+ if value.is_nil() {
105
+ Ok(None)
106
+ } else if value.is_kind_of(ruby.class_string()) {
107
+ RString::from_value(value)
108
+ .ok_or_else(|| {
109
+ MagnusError::new(magnus::exception::type_error(), "Invalid string value")
110
+ })?
111
+ .to_string()
112
+ .map(|s| Some(s))
113
+ } else if value.is_kind_of(ruby.class_symbol()) {
114
+ Symbol::from_value(value)
115
+ .ok_or_else(|| {
116
+ MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
117
+ })?
118
+ .funcall("to_s", ())
119
+ .map(|s| Some(s))
120
+ } else {
121
+ Err(MagnusError::new(
122
+ magnus::exception::type_error(),
123
+ "Value must be a String or Symbol",
124
+ ))
125
+ }
126
+ }
127
+
128
+ pub enum WriterOutput {
129
+ File(ArrowWriter<Box<dyn SendableWrite>>),
130
+ TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
131
+ }
132
+
133
+ impl WriterOutput {
134
+ pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ParquetError> {
135
+ match self {
136
+ WriterOutput::File(writer) | WriterOutput::TempFile(writer, _) => writer.write(batch),
137
+ }
138
+ }
139
+
140
+ pub fn close(self) -> Result<Option<NamedTempFile>, ParquetError> {
141
+ match self {
142
+ WriterOutput::File(writer) => {
143
+ writer.close()?;
144
+ Ok(None)
145
+ }
146
+ WriterOutput::TempFile(writer, temp_file) => {
147
+ writer.close()?;
148
+ Ok(Some(temp_file))
149
+ }
150
+ }
151
+ }
152
+ }
153
+
154
+ pub struct ParquetErrorWrapper(pub ParquetError);
155
+
156
+ impl From<ParquetErrorWrapper> for MagnusError {
157
+ fn from(err: ParquetErrorWrapper) -> Self {
158
+ MagnusError::new(
159
+ magnus::exception::runtime_error(),
160
+ format!("Parquet error: {}", err.0),
161
+ )
162
+ }
163
+ }
164
+
165
+ pub struct ColumnCollector {
166
+ pub name: String,
167
+ pub type_: ParquetSchemaType,
168
+ pub values: Vec<crate::types::ParquetValue>,
169
+ }
170
+
171
+ impl ColumnCollector {
172
+ pub fn new(name: String, type_: ParquetSchemaType) -> Self {
173
+ Self {
174
+ name,
175
+ type_,
176
+ values: Vec::new(),
177
+ }
178
+ }
179
+
180
+ pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
181
+ use crate::types::ParquetValue;
182
+ use crate::{
183
+ convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
184
+ convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
185
+ NumericConverter,
186
+ };
187
+
188
+ let parquet_value = match &self.type_ {
189
+ ParquetSchemaType::Int8 => {
190
+ let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
191
+ ParquetValue::Int8(v)
192
+ }
193
+ ParquetSchemaType::Int16 => {
194
+ let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
195
+ ParquetValue::Int16(v)
196
+ }
197
+ ParquetSchemaType::Int32 => {
198
+ let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
199
+ ParquetValue::Int32(v)
200
+ }
201
+ ParquetSchemaType::Int64 => {
202
+ let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
203
+ ParquetValue::Int64(v)
204
+ }
205
+ ParquetSchemaType::UInt8 => {
206
+ let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
207
+ ParquetValue::UInt8(v)
208
+ }
209
+ ParquetSchemaType::UInt16 => {
210
+ let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
211
+ ParquetValue::UInt16(v)
212
+ }
213
+ ParquetSchemaType::UInt32 => {
214
+ let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
215
+ ParquetValue::UInt32(v)
216
+ }
217
+ ParquetSchemaType::UInt64 => {
218
+ let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
219
+ ParquetValue::UInt64(v)
220
+ }
221
+ ParquetSchemaType::Float => {
222
+ let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
223
+ ParquetValue::Float32(v)
224
+ }
225
+ ParquetSchemaType::Double => {
226
+ let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
227
+ ParquetValue::Float64(v)
228
+ }
229
+ ParquetSchemaType::String => {
230
+ let v = String::try_convert(value)?;
231
+ ParquetValue::String(v)
232
+ }
233
+ ParquetSchemaType::Binary => {
234
+ let v = convert_to_binary(value)?;
235
+ ParquetValue::Bytes(v)
236
+ }
237
+ ParquetSchemaType::Boolean => {
238
+ let v = convert_to_boolean(value)?;
239
+ ParquetValue::Boolean(v)
240
+ }
241
+ ParquetSchemaType::Date32 => {
242
+ let v = convert_to_date32(value)?;
243
+ ParquetValue::Date32(v)
244
+ }
245
+ ParquetSchemaType::TimestampMillis => {
246
+ let v = convert_to_timestamp_millis(value)?;
247
+ ParquetValue::TimestampMillis(v, None)
248
+ }
249
+ ParquetSchemaType::TimestampMicros => {
250
+ let v = convert_to_timestamp_micros(value)?;
251
+ ParquetValue::TimestampMicros(v, None)
252
+ }
253
+ ParquetSchemaType::List(list_field) => {
254
+ let values = convert_to_list(value, list_field)?;
255
+ ParquetValue::List(values)
256
+ }
257
+ ParquetSchemaType::Map(map_field) => {
258
+ let map = convert_to_map(value, map_field)?;
259
+ ParquetValue::Map(map)
260
+ }
261
+ };
262
+ self.values.push(parquet_value);
263
+ Ok(())
264
+ }
265
+
266
+ pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
267
+ let values = std::mem::take(&mut self.values);
268
+ crate::convert_parquet_values_to_arrow(values, &self.type_)
269
+ }
270
+ }
@@ -4,6 +4,8 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::ParserResultType;
8
+
7
9
  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
10
  if value.is_nil() {
9
11
  Ok(None)
@@ -28,7 +30,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
28
30
  #[derive(Debug)]
29
31
  pub struct ParquetRowsArgs {
30
32
  pub to_read: Value,
31
- pub result_type: String,
33
+ pub result_type: ParserResultType,
32
34
  pub columns: Option<Vec<String>>,
33
35
  }
34
36
 
@@ -43,28 +45,31 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
43
45
  &["result_type", "columns"],
44
46
  )?;
45
47
 
46
- let result_type = match kwargs
48
+ let result_type: ParserResultType = match kwargs
47
49
  .optional
48
50
  .0
49
51
  .map(|value| parse_string_or_symbol(ruby, value))
50
52
  {
51
- Some(Ok(Some(parsed))) => match parsed.as_str() {
52
- "hash" | "array" => parsed,
53
- _ => {
54
- return Err(Error::new(
55
- magnus::exception::runtime_error(),
56
- "result_type must be either 'hash' or 'array'",
57
- ))
58
- }
59
- },
60
- Some(Ok(None)) => String::from("hash"),
53
+ Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
54
+ Error::new(
55
+ magnus::exception::runtime_error(),
56
+ format!(
57
+ "Invalid result type: {e}. Must be one of {}",
58
+ ParserResultType::iter()
59
+ .map(|v| v.to_string())
60
+ .collect::<Vec<_>>()
61
+ .join(", ")
62
+ ),
63
+ )
64
+ })?,
65
+ Some(Ok(None)) => ParserResultType::Hash,
61
66
  Some(Err(_)) => {
62
67
  return Err(Error::new(
63
68
  magnus::exception::type_error(),
64
69
  "result_type must be a String or Symbol",
65
70
  ))
66
71
  }
67
- None => String::from("hash"),
72
+ None => ParserResultType::Hash,
68
73
  };
69
74
 
70
75
  Ok(ParquetRowsArgs {
@@ -77,7 +82,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
77
82
  #[derive(Debug)]
78
83
  pub struct ParquetColumnsArgs {
79
84
  pub to_read: Value,
80
- pub result_type: String,
85
+ pub result_type: ParserResultType,
81
86
  pub columns: Option<Vec<String>>,
82
87
  pub batch_size: Option<usize>,
83
88
  }
@@ -96,28 +101,31 @@ pub fn parse_parquet_columns_args(
96
101
  &["result_type", "columns", "batch_size"],
97
102
  )?;
98
103
 
99
- let result_type = match kwargs
104
+ let result_type: ParserResultType = match kwargs
100
105
  .optional
101
106
  .0
102
107
  .map(|value| parse_string_or_symbol(ruby, value))
103
108
  {
104
- Some(Ok(Some(parsed))) => match parsed.as_str() {
105
- "hash" | "array" => parsed,
106
- _ => {
107
- return Err(Error::new(
108
- magnus::exception::runtime_error(),
109
- "result_type must be either 'hash' or 'array'",
110
- ))
111
- }
112
- },
113
- Some(Ok(None)) => String::from("hash"),
109
+ Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
110
+ Error::new(
111
+ magnus::exception::runtime_error(),
112
+ format!(
113
+ "Invalid result type: {e}. Must be one of {}",
114
+ ParserResultType::iter()
115
+ .map(|v| v.to_string())
116
+ .collect::<Vec<_>>()
117
+ .join(", ")
118
+ ),
119
+ )
120
+ })?,
121
+ Some(Ok(None)) => ParserResultType::Hash,
114
122
  Some(Err(_)) => {
115
123
  return Err(Error::new(
116
124
  magnus::exception::type_error(),
117
125
  "result_type must be a String or Symbol",
118
126
  ))
119
127
  }
120
- None => String::from("hash"),
128
+ None => ParserResultType::Hash,
121
129
  };
122
130
 
123
131
  Ok(ParquetColumnsArgs {