parquet 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/types/mod.rs +1 -1
- data/ext/parquet/src/types/parquet_value.rs +1 -1
- data/ext/parquet/src/types/type_conversion.rs +16 -1
- data/ext/parquet/src/types/writer_types.rs +2 -2
- data/ext/parquet/src/writer/mod.rs +82 -57
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e05c4544ed72011e1a877befa04e9c29f9d61d6f5663095f3356735c2411299
|
4
|
+
data.tar.gz: 6642f5575e9802a3eb05af6a4523fcc198644a6b465a2fa911c35985f27943f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b68013d8de3a6454d4ebddaae549fb4c02d0a6102718259b9a43133ceed8803060fc04fd06287e041ee0afdf0cd6c3556ff2c6a41d61a6acd226ef239caab090
|
7
|
+
data.tar.gz: 074f9faf08f2a178322cf74667ca93c24561cb49b4b91f7f98b653ea61a561628c3b06c4afe9d0a6e13410479fbd87482b88ee09d0dfbc91c554b820350fdad3
|
@@ -22,7 +22,7 @@ use arrow_array::{
|
|
22
22
|
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
23
23
|
};
|
24
24
|
use arrow_schema::{DataType, TimeUnit};
|
25
|
-
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby,
|
25
|
+
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
|
26
26
|
use parquet::data_type::Decimal;
|
27
27
|
use parquet::record::Field;
|
28
28
|
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
@@ -3,7 +3,7 @@ use std::str::FromStr;
|
|
3
3
|
use super::*;
|
4
4
|
use arrow_array::builder::*;
|
5
5
|
use jiff::tz::{Offset, TimeZone};
|
6
|
-
use magnus::{RArray, TryConvert};
|
6
|
+
use magnus::{RArray, RString, TryConvert};
|
7
7
|
|
8
8
|
pub struct NumericConverter<T> {
|
9
9
|
_phantom: std::marker::PhantomData<T>,
|
@@ -194,6 +194,21 @@ pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
|
|
194
194
|
}
|
195
195
|
}
|
196
196
|
|
197
|
+
pub fn convert_to_string(value: Value) -> Result<String, MagnusError> {
|
198
|
+
String::try_convert(value).or_else(|_| {
|
199
|
+
if value.respond_to("to_s", false)? {
|
200
|
+
value.funcall::<_, _, RString>("to_s", ())?.to_string()
|
201
|
+
} else if value.respond_to("to_str", false)? {
|
202
|
+
value.funcall::<_, _, RString>("to_str", ())?.to_string()
|
203
|
+
} else {
|
204
|
+
Err(MagnusError::new(
|
205
|
+
magnus::exception::type_error(),
|
206
|
+
format!("Not able to convert {:?} to String", value),
|
207
|
+
))
|
208
|
+
}
|
209
|
+
})
|
210
|
+
}
|
211
|
+
|
197
212
|
pub fn convert_to_list(
|
198
213
|
value: Value,
|
199
214
|
list_field: &ListField,
|
@@ -9,7 +9,7 @@ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryC
|
|
9
9
|
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
10
10
|
use tempfile::NamedTempFile;
|
11
11
|
|
12
|
-
use crate::types::{ListField, MapField, ParquetSchemaType};
|
12
|
+
use crate::types::{convert_to_string, ListField, MapField, ParquetSchemaType};
|
13
13
|
|
14
14
|
#[derive(Debug)]
|
15
15
|
pub struct SchemaField<'a> {
|
@@ -240,7 +240,7 @@ impl<'a> ColumnCollector<'a> {
|
|
240
240
|
ParquetValue::Float64(v)
|
241
241
|
}
|
242
242
|
ParquetSchemaType::String => {
|
243
|
-
let v =
|
243
|
+
let v = convert_to_string(value)?;
|
244
244
|
ParquetValue::String(v)
|
245
245
|
}
|
246
246
|
ParquetSchemaType::Binary => {
|
@@ -42,7 +42,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
|
|
42
42
|
|
43
43
|
let kwargs = get_kwargs::<
|
44
44
|
_,
|
45
|
-
(
|
45
|
+
(Option<RArray>, Value),
|
46
46
|
(
|
47
47
|
Option<Option<usize>>,
|
48
48
|
Option<Option<usize>>,
|
@@ -61,71 +61,88 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
|
|
61
61
|
],
|
62
62
|
)?;
|
63
63
|
|
64
|
-
let
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
})?;
|
70
|
-
|
71
|
-
let mut schema = Vec::with_capacity(schema_array.len());
|
72
|
-
|
73
|
-
for (idx, field_hash) in schema_array.into_iter().enumerate() {
|
74
|
-
if !field_hash.is_kind_of(ruby.class_hash()) {
|
75
|
-
return Err(MagnusError::new(
|
76
|
-
magnus::exception::type_error(),
|
77
|
-
format!("schema[{}] must be a hash", idx),
|
78
|
-
));
|
79
|
-
}
|
80
|
-
|
81
|
-
let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
|
82
|
-
if entries.len() != 1 {
|
83
|
-
return Err(MagnusError::new(
|
64
|
+
let schema = if kwargs.required.0.is_none() || kwargs.required.0.unwrap().is_empty() {
|
65
|
+
// If schema is nil, we need to peek at the first value to determine column count
|
66
|
+
let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
|
67
|
+
let array = RArray::from_value(first_value).ok_or_else(|| {
|
68
|
+
MagnusError::new(
|
84
69
|
magnus::exception::type_error(),
|
85
|
-
|
86
|
-
)
|
87
|
-
}
|
70
|
+
"First value must be an array when schema is not provided",
|
71
|
+
)
|
72
|
+
})?;
|
88
73
|
|
89
|
-
|
90
|
-
|
74
|
+
// Generate field names f0, f1, f2, etc.
|
75
|
+
(0..array.len())
|
76
|
+
.map(|i| SchemaField {
|
77
|
+
name: format!("f{}", i),
|
78
|
+
type_: ParquetSchemaType::String,
|
79
|
+
format: None,
|
80
|
+
})
|
81
|
+
.collect()
|
82
|
+
} else {
|
83
|
+
let schema_array = kwargs.required.0.unwrap();
|
91
84
|
|
92
|
-
let
|
93
|
-
let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
|
94
|
-
let mut type_str = None;
|
95
|
-
let mut format_str = None;
|
85
|
+
let mut schema = Vec::with_capacity(schema_array.len());
|
96
86
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
return Err(MagnusError::new(
|
104
|
-
magnus::exception::type_error(),
|
105
|
-
format!("Unknown key '{}' in type definition", key),
|
106
|
-
))
|
107
|
-
}
|
108
|
-
}
|
87
|
+
for (idx, field_hash) in schema_array.into_iter().enumerate() {
|
88
|
+
if !field_hash.is_kind_of(ruby.class_hash()) {
|
89
|
+
return Err(MagnusError::new(
|
90
|
+
magnus::exception::type_error(),
|
91
|
+
format!("schema[{}] must be a hash", idx),
|
92
|
+
));
|
109
93
|
}
|
110
94
|
|
111
|
-
let
|
112
|
-
|
95
|
+
let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
|
96
|
+
if entries.len() != 1 {
|
97
|
+
return Err(MagnusError::new(
|
113
98
|
magnus::exception::type_error(),
|
114
|
-
"
|
115
|
-
)
|
116
|
-
}
|
99
|
+
format!("schema[{}] must contain exactly one key-value pair", idx),
|
100
|
+
));
|
101
|
+
}
|
117
102
|
|
118
|
-
(
|
119
|
-
|
120
|
-
|
121
|
-
|
103
|
+
let (name, type_value) = &entries[0];
|
104
|
+
let name = String::try_convert(name.clone())?;
|
105
|
+
|
106
|
+
let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
|
107
|
+
let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
|
108
|
+
let mut type_str = None;
|
109
|
+
let mut format_str = None;
|
110
|
+
|
111
|
+
for (key, value) in type_hash {
|
112
|
+
let key = String::try_convert(key)?;
|
113
|
+
match key.as_str() {
|
114
|
+
"type" => type_str = Some(value),
|
115
|
+
"format" => format_str = Some(String::try_convert(value)?),
|
116
|
+
_ => {
|
117
|
+
return Err(MagnusError::new(
|
118
|
+
magnus::exception::type_error(),
|
119
|
+
format!("Unknown key '{}' in type definition", key),
|
120
|
+
))
|
121
|
+
}
|
122
|
+
}
|
123
|
+
}
|
122
124
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
125
|
+
let type_str = type_str.ok_or_else(|| {
|
126
|
+
MagnusError::new(
|
127
|
+
magnus::exception::type_error(),
|
128
|
+
"Missing 'type' in type definition",
|
129
|
+
)
|
130
|
+
})?;
|
131
|
+
|
132
|
+
(ParquetSchemaType::try_convert(type_str)?, format_str)
|
133
|
+
} else {
|
134
|
+
(ParquetSchemaType::try_convert(type_value.clone())?, None)
|
135
|
+
};
|
136
|
+
|
137
|
+
schema.push(SchemaField {
|
138
|
+
name,
|
139
|
+
type_,
|
140
|
+
format,
|
141
|
+
});
|
142
|
+
}
|
143
|
+
|
144
|
+
schema
|
145
|
+
};
|
129
146
|
|
130
147
|
Ok(ParquetWriteArgs {
|
131
148
|
read_from,
|
@@ -396,6 +413,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
|
|
396
413
|
MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
|
397
414
|
})?;
|
398
415
|
|
416
|
+
// Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
|
417
|
+
batch_array.entry::<RArray>(0).map_err(|_| {
|
418
|
+
MagnusError::new(
|
419
|
+
ruby.exception_type_error(),
|
420
|
+
"When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
|
421
|
+
)
|
422
|
+
})?;
|
423
|
+
|
399
424
|
// Validate batch length matches schema
|
400
425
|
if batch_array.len() != schema.len() {
|
401
426
|
return Err(MagnusError::new(
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|