parquet 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/ruby_reader.rs +34 -18
- data/ext/parquet/src/types/mod.rs +1 -1
- data/ext/parquet/src/types/parquet_value.rs +1 -1
- data/ext/parquet/src/types/type_conversion.rs +16 -1
- data/ext/parquet/src/types/writer_types.rs +2 -2
- data/ext/parquet/src/writer/mod.rs +82 -57
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e05c4544ed72011e1a877befa04e9c29f9d61d6f5663095f3356735c2411299
|
4
|
+
data.tar.gz: 6642f5575e9802a3eb05af6a4523fcc198644a6b465a2fa911c35985f27943f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b68013d8de3a6454d4ebddaae549fb4c02d0a6102718259b9a43133ceed8803060fc04fd06287e041ee0afdf0cd6c3556ff2c6a41d61a6acd226ef239caab090
|
7
|
+
data.tar.gz: 074f9faf08f2a178322cf74667ca93c24561cb49b4b91f7f98b653ea61a561628c3b06c4afe9d0a6e13410479fbd87482b88ee09d0dfbc91c554b820350fdad3
|
@@ -184,37 +184,53 @@ impl Length for RubyReader {
|
|
184
184
|
}
|
185
185
|
RubyReader::RubyIoLike { inner } => {
|
186
186
|
let unwrapped_inner = ruby.get_inner(*inner);
|
187
|
-
let current_pos = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 1));
|
188
187
|
|
189
|
-
|
188
|
+
// Get current position
|
189
|
+
let current_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
190
|
+
Ok(pos) => pos,
|
191
|
+
Err(e) => {
|
192
|
+
eprintln!("Error seeking: {}", e);
|
193
|
+
return 0;
|
194
|
+
}
|
195
|
+
};
|
196
|
+
|
197
|
+
// Seek to end
|
198
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 2)) {
|
190
199
|
eprintln!("Error seeking: {}", e);
|
191
200
|
return 0;
|
192
201
|
}
|
193
202
|
|
194
|
-
|
203
|
+
// Offset at the end of the file is the length of the file
|
204
|
+
let size = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
205
|
+
Ok(pos) => pos,
|
206
|
+
Err(e) => {
|
207
|
+
eprintln!("Error seeking: {}", e);
|
208
|
+
return 0;
|
209
|
+
}
|
210
|
+
};
|
211
|
+
|
212
|
+
// Restore original position
|
213
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (current_pos, 0)) {
|
195
214
|
eprintln!("Error seeking: {}", e);
|
196
215
|
return 0;
|
197
216
|
}
|
198
217
|
|
199
|
-
let
|
200
|
-
|
201
|
-
match size {
|
202
|
-
Ok(size) => {
|
203
|
-
// Restore original position
|
204
|
-
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>(
|
205
|
-
"seek",
|
206
|
-
(current_pos.expect("Current position is not set!"), 0),
|
207
|
-
) {
|
208
|
-
eprintln!("Error seeking: {}", e);
|
209
|
-
return 0;
|
210
|
-
}
|
211
|
-
size
|
212
|
-
}
|
218
|
+
let final_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
219
|
+
Ok(pos) => pos,
|
213
220
|
Err(e) => {
|
214
221
|
eprintln!("Error seeking: {}", e);
|
215
222
|
return 0;
|
216
223
|
}
|
217
|
-
}
|
224
|
+
};
|
225
|
+
|
226
|
+
assert_eq!(
|
227
|
+
current_pos, final_pos,
|
228
|
+
"Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
|
229
|
+
current_pos,
|
230
|
+
final_pos
|
231
|
+
);
|
232
|
+
|
233
|
+
size
|
218
234
|
}
|
219
235
|
}
|
220
236
|
}
|
@@ -22,7 +22,7 @@ use arrow_array::{
|
|
22
22
|
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
23
23
|
};
|
24
24
|
use arrow_schema::{DataType, TimeUnit};
|
25
|
-
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby,
|
25
|
+
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
|
26
26
|
use parquet::data_type::Decimal;
|
27
27
|
use parquet::record::Field;
|
28
28
|
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
@@ -3,7 +3,7 @@ use std::str::FromStr;
|
|
3
3
|
use super::*;
|
4
4
|
use arrow_array::builder::*;
|
5
5
|
use jiff::tz::{Offset, TimeZone};
|
6
|
-
use magnus::{RArray, TryConvert};
|
6
|
+
use magnus::{RArray, RString, TryConvert};
|
7
7
|
|
8
8
|
pub struct NumericConverter<T> {
|
9
9
|
_phantom: std::marker::PhantomData<T>,
|
@@ -194,6 +194,21 @@ pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
|
|
194
194
|
}
|
195
195
|
}
|
196
196
|
|
197
|
+
pub fn convert_to_string(value: Value) -> Result<String, MagnusError> {
|
198
|
+
String::try_convert(value).or_else(|_| {
|
199
|
+
if value.respond_to("to_s", false)? {
|
200
|
+
value.funcall::<_, _, RString>("to_s", ())?.to_string()
|
201
|
+
} else if value.respond_to("to_str", false)? {
|
202
|
+
value.funcall::<_, _, RString>("to_str", ())?.to_string()
|
203
|
+
} else {
|
204
|
+
Err(MagnusError::new(
|
205
|
+
magnus::exception::type_error(),
|
206
|
+
format!("Not able to convert {:?} to String", value),
|
207
|
+
))
|
208
|
+
}
|
209
|
+
})
|
210
|
+
}
|
211
|
+
|
197
212
|
pub fn convert_to_list(
|
198
213
|
value: Value,
|
199
214
|
list_field: &ListField,
|
@@ -9,7 +9,7 @@ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryC
|
|
9
9
|
use parquet::{arrow::ArrowWriter, errors::ParquetError};
|
10
10
|
use tempfile::NamedTempFile;
|
11
11
|
|
12
|
-
use crate::types::{ListField, MapField, ParquetSchemaType};
|
12
|
+
use crate::types::{convert_to_string, ListField, MapField, ParquetSchemaType};
|
13
13
|
|
14
14
|
#[derive(Debug)]
|
15
15
|
pub struct SchemaField<'a> {
|
@@ -240,7 +240,7 @@ impl<'a> ColumnCollector<'a> {
|
|
240
240
|
ParquetValue::Float64(v)
|
241
241
|
}
|
242
242
|
ParquetSchemaType::String => {
|
243
|
-
let v =
|
243
|
+
let v = convert_to_string(value)?;
|
244
244
|
ParquetValue::String(v)
|
245
245
|
}
|
246
246
|
ParquetSchemaType::Binary => {
|
@@ -42,7 +42,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
|
|
42
42
|
|
43
43
|
let kwargs = get_kwargs::<
|
44
44
|
_,
|
45
|
-
(
|
45
|
+
(Option<RArray>, Value),
|
46
46
|
(
|
47
47
|
Option<Option<usize>>,
|
48
48
|
Option<Option<usize>>,
|
@@ -61,71 +61,88 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
|
|
61
61
|
],
|
62
62
|
)?;
|
63
63
|
|
64
|
-
let
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
})?;
|
70
|
-
|
71
|
-
let mut schema = Vec::with_capacity(schema_array.len());
|
72
|
-
|
73
|
-
for (idx, field_hash) in schema_array.into_iter().enumerate() {
|
74
|
-
if !field_hash.is_kind_of(ruby.class_hash()) {
|
75
|
-
return Err(MagnusError::new(
|
76
|
-
magnus::exception::type_error(),
|
77
|
-
format!("schema[{}] must be a hash", idx),
|
78
|
-
));
|
79
|
-
}
|
80
|
-
|
81
|
-
let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
|
82
|
-
if entries.len() != 1 {
|
83
|
-
return Err(MagnusError::new(
|
64
|
+
let schema = if kwargs.required.0.is_none() || kwargs.required.0.unwrap().is_empty() {
|
65
|
+
// If schema is nil, we need to peek at the first value to determine column count
|
66
|
+
let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
|
67
|
+
let array = RArray::from_value(first_value).ok_or_else(|| {
|
68
|
+
MagnusError::new(
|
84
69
|
magnus::exception::type_error(),
|
85
|
-
|
86
|
-
)
|
87
|
-
}
|
70
|
+
"First value must be an array when schema is not provided",
|
71
|
+
)
|
72
|
+
})?;
|
88
73
|
|
89
|
-
|
90
|
-
|
74
|
+
// Generate field names f0, f1, f2, etc.
|
75
|
+
(0..array.len())
|
76
|
+
.map(|i| SchemaField {
|
77
|
+
name: format!("f{}", i),
|
78
|
+
type_: ParquetSchemaType::String,
|
79
|
+
format: None,
|
80
|
+
})
|
81
|
+
.collect()
|
82
|
+
} else {
|
83
|
+
let schema_array = kwargs.required.0.unwrap();
|
91
84
|
|
92
|
-
let
|
93
|
-
let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
|
94
|
-
let mut type_str = None;
|
95
|
-
let mut format_str = None;
|
85
|
+
let mut schema = Vec::with_capacity(schema_array.len());
|
96
86
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
return Err(MagnusError::new(
|
104
|
-
magnus::exception::type_error(),
|
105
|
-
format!("Unknown key '{}' in type definition", key),
|
106
|
-
))
|
107
|
-
}
|
108
|
-
}
|
87
|
+
for (idx, field_hash) in schema_array.into_iter().enumerate() {
|
88
|
+
if !field_hash.is_kind_of(ruby.class_hash()) {
|
89
|
+
return Err(MagnusError::new(
|
90
|
+
magnus::exception::type_error(),
|
91
|
+
format!("schema[{}] must be a hash", idx),
|
92
|
+
));
|
109
93
|
}
|
110
94
|
|
111
|
-
let
|
112
|
-
|
95
|
+
let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
|
96
|
+
if entries.len() != 1 {
|
97
|
+
return Err(MagnusError::new(
|
113
98
|
magnus::exception::type_error(),
|
114
|
-
"
|
115
|
-
)
|
116
|
-
}
|
99
|
+
format!("schema[{}] must contain exactly one key-value pair", idx),
|
100
|
+
));
|
101
|
+
}
|
117
102
|
|
118
|
-
(
|
119
|
-
|
120
|
-
|
121
|
-
|
103
|
+
let (name, type_value) = &entries[0];
|
104
|
+
let name = String::try_convert(name.clone())?;
|
105
|
+
|
106
|
+
let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
|
107
|
+
let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
|
108
|
+
let mut type_str = None;
|
109
|
+
let mut format_str = None;
|
110
|
+
|
111
|
+
for (key, value) in type_hash {
|
112
|
+
let key = String::try_convert(key)?;
|
113
|
+
match key.as_str() {
|
114
|
+
"type" => type_str = Some(value),
|
115
|
+
"format" => format_str = Some(String::try_convert(value)?),
|
116
|
+
_ => {
|
117
|
+
return Err(MagnusError::new(
|
118
|
+
magnus::exception::type_error(),
|
119
|
+
format!("Unknown key '{}' in type definition", key),
|
120
|
+
))
|
121
|
+
}
|
122
|
+
}
|
123
|
+
}
|
122
124
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
125
|
+
let type_str = type_str.ok_or_else(|| {
|
126
|
+
MagnusError::new(
|
127
|
+
magnus::exception::type_error(),
|
128
|
+
"Missing 'type' in type definition",
|
129
|
+
)
|
130
|
+
})?;
|
131
|
+
|
132
|
+
(ParquetSchemaType::try_convert(type_str)?, format_str)
|
133
|
+
} else {
|
134
|
+
(ParquetSchemaType::try_convert(type_value.clone())?, None)
|
135
|
+
};
|
136
|
+
|
137
|
+
schema.push(SchemaField {
|
138
|
+
name,
|
139
|
+
type_,
|
140
|
+
format,
|
141
|
+
});
|
142
|
+
}
|
143
|
+
|
144
|
+
schema
|
145
|
+
};
|
129
146
|
|
130
147
|
Ok(ParquetWriteArgs {
|
131
148
|
read_from,
|
@@ -396,6 +413,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
|
|
396
413
|
MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
|
397
414
|
})?;
|
398
415
|
|
416
|
+
// Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
|
417
|
+
batch_array.entry::<RArray>(0).map_err(|_| {
|
418
|
+
MagnusError::new(
|
419
|
+
ruby.exception_type_error(),
|
420
|
+
"When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
|
421
|
+
)
|
422
|
+
})?;
|
423
|
+
|
399
424
|
// Validate batch length matches schema
|
400
425
|
if batch_array.len() != schema.len() {
|
401
426
|
return Err(MagnusError::new(
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|