parquet 0.4.1 → 0.4.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2b5b56cca903ed731d6981d3113e3833a6e6a6a0ffcd301040b32ab0c72bf9c1
4
- data.tar.gz: 0e4486e2a67a051852166ac81754c9bfb2807c2ffa51eeda5edb41050432e930
3
+ metadata.gz: 8e05c4544ed72011e1a877befa04e9c29f9d61d6f5663095f3356735c2411299
4
+ data.tar.gz: 6642f5575e9802a3eb05af6a4523fcc198644a6b465a2fa911c35985f27943f2
5
5
  SHA512:
6
- metadata.gz: affe353c972f130973b309ca1ee928928278254830fa39bee8e4bed5452b5b381e2d27f6986067a0f9b27769a78f195887d371dc4825f61096414af92e9edb94
7
- data.tar.gz: 9c823757be4b81d3ccafb57571c1d98b530a0d10696712083a0447b4871bf90720ba588d8c48777926b3f7a7f2ffede0c2ed7c9a076420b4dea183b7290ba47e
6
+ metadata.gz: b68013d8de3a6454d4ebddaae549fb4c02d0a6102718259b9a43133ceed8803060fc04fd06287e041ee0afdf0cd6c3556ff2c6a41d61a6acd226ef239caab090
7
+ data.tar.gz: 074f9faf08f2a178322cf74667ca93c24561cb49b4b91f7f98b653ea61a561628c3b06c4afe9d0a6e13410479fbd87482b88ee09d0dfbc91c554b820350fdad3
@@ -22,7 +22,7 @@ use arrow_array::{
22
22
  TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
23
23
  };
24
24
  use arrow_schema::{DataType, TimeUnit};
25
- use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
25
+ use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
26
26
  use parquet::data_type::Decimal;
27
27
  use parquet::record::Field;
28
28
  use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
@@ -208,7 +208,7 @@ impl ParquetValue {
208
208
  Ok(ParquetValue::Float64(v))
209
209
  }
210
210
  ParquetSchemaType::String => {
211
- let v = String::try_convert(value)?;
211
+ let v = convert_to_string(value)?;
212
212
  Ok(ParquetValue::String(v))
213
213
  }
214
214
  ParquetSchemaType::Binary => {
@@ -3,7 +3,7 @@ use std::str::FromStr;
3
3
  use super::*;
4
4
  use arrow_array::builder::*;
5
5
  use jiff::tz::{Offset, TimeZone};
6
- use magnus::{RArray, TryConvert};
6
+ use magnus::{RArray, RString, TryConvert};
7
7
 
8
8
  pub struct NumericConverter<T> {
9
9
  _phantom: std::marker::PhantomData<T>,
@@ -194,6 +194,21 @@ pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
194
194
  }
195
195
  }
196
196
 
197
+ pub fn convert_to_string(value: Value) -> Result<String, MagnusError> {
198
+ String::try_convert(value).or_else(|_| {
199
+ if value.respond_to("to_s", false)? {
200
+ value.funcall::<_, _, RString>("to_s", ())?.to_string()
201
+ } else if value.respond_to("to_str", false)? {
202
+ value.funcall::<_, _, RString>("to_str", ())?.to_string()
203
+ } else {
204
+ Err(MagnusError::new(
205
+ magnus::exception::type_error(),
206
+ format!("Not able to convert {:?} to String", value),
207
+ ))
208
+ }
209
+ })
210
+ }
211
+
197
212
  pub fn convert_to_list(
198
213
  value: Value,
199
214
  list_field: &ListField,
@@ -9,7 +9,7 @@ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryC
9
9
  use parquet::{arrow::ArrowWriter, errors::ParquetError};
10
10
  use tempfile::NamedTempFile;
11
11
 
12
- use crate::types::{ListField, MapField, ParquetSchemaType};
12
+ use crate::types::{convert_to_string, ListField, MapField, ParquetSchemaType};
13
13
 
14
14
  #[derive(Debug)]
15
15
  pub struct SchemaField<'a> {
@@ -240,7 +240,7 @@ impl<'a> ColumnCollector<'a> {
240
240
  ParquetValue::Float64(v)
241
241
  }
242
242
  ParquetSchemaType::String => {
243
- let v = String::try_convert(value)?;
243
+ let v = convert_to_string(value)?;
244
244
  ParquetValue::String(v)
245
245
  }
246
246
  ParquetSchemaType::Binary => {
@@ -42,7 +42,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
42
42
 
43
43
  let kwargs = get_kwargs::<
44
44
  _,
45
- (Value, Value),
45
+ (Option<RArray>, Value),
46
46
  (
47
47
  Option<Option<usize>>,
48
48
  Option<Option<usize>>,
@@ -61,71 +61,88 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
61
61
  ],
62
62
  )?;
63
63
 
64
- let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
65
- MagnusError::new(
66
- magnus::exception::type_error(),
67
- "schema must be an array of hashes",
68
- )
69
- })?;
70
-
71
- let mut schema = Vec::with_capacity(schema_array.len());
72
-
73
- for (idx, field_hash) in schema_array.into_iter().enumerate() {
74
- if !field_hash.is_kind_of(ruby.class_hash()) {
75
- return Err(MagnusError::new(
76
- magnus::exception::type_error(),
77
- format!("schema[{}] must be a hash", idx),
78
- ));
79
- }
80
-
81
- let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
82
- if entries.len() != 1 {
83
- return Err(MagnusError::new(
64
+ let schema = if kwargs.required.0.is_none() || kwargs.required.0.unwrap().is_empty() {
65
+ // If schema is nil, we need to peek at the first value to determine column count
66
+ let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
67
+ let array = RArray::from_value(first_value).ok_or_else(|| {
68
+ MagnusError::new(
84
69
  magnus::exception::type_error(),
85
- format!("schema[{}] must contain exactly one key-value pair", idx),
86
- ));
87
- }
70
+ "First value must be an array when schema is not provided",
71
+ )
72
+ })?;
88
73
 
89
- let (name, type_value) = &entries[0];
90
- let name = String::try_convert(name.clone())?;
74
+ // Generate field names f0, f1, f2, etc.
75
+ (0..array.len())
76
+ .map(|i| SchemaField {
77
+ name: format!("f{}", i),
78
+ type_: ParquetSchemaType::String,
79
+ format: None,
80
+ })
81
+ .collect()
82
+ } else {
83
+ let schema_array = kwargs.required.0.unwrap();
91
84
 
92
- let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
93
- let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
94
- let mut type_str = None;
95
- let mut format_str = None;
85
+ let mut schema = Vec::with_capacity(schema_array.len());
96
86
 
97
- for (key, value) in type_hash {
98
- let key = String::try_convert(key)?;
99
- match key.as_str() {
100
- "type" => type_str = Some(value),
101
- "format" => format_str = Some(String::try_convert(value)?),
102
- _ => {
103
- return Err(MagnusError::new(
104
- magnus::exception::type_error(),
105
- format!("Unknown key '{}' in type definition", key),
106
- ))
107
- }
108
- }
87
+ for (idx, field_hash) in schema_array.into_iter().enumerate() {
88
+ if !field_hash.is_kind_of(ruby.class_hash()) {
89
+ return Err(MagnusError::new(
90
+ magnus::exception::type_error(),
91
+ format!("schema[{}] must be a hash", idx),
92
+ ));
109
93
  }
110
94
 
111
- let type_str = type_str.ok_or_else(|| {
112
- MagnusError::new(
95
+ let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
96
+ if entries.len() != 1 {
97
+ return Err(MagnusError::new(
113
98
  magnus::exception::type_error(),
114
- "Missing 'type' in type definition",
115
- )
116
- })?;
99
+ format!("schema[{}] must contain exactly one key-value pair", idx),
100
+ ));
101
+ }
117
102
 
118
- (ParquetSchemaType::try_convert(type_str)?, format_str)
119
- } else {
120
- (ParquetSchemaType::try_convert(type_value.clone())?, None)
121
- };
103
+ let (name, type_value) = &entries[0];
104
+ let name = String::try_convert(name.clone())?;
105
+
106
+ let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
107
+ let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
108
+ let mut type_str = None;
109
+ let mut format_str = None;
110
+
111
+ for (key, value) in type_hash {
112
+ let key = String::try_convert(key)?;
113
+ match key.as_str() {
114
+ "type" => type_str = Some(value),
115
+ "format" => format_str = Some(String::try_convert(value)?),
116
+ _ => {
117
+ return Err(MagnusError::new(
118
+ magnus::exception::type_error(),
119
+ format!("Unknown key '{}' in type definition", key),
120
+ ))
121
+ }
122
+ }
123
+ }
122
124
 
123
- schema.push(SchemaField {
124
- name,
125
- type_,
126
- format,
127
- });
128
- }
125
+ let type_str = type_str.ok_or_else(|| {
126
+ MagnusError::new(
127
+ magnus::exception::type_error(),
128
+ "Missing 'type' in type definition",
129
+ )
130
+ })?;
131
+
132
+ (ParquetSchemaType::try_convert(type_str)?, format_str)
133
+ } else {
134
+ (ParquetSchemaType::try_convert(type_value.clone())?, None)
135
+ };
136
+
137
+ schema.push(SchemaField {
138
+ name,
139
+ type_,
140
+ format,
141
+ });
142
+ }
143
+
144
+ schema
145
+ };
129
146
 
130
147
  Ok(ParquetWriteArgs {
131
148
  read_from,
@@ -396,6 +413,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
396
413
  MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
397
414
  })?;
398
415
 
416
+ // Batch array must be an array of arrays. Check that the first value in `batch_array` is an array.
417
+ batch_array.entry::<RArray>(0).map_err(|_| {
418
+ MagnusError::new(
419
+ ruby.exception_type_error(),
420
+ "When writing columns, data must be formatted as batches of columns: [[batch1_col1, batch1_col2], [batch2_col1, batch2_col2]].",
421
+ )
422
+ })?;
423
+
399
424
  // Validate batch length matches schema
400
425
  if batch_array.len() != schema.len() {
401
426
  return Err(MagnusError::new(
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-20 00:00:00.000000000 Z
11
+ date: 2025-02-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys