parquet 0.2.6 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 794d11142b73d13b665ecdb4ffd46df6ab7d97e5f99336e2bc91b79dbb55a514
4
- data.tar.gz: eb2843d724e7aad70445a8b992a527e3bee0a79055fdeab7f2ebd2cdfb6247d6
3
+ metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
4
+ data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
5
5
  SHA512:
6
- metadata.gz: 8b97550fb18f2ab4db0b5fbb170d12448237665d9372242d4027760f1c697be0d1e7a8bb47d43886f704e0923ddf57544961fe5af29c596b49aac188f714b9e6
7
- data.tar.gz: 1ea56a23e39a084d40690d4e7bd108ec2a4cb20b61714bd564e68600d3f3edda3ffd5c3e646d49d4bb85632ad14f2c7d5735e645610e7a863d9e25d6f1d2b90d
6
+ metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
7
+ data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
40
40
  }
41
41
 
42
42
  #[derive(Debug, Clone)]
43
- pub struct ListField {
44
- pub item_type: ParquetSchemaType,
43
+ pub struct ListField<'a> {
44
+ pub item_type: ParquetSchemaType<'a>,
45
+ pub format: Option<&'a str>,
45
46
  }
46
47
 
47
48
  #[derive(Debug, Clone)]
48
- pub struct MapField {
49
- pub key_type: ParquetSchemaType,
50
- pub value_type: ParquetSchemaType,
49
+ pub struct MapField<'a> {
50
+ pub key_type: ParquetSchemaType<'a>,
51
+ pub value_type: ParquetSchemaType<'a>,
52
+ pub format: Option<&'a str>,
51
53
  }
52
54
 
53
55
  #[derive(Debug, Clone)]
54
- pub enum ParquetSchemaType {
56
+ pub enum ParquetSchemaType<'a> {
55
57
  Int8,
56
58
  Int16,
57
59
  Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
68
70
  Date32,
69
71
  TimestampMillis,
70
72
  TimestampMicros,
71
- List(Box<ListField>),
72
- Map(Box<MapField>),
73
+ List(Box<ListField<'a>>),
74
+ Map(Box<MapField<'a>>),
73
75
  }
@@ -215,15 +215,15 @@ impl ParquetValue {
215
215
  Ok(ParquetValue::Boolean(v))
216
216
  }
217
217
  ParquetSchemaType::Date32 => {
218
- let v = convert_to_date32(value)?;
218
+ let v = convert_to_date32(value, None)?;
219
219
  Ok(ParquetValue::Date32(v))
220
220
  }
221
221
  ParquetSchemaType::TimestampMillis => {
222
- let v = convert_to_timestamp_millis(value)?;
222
+ let v = convert_to_timestamp_millis(value, None)?;
223
223
  Ok(ParquetValue::TimestampMillis(v, None))
224
224
  }
225
225
  ParquetSchemaType::TimestampMicros => {
226
- let v = convert_to_timestamp_micros(value)?;
226
+ let v = convert_to_timestamp_micros(value, None)?;
227
227
  Ok(ParquetValue::TimestampMicros(v, None))
228
228
  }
229
229
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
@@ -30,17 +30,35 @@ where
30
30
  }
31
31
  }
32
32
 
33
- pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
33
+ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
34
34
  let ruby = unsafe { Ruby::get_unchecked() };
35
35
  if value.is_kind_of(ruby.class_string()) {
36
36
  let s = String::try_convert(value)?;
37
- // Parse string into Timestamp using jiff
38
- let date: jiff::civil::Date = s.parse().map_err(|e| {
39
- MagnusError::new(
40
- magnus::exception::type_error(),
41
- format!("Failed to parse '{}' as date32: {}", s, e),
42
- )
43
- })?;
37
+ // Parse string into Date using jiff
38
+ let date = if let Some(fmt) = format {
39
+ jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
40
+ // Try parsing as DateTime and convert to Date with zero offset
41
+ jiff::civil::DateTime::strptime(&fmt, &s)
42
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
43
+ .map(|dt| dt.date())
44
+ .map_err(|e2| {
45
+ MagnusError::new(
46
+ magnus::exception::type_error(),
47
+ format!(
48
+ "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
49
+ s, fmt, e1, e2
50
+ ),
51
+ )
52
+ })
53
+ })?
54
+ } else {
55
+ s.parse().map_err(|e| {
56
+ MagnusError::new(
57
+ magnus::exception::type_error(),
58
+ format!("Failed to parse '{}' as date32: {}", s, e),
59
+ )
60
+ })?
61
+ };
44
62
 
45
63
  let timestamp = date.at(0, 0, 0, 0);
46
64
 
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
63
81
  }
64
82
  }
65
83
 
66
- pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
84
+ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
67
85
  let ruby = unsafe { Ruby::get_unchecked() };
68
86
  if value.is_kind_of(ruby.class_string()) {
69
87
  let s = String::try_convert(value)?;
70
88
  // Parse string into Timestamp using jiff
71
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
72
- MagnusError::new(
73
- magnus::exception::type_error(),
74
- format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
75
- )
76
- })?;
89
+ let timestamp = if let Some(fmt) = format {
90
+ jiff::Timestamp::strptime(&fmt, &s)
91
+ .or_else(|e1| {
92
+ // Try parsing as DateTime and convert to Timestamp with zero offset
93
+ jiff::civil::DateTime::strptime(&fmt, &s)
94
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
95
+ .map(|dt| dt.timestamp())
96
+ .map_err(|e2| {
97
+ MagnusError::new(
98
+ magnus::exception::type_error(),
99
+ format!(
100
+ "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
101
+ s, fmt, e1, e2
102
+ ),
103
+ )
104
+ })
105
+ })?
106
+ } else {
107
+ s.parse().map_err(|e| {
108
+ MagnusError::new(
109
+ magnus::exception::type_error(),
110
+ format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
111
+ )
112
+ })?
113
+ };
77
114
  // Convert to milliseconds
78
115
  Ok(timestamp.as_millisecond())
79
116
  } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
91
128
  }
92
129
  }
93
130
 
94
- pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
131
+ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
95
132
  let ruby = unsafe { Ruby::get_unchecked() };
96
133
  if value.is_kind_of(ruby.class_string()) {
97
134
  let s = String::try_convert(value)?;
98
135
  // Parse string into Timestamp using jiff
99
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
100
- MagnusError::new(
101
- magnus::exception::type_error(),
102
- format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
103
- )
104
- })?;
136
+ let timestamp = if let Some(fmt) = format {
137
+ jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
138
+ // Try parsing as DateTime and convert to Timestamp with zero offset
139
+ jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
140
+ dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
141
+ })
142
+ .map(|dt| dt.timestamp())
143
+ .map_err(|e2| {
144
+ MagnusError::new(
145
+ magnus::exception::type_error(),
146
+ format!(
147
+ "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
148
+ s, fmt, e1, e2
149
+ ),
150
+ )
151
+ })
152
+ })?
153
+ } else {
154
+ s.parse().map_err(|e| {
155
+ MagnusError::new(
156
+ magnus::exception::type_error(),
157
+ format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
158
+ )
159
+ })?
160
+ };
105
161
  // Convert to microseconds
106
162
  Ok(timestamp.as_microsecond())
107
163
  } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
204
260
  ParquetValue::Boolean(v)
205
261
  }
206
262
  ParquetSchemaType::Date32 => {
207
- let v = convert_to_date32(item_value)?;
263
+ let v = convert_to_date32(item_value, list_field.format)?;
208
264
  ParquetValue::Date32(v)
209
265
  }
210
266
  ParquetSchemaType::TimestampMillis => {
211
- let v = convert_to_timestamp_millis(item_value)?;
267
+ let v = convert_to_timestamp_millis(item_value, list_field.format)?;
212
268
  ParquetValue::TimestampMillis(v, None)
213
269
  }
214
270
  ParquetSchemaType::TimestampMicros => {
215
- let v = convert_to_timestamp_micros(item_value)?;
271
+ let v = convert_to_timestamp_micros(item_value, list_field.format)?;
216
272
  ParquetValue::TimestampMicros(v, None)
217
273
  }
218
274
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
310
366
  ParquetValue::Boolean(v)
311
367
  }
312
368
  ParquetSchemaType::Date32 => {
313
- let v = convert_to_date32(value)?;
369
+ let v = convert_to_date32(value, map_field.format)?;
314
370
  ParquetValue::Date32(v)
315
371
  }
316
372
  ParquetSchemaType::TimestampMillis => {
317
- let v = convert_to_timestamp_millis(value)?;
373
+ let v = convert_to_timestamp_millis(value, map_field.format)?;
318
374
  ParquetValue::TimestampMillis(v, None)
319
375
  }
320
376
  ParquetSchemaType::TimestampMicros => {
321
- let v = convert_to_timestamp_micros(value)?;
377
+ let v = convert_to_timestamp_micros(value, map_field.format)?;
322
378
  ParquetValue::TimestampMicros(v, None)
323
379
  }
324
380
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
12
12
  use crate::types::{ListField, MapField, ParquetSchemaType};
13
13
 
14
14
  #[derive(Debug)]
15
- pub struct SchemaField {
15
+ pub struct SchemaField<'a> {
16
16
  pub name: String,
17
- pub type_: ParquetSchemaType,
17
+ pub type_: ParquetSchemaType<'a>,
18
+ pub format: Option<String>,
18
19
  }
19
20
 
20
21
  #[derive(Debug)]
21
- pub struct ParquetWriteArgs {
22
+ pub struct ParquetWriteArgs<'a> {
22
23
  pub read_from: Value,
23
24
  pub write_to: Value,
24
- pub schema: Vec<SchemaField>,
25
+ pub schema: Vec<SchemaField<'a>>,
25
26
  pub batch_size: Option<usize>,
26
27
  }
27
28
 
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
51
52
  }
52
53
  }
53
54
 
54
- impl FromStr for ParquetSchemaType {
55
+ impl<'a> FromStr for ParquetSchemaType<'a> {
55
56
  type Err = MagnusError;
56
57
 
57
58
  fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
74
75
  "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
75
76
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
76
77
  item_type: ParquetSchemaType::Int8,
78
+ format: None,
77
79
  }))),
78
80
  "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
79
81
  key_type: ParquetSchemaType::String,
80
82
  value_type: ParquetSchemaType::Int8,
83
+ format: None,
81
84
  }))),
82
85
  _ => Err(MagnusError::new(
83
86
  magnus::exception::runtime_error(),
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
87
90
  }
88
91
  }
89
92
 
90
- impl TryConvert for ParquetSchemaType {
93
+ impl<'a> TryConvert for ParquetSchemaType<'a> {
91
94
  fn try_convert(value: Value) -> Result<Self, MagnusError> {
92
95
  let ruby = unsafe { Ruby::get_unchecked() };
93
96
  let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
98
101
 
99
102
  // We know this type is safe to move between threads because it's just an enum
100
103
  // with simple primitive types and strings
101
- unsafe impl Send for ParquetSchemaType {}
104
+ unsafe impl<'a> Send for ParquetSchemaType<'a> {}
102
105
 
103
106
  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
104
107
  if value.is_nil() {
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
162
165
  }
163
166
  }
164
167
 
165
- pub struct ColumnCollector {
168
+ pub struct ColumnCollector<'a> {
166
169
  pub name: String,
167
- pub type_: ParquetSchemaType,
170
+ pub type_: ParquetSchemaType<'a>,
171
+ pub format: Option<String>,
168
172
  pub values: Vec<crate::types::ParquetValue>,
169
173
  }
170
174
 
171
- impl ColumnCollector {
172
- pub fn new(name: String, type_: ParquetSchemaType) -> Self {
175
+ impl<'a> ColumnCollector<'a> {
176
+ pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
173
177
  Self {
174
178
  name,
175
179
  type_,
180
+ format,
176
181
  values: Vec::new(),
177
182
  }
178
183
  }
@@ -244,15 +249,15 @@ impl ColumnCollector {
244
249
  ParquetValue::Boolean(v)
245
250
  }
246
251
  ParquetSchemaType::Date32 => {
247
- let v = convert_to_date32(value)?;
252
+ let v = convert_to_date32(value, self.format.as_deref())?;
248
253
  ParquetValue::Date32(v)
249
254
  }
250
255
  ParquetSchemaType::TimestampMillis => {
251
- let v = convert_to_timestamp_millis(value)?;
256
+ let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
252
257
  ParquetValue::TimestampMillis(v, None)
253
258
  }
254
259
  ParquetSchemaType::TimestampMicros => {
255
- let v = convert_to_timestamp_micros(value)?;
260
+ let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
256
261
  ParquetValue::TimestampMicros(v, None)
257
262
  }
258
263
  ParquetSchemaType::List(list_field) => {
@@ -59,11 +59,45 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
59
59
  ));
60
60
  }
61
61
 
62
- let (name, type_str) = &entries[0];
62
+ let (name, type_value) = &entries[0];
63
63
  let name = String::try_convert(name.clone())?;
64
- let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
65
64
 
66
- schema.push(SchemaField { name, type_ });
65
+ let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
66
+ let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
67
+ let mut type_str = None;
68
+ let mut format_str = None;
69
+
70
+ for (key, value) in type_hash {
71
+ let key = String::try_convert(key)?;
72
+ match key.as_str() {
73
+ "type" => type_str = Some(value),
74
+ "format" => format_str = Some(String::try_convert(value)?),
75
+ _ => {
76
+ return Err(MagnusError::new(
77
+ magnus::exception::type_error(),
78
+ format!("Unknown key '{}' in type definition", key),
79
+ ))
80
+ }
81
+ }
82
+ }
83
+
84
+ let type_str = type_str.ok_or_else(|| {
85
+ MagnusError::new(
86
+ magnus::exception::type_error(),
87
+ "Missing 'type' in type definition",
88
+ )
89
+ })?;
90
+
91
+ (ParquetSchemaType::try_convert(type_str)?, format_str)
92
+ } else {
93
+ (ParquetSchemaType::try_convert(type_value.clone())?, None)
94
+ };
95
+
96
+ schema.push(SchemaField {
97
+ name,
98
+ type_,
99
+ format,
100
+ });
67
101
  }
68
102
 
69
103
  Ok(ParquetWriteArgs {
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
130
164
  // Create collectors for each column
131
165
  let mut column_collectors: Vec<ColumnCollector> = schema
132
166
  .into_iter()
133
- .map(|field| ColumnCollector::new(field.name, field.type_))
167
+ .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
134
168
  .collect();
135
169
 
136
170
  let mut rows_in_batch = 0;
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.6"
2
+ VERSION = "0.2.7"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,4 @@
1
- # typed: strict
1
+ # typed: true
2
2
 
3
3
  module Parquet
4
4
  # Options:
@@ -7,13 +7,20 @@ module Parquet
7
7
  # ("hash" or "array" or :hash or :array)
8
8
  # - `columns`: When present, only the specified columns will be included in the output.
9
9
  # This is useful for reducing how much data is read and improving performance.
10
+ sig do
11
+ params(
12
+ input: T.any(String, File, StringIO, IO),
13
+ result_type: T.nilable(T.any(String, Symbol)),
14
+ columns: T.nilable(T::Array[String])
15
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
16
+ end
10
17
  sig do
11
18
  params(
12
19
  input: T.any(String, File, StringIO, IO),
13
20
  result_type: T.nilable(T.any(String, Symbol)),
14
21
  columns: T.nilable(T::Array[String]),
15
22
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
16
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
23
+ ).returns(NilClass)
17
24
  end
18
25
  def self.each_row(input, result_type: nil, columns: nil, &blk)
19
26
  end
@@ -24,6 +31,14 @@ module Parquet
24
31
  # ("hash" or "array" or :hash or :array)
25
32
  # - `columns`: When present, only the specified columns will be included in the output.
26
33
  # - `batch_size`: When present, specifies the number of rows per batch
34
+ sig do
35
+ params(
36
+ input: T.any(String, File, StringIO, IO),
37
+ result_type: T.nilable(T.any(String, Symbol)),
38
+ columns: T.nilable(T::Array[String]),
39
+ batch_size: T.nilable(Integer)
40
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
41
+ end
27
42
  sig do
28
43
  params(
29
44
  input: T.any(String, File, StringIO, IO),
@@ -32,14 +47,22 @@ module Parquet
32
47
  batch_size: T.nilable(Integer),
33
48
  blk:
34
49
  T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
35
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
50
+ ).returns(NilClass)
36
51
  end
37
52
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
38
53
  end
39
54
 
40
55
  # Options:
41
56
  # - `read_from`: An Enumerator yielding arrays of values representing each row
42
- # - `schema`: Array of hashes specifying column names and types
57
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
58
+ # - `int8`, `int16`, `int32`, `int64`
59
+ # - `uint8`, `uint16`, `uint32`, `uint64`
60
+ # - `float`, `double`
61
+ # - `string`
62
+ # - `binary`
63
+ # - `boolean`
64
+ # - `date32`
65
+ # - `timestamp_millis`, `timestamp_micros`
43
66
  # - `write_to`: String path or IO object to write the parquet file to
44
67
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
45
68
  sig do
@@ -55,7 +78,16 @@ module Parquet
55
78
 
56
79
  # Options:
57
80
  # - `read_from`: An Enumerator yielding arrays of column batches
58
- # - `schema`: Array of hashes specifying column names and types
81
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
82
+ # - `int8`, `int16`, `int32`, `int64`
83
+ # - `uint8`, `uint16`, `uint32`, `uint64`
84
+ # - `float`, `double`
85
+ # - `string`
86
+ # - `binary`
87
+ # - `boolean`
88
+ # - `date32`
89
+ # - `timestamp_millis`, `timestamp_micros`
90
+ # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
59
91
  # - `write_to`: String path or IO object to write the parquet file to
60
92
  sig do
61
93
  params(
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko