parquet 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
4
- data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
3
+ metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
4
+ data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
5
5
  SHA512:
6
- metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
7
- data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
6
+ metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
7
+ data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
40
40
  }
41
41
 
42
42
  #[derive(Debug, Clone)]
43
- pub struct ListField {
44
- pub item_type: ParquetSchemaType,
43
+ pub struct ListField<'a> {
44
+ pub item_type: ParquetSchemaType<'a>,
45
+ pub format: Option<&'a str>,
45
46
  }
46
47
 
47
48
  #[derive(Debug, Clone)]
48
- pub struct MapField {
49
- pub key_type: ParquetSchemaType,
50
- pub value_type: ParquetSchemaType,
49
+ pub struct MapField<'a> {
50
+ pub key_type: ParquetSchemaType<'a>,
51
+ pub value_type: ParquetSchemaType<'a>,
52
+ pub format: Option<&'a str>,
51
53
  }
52
54
 
53
55
  #[derive(Debug, Clone)]
54
- pub enum ParquetSchemaType {
56
+ pub enum ParquetSchemaType<'a> {
55
57
  Int8,
56
58
  Int16,
57
59
  Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
68
70
  Date32,
69
71
  TimestampMillis,
70
72
  TimestampMicros,
71
- List(Box<ListField>),
72
- Map(Box<MapField>),
73
+ List(Box<ListField<'a>>),
74
+ Map(Box<MapField<'a>>),
73
75
  }
@@ -157,6 +157,10 @@ impl IntoValue for ParquetValue {
157
157
 
158
158
  impl ParquetValue {
159
159
  pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
160
+ if value.is_nil() {
161
+ return Ok(ParquetValue::Null);
162
+ }
163
+
160
164
  match type_ {
161
165
  ParquetSchemaType::Int8 => {
162
166
  let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -211,15 +215,15 @@ impl ParquetValue {
211
215
  Ok(ParquetValue::Boolean(v))
212
216
  }
213
217
  ParquetSchemaType::Date32 => {
214
- let v = convert_to_date32(value)?;
218
+ let v = convert_to_date32(value, None)?;
215
219
  Ok(ParquetValue::Date32(v))
216
220
  }
217
221
  ParquetSchemaType::TimestampMillis => {
218
- let v = convert_to_timestamp_millis(value)?;
222
+ let v = convert_to_timestamp_millis(value, None)?;
219
223
  Ok(ParquetValue::TimestampMillis(v, None))
220
224
  }
221
225
  ParquetSchemaType::TimestampMicros => {
222
- let v = convert_to_timestamp_micros(value)?;
226
+ let v = convert_to_timestamp_micros(value, None)?;
223
227
  Ok(ParquetValue::TimestampMicros(v, None))
224
228
  }
225
229
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
@@ -30,17 +30,35 @@ where
30
30
  }
31
31
  }
32
32
 
33
- pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
33
+ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
34
34
  let ruby = unsafe { Ruby::get_unchecked() };
35
35
  if value.is_kind_of(ruby.class_string()) {
36
36
  let s = String::try_convert(value)?;
37
- // Parse string into Timestamp using jiff
38
- let date: jiff::civil::Date = s.parse().map_err(|e| {
39
- MagnusError::new(
40
- magnus::exception::type_error(),
41
- format!("Failed to parse '{}' as date32: {}", s, e),
42
- )
43
- })?;
37
+ // Parse string into Date using jiff
38
+ let date = if let Some(fmt) = format {
39
+ jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
40
+ // Try parsing as DateTime and convert to Date with zero offset
41
+ jiff::civil::DateTime::strptime(&fmt, &s)
42
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
43
+ .map(|dt| dt.date())
44
+ .map_err(|e2| {
45
+ MagnusError::new(
46
+ magnus::exception::type_error(),
47
+ format!(
48
+ "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
49
+ s, fmt, e1, e2
50
+ ),
51
+ )
52
+ })
53
+ })?
54
+ } else {
55
+ s.parse().map_err(|e| {
56
+ MagnusError::new(
57
+ magnus::exception::type_error(),
58
+ format!("Failed to parse '{}' as date32: {}", s, e),
59
+ )
60
+ })?
61
+ };
44
62
 
45
63
  let timestamp = date.at(0, 0, 0, 0);
46
64
 
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
63
81
  }
64
82
  }
65
83
 
66
- pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
84
+ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
67
85
  let ruby = unsafe { Ruby::get_unchecked() };
68
86
  if value.is_kind_of(ruby.class_string()) {
69
87
  let s = String::try_convert(value)?;
70
88
  // Parse string into Timestamp using jiff
71
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
72
- MagnusError::new(
73
- magnus::exception::type_error(),
74
- format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
75
- )
76
- })?;
89
+ let timestamp = if let Some(fmt) = format {
90
+ jiff::Timestamp::strptime(&fmt, &s)
91
+ .or_else(|e1| {
92
+ // Try parsing as DateTime and convert to Timestamp with zero offset
93
+ jiff::civil::DateTime::strptime(&fmt, &s)
94
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
95
+ .map(|dt| dt.timestamp())
96
+ .map_err(|e2| {
97
+ MagnusError::new(
98
+ magnus::exception::type_error(),
99
+ format!(
100
+ "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
101
+ s, fmt, e1, e2
102
+ ),
103
+ )
104
+ })
105
+ })?
106
+ } else {
107
+ s.parse().map_err(|e| {
108
+ MagnusError::new(
109
+ magnus::exception::type_error(),
110
+ format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
111
+ )
112
+ })?
113
+ };
77
114
  // Convert to milliseconds
78
115
  Ok(timestamp.as_millisecond())
79
116
  } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
91
128
  }
92
129
  }
93
130
 
94
- pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
131
+ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
95
132
  let ruby = unsafe { Ruby::get_unchecked() };
96
133
  if value.is_kind_of(ruby.class_string()) {
97
134
  let s = String::try_convert(value)?;
98
135
  // Parse string into Timestamp using jiff
99
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
100
- MagnusError::new(
101
- magnus::exception::type_error(),
102
- format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
103
- )
104
- })?;
136
+ let timestamp = if let Some(fmt) = format {
137
+ jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
138
+ // Try parsing as DateTime and convert to Timestamp with zero offset
139
+ jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
140
+ dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
141
+ })
142
+ .map(|dt| dt.timestamp())
143
+ .map_err(|e2| {
144
+ MagnusError::new(
145
+ magnus::exception::type_error(),
146
+ format!(
147
+ "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
148
+ s, fmt, e1, e2
149
+ ),
150
+ )
151
+ })
152
+ })?
153
+ } else {
154
+ s.parse().map_err(|e| {
155
+ MagnusError::new(
156
+ magnus::exception::type_error(),
157
+ format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
158
+ )
159
+ })?
160
+ };
105
161
  // Convert to microseconds
106
162
  Ok(timestamp.as_microsecond())
107
163
  } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
204
260
  ParquetValue::Boolean(v)
205
261
  }
206
262
  ParquetSchemaType::Date32 => {
207
- let v = convert_to_date32(item_value)?;
263
+ let v = convert_to_date32(item_value, list_field.format)?;
208
264
  ParquetValue::Date32(v)
209
265
  }
210
266
  ParquetSchemaType::TimestampMillis => {
211
- let v = convert_to_timestamp_millis(item_value)?;
267
+ let v = convert_to_timestamp_millis(item_value, list_field.format)?;
212
268
  ParquetValue::TimestampMillis(v, None)
213
269
  }
214
270
  ParquetSchemaType::TimestampMicros => {
215
- let v = convert_to_timestamp_micros(item_value)?;
271
+ let v = convert_to_timestamp_micros(item_value, list_field.format)?;
216
272
  ParquetValue::TimestampMicros(v, None)
217
273
  }
218
274
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
310
366
  ParquetValue::Boolean(v)
311
367
  }
312
368
  ParquetSchemaType::Date32 => {
313
- let v = convert_to_date32(value)?;
369
+ let v = convert_to_date32(value, map_field.format)?;
314
370
  ParquetValue::Date32(v)
315
371
  }
316
372
  ParquetSchemaType::TimestampMillis => {
317
- let v = convert_to_timestamp_millis(value)?;
373
+ let v = convert_to_timestamp_millis(value, map_field.format)?;
318
374
  ParquetValue::TimestampMillis(v, None)
319
375
  }
320
376
  ParquetSchemaType::TimestampMicros => {
321
- let v = convert_to_timestamp_micros(value)?;
377
+ let v = convert_to_timestamp_micros(value, map_field.format)?;
322
378
  ParquetValue::TimestampMicros(v, None)
323
379
  }
324
380
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -592,12 +648,12 @@ pub fn convert_parquet_values_to_arrow(
592
648
  };
593
649
 
594
650
  let mut list_builder = ListBuilder::new(value_builder);
651
+
595
652
  for value in values {
596
653
  match value {
597
654
  ParquetValue::List(items) => {
598
- list_builder.append(true);
599
655
  for item in items {
600
- match list_field.item_type {
656
+ match &list_field.item_type {
601
657
  ParquetSchemaType::Int8 => append_list_value_copy!(
602
658
  list_builder,
603
659
  ParquetSchemaType::Int8,
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
12
12
  use crate::types::{ListField, MapField, ParquetSchemaType};
13
13
 
14
14
  #[derive(Debug)]
15
- pub struct SchemaField {
15
+ pub struct SchemaField<'a> {
16
16
  pub name: String,
17
- pub type_: ParquetSchemaType,
17
+ pub type_: ParquetSchemaType<'a>,
18
+ pub format: Option<String>,
18
19
  }
19
20
 
20
21
  #[derive(Debug)]
21
- pub struct ParquetWriteArgs {
22
+ pub struct ParquetWriteArgs<'a> {
22
23
  pub read_from: Value,
23
24
  pub write_to: Value,
24
- pub schema: Vec<SchemaField>,
25
+ pub schema: Vec<SchemaField<'a>>,
25
26
  pub batch_size: Option<usize>,
26
27
  }
27
28
 
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
51
52
  }
52
53
  }
53
54
 
54
- impl FromStr for ParquetSchemaType {
55
+ impl<'a> FromStr for ParquetSchemaType<'a> {
55
56
  type Err = MagnusError;
56
57
 
57
58
  fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
74
75
  "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
75
76
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
76
77
  item_type: ParquetSchemaType::Int8,
78
+ format: None,
77
79
  }))),
78
80
  "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
79
81
  key_type: ParquetSchemaType::String,
80
82
  value_type: ParquetSchemaType::Int8,
83
+ format: None,
81
84
  }))),
82
85
  _ => Err(MagnusError::new(
83
86
  magnus::exception::runtime_error(),
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
87
90
  }
88
91
  }
89
92
 
90
- impl TryConvert for ParquetSchemaType {
93
+ impl<'a> TryConvert for ParquetSchemaType<'a> {
91
94
  fn try_convert(value: Value) -> Result<Self, MagnusError> {
92
95
  let ruby = unsafe { Ruby::get_unchecked() };
93
96
  let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
98
101
 
99
102
  // We know this type is safe to move between threads because it's just an enum
100
103
  // with simple primitive types and strings
101
- unsafe impl Send for ParquetSchemaType {}
104
+ unsafe impl<'a> Send for ParquetSchemaType<'a> {}
102
105
 
103
106
  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
104
107
  if value.is_nil() {
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
162
165
  }
163
166
  }
164
167
 
165
- pub struct ColumnCollector {
168
+ pub struct ColumnCollector<'a> {
166
169
  pub name: String,
167
- pub type_: ParquetSchemaType,
170
+ pub type_: ParquetSchemaType<'a>,
171
+ pub format: Option<String>,
168
172
  pub values: Vec<crate::types::ParquetValue>,
169
173
  }
170
174
 
171
- impl ColumnCollector {
172
- pub fn new(name: String, type_: ParquetSchemaType) -> Self {
175
+ impl<'a> ColumnCollector<'a> {
176
+ pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
173
177
  Self {
174
178
  name,
175
179
  type_,
180
+ format,
176
181
  values: Vec::new(),
177
182
  }
178
183
  }
@@ -185,6 +190,11 @@ impl ColumnCollector {
185
190
  NumericConverter,
186
191
  };
187
192
 
193
+ if value.is_nil() {
194
+ self.values.push(ParquetValue::Null);
195
+ return Ok(());
196
+ }
197
+
188
198
  let parquet_value = match &self.type_ {
189
199
  ParquetSchemaType::Int8 => {
190
200
  let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -239,15 +249,15 @@ impl ColumnCollector {
239
249
  ParquetValue::Boolean(v)
240
250
  }
241
251
  ParquetSchemaType::Date32 => {
242
- let v = convert_to_date32(value)?;
252
+ let v = convert_to_date32(value, self.format.as_deref())?;
243
253
  ParquetValue::Date32(v)
244
254
  }
245
255
  ParquetSchemaType::TimestampMillis => {
246
- let v = convert_to_timestamp_millis(value)?;
256
+ let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
247
257
  ParquetValue::TimestampMillis(v, None)
248
258
  }
249
259
  ParquetSchemaType::TimestampMicros => {
250
- let v = convert_to_timestamp_micros(value)?;
260
+ let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
251
261
  ParquetValue::TimestampMicros(v, None)
252
262
  }
253
263
  ParquetSchemaType::List(list_field) => {
@@ -39,7 +39,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
39
39
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
40
40
  let (to_read,) = parsed_args.required;
41
41
 
42
- let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
42
+ let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
43
43
  parsed_args.keywords,
44
44
  &[],
45
45
  &["result_type", "columns"],
@@ -48,6 +48,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
48
48
  let result_type: ParserResultType = match kwargs
49
49
  .optional
50
50
  .0
51
+ .flatten()
51
52
  .map(|value| parse_string_or_symbol(ruby, value))
52
53
  {
53
54
  Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -75,7 +76,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
75
76
  Ok(ParquetRowsArgs {
76
77
  to_read,
77
78
  result_type,
78
- columns: kwargs.optional.1,
79
+ columns: kwargs.optional.1.flatten(),
79
80
  })
80
81
  }
81
82
 
@@ -95,7 +96,16 @@ pub fn parse_parquet_columns_args(
95
96
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
96
97
  let (to_read,) = parsed_args.required;
97
98
 
98
- let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
99
+ let kwargs = get_kwargs::<
100
+ _,
101
+ (),
102
+ (
103
+ Option<Option<Value>>,
104
+ Option<Option<Vec<String>>>,
105
+ Option<Option<usize>>,
106
+ ),
107
+ (),
108
+ >(
99
109
  parsed_args.keywords,
100
110
  &[],
101
111
  &["result_type", "columns", "batch_size"],
@@ -104,6 +114,7 @@ pub fn parse_parquet_columns_args(
104
114
  let result_type: ParserResultType = match kwargs
105
115
  .optional
106
116
  .0
117
+ .flatten()
107
118
  .map(|value| parse_string_or_symbol(ruby, value))
108
119
  {
109
120
  Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -131,7 +142,7 @@ pub fn parse_parquet_columns_args(
131
142
  Ok(ParquetColumnsArgs {
132
143
  to_read,
133
144
  result_type,
134
- columns: kwargs.optional.1,
135
- batch_size: kwargs.optional.2,
145
+ columns: kwargs.optional.1.flatten(),
146
+ batch_size: kwargs.optional.2.flatten(),
136
147
  })
137
148
  }
@@ -28,7 +28,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
28
28
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
29
29
  let (read_from,) = parsed_args.required;
30
30
 
31
- let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
31
+ let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
32
32
  parsed_args.keywords,
33
33
  &["schema", "write_to"],
34
34
  &["batch_size"],
@@ -59,18 +59,52 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
59
59
  ));
60
60
  }
61
61
 
62
- let (name, type_str) = &entries[0];
62
+ let (name, type_value) = &entries[0];
63
63
  let name = String::try_convert(name.clone())?;
64
- let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
65
64
 
66
- schema.push(SchemaField { name, type_ });
65
+ let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
66
+ let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
67
+ let mut type_str = None;
68
+ let mut format_str = None;
69
+
70
+ for (key, value) in type_hash {
71
+ let key = String::try_convert(key)?;
72
+ match key.as_str() {
73
+ "type" => type_str = Some(value),
74
+ "format" => format_str = Some(String::try_convert(value)?),
75
+ _ => {
76
+ return Err(MagnusError::new(
77
+ magnus::exception::type_error(),
78
+ format!("Unknown key '{}' in type definition", key),
79
+ ))
80
+ }
81
+ }
82
+ }
83
+
84
+ let type_str = type_str.ok_or_else(|| {
85
+ MagnusError::new(
86
+ magnus::exception::type_error(),
87
+ "Missing 'type' in type definition",
88
+ )
89
+ })?;
90
+
91
+ (ParquetSchemaType::try_convert(type_str)?, format_str)
92
+ } else {
93
+ (ParquetSchemaType::try_convert(type_value.clone())?, None)
94
+ };
95
+
96
+ schema.push(SchemaField {
97
+ name,
98
+ type_,
99
+ format,
100
+ });
67
101
  }
68
102
 
69
103
  Ok(ParquetWriteArgs {
70
104
  read_from,
71
105
  write_to: kwargs.required.1,
72
106
  schema,
73
- batch_size: kwargs.optional.0,
107
+ batch_size: kwargs.optional.0.flatten(),
74
108
  })
75
109
  }
76
110
 
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
130
164
  // Create collectors for each column
131
165
  let mut column_collectors: Vec<ColumnCollector> = schema
132
166
  .into_iter()
133
- .map(|field| ColumnCollector::new(field.name, field.type_))
167
+ .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
134
168
  .collect();
135
169
 
136
170
  let mut rows_in_batch = 0;
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.7"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,4 @@
1
- # typed: strict
1
+ # typed: true
2
2
 
3
3
  module Parquet
4
4
  # Options:
@@ -7,13 +7,20 @@ module Parquet
7
7
  # ("hash" or "array" or :hash or :array)
8
8
  # - `columns`: When present, only the specified columns will be included in the output.
9
9
  # This is useful for reducing how much data is read and improving performance.
10
+ sig do
11
+ params(
12
+ input: T.any(String, File, StringIO, IO),
13
+ result_type: T.nilable(T.any(String, Symbol)),
14
+ columns: T.nilable(T::Array[String])
15
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
16
+ end
10
17
  sig do
11
18
  params(
12
19
  input: T.any(String, File, StringIO, IO),
13
20
  result_type: T.nilable(T.any(String, Symbol)),
14
21
  columns: T.nilable(T::Array[String]),
15
22
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
16
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
23
+ ).returns(NilClass)
17
24
  end
18
25
  def self.each_row(input, result_type: nil, columns: nil, &blk)
19
26
  end
@@ -24,6 +31,14 @@ module Parquet
24
31
  # ("hash" or "array" or :hash or :array)
25
32
  # - `columns`: When present, only the specified columns will be included in the output.
26
33
  # - `batch_size`: When present, specifies the number of rows per batch
34
+ sig do
35
+ params(
36
+ input: T.any(String, File, StringIO, IO),
37
+ result_type: T.nilable(T.any(String, Symbol)),
38
+ columns: T.nilable(T::Array[String]),
39
+ batch_size: T.nilable(Integer)
40
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
41
+ end
27
42
  sig do
28
43
  params(
29
44
  input: T.any(String, File, StringIO, IO),
@@ -32,14 +47,22 @@ module Parquet
32
47
  batch_size: T.nilable(Integer),
33
48
  blk:
34
49
  T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
35
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
50
+ ).returns(NilClass)
36
51
  end
37
52
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
38
53
  end
39
54
 
40
55
  # Options:
41
56
  # - `read_from`: An Enumerator yielding arrays of values representing each row
42
- # - `schema`: Array of hashes specifying column names and types
57
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
58
+ # - `int8`, `int16`, `int32`, `int64`
59
+ # - `uint8`, `uint16`, `uint32`, `uint64`
60
+ # - `float`, `double`
61
+ # - `string`
62
+ # - `binary`
63
+ # - `boolean`
64
+ # - `date32`
65
+ # - `timestamp_millis`, `timestamp_micros`
43
66
  # - `write_to`: String path or IO object to write the parquet file to
44
67
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
45
68
  sig do
@@ -55,7 +78,16 @@ module Parquet
55
78
 
56
79
  # Options:
57
80
  # - `read_from`: An Enumerator yielding arrays of column batches
58
- # - `schema`: Array of hashes specifying column names and types
81
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
82
+ # - `int8`, `int16`, `int32`, `int64`
83
+ # - `uint8`, `uint16`, `uint32`, `uint64`
84
+ # - `float`, `double`
85
+ # - `string`
86
+ # - `binary`
87
+ # - `boolean`
88
+ # - `date32`
89
+ # - `timestamp_millis`, `timestamp_micros`
90
+ # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
59
91
  # - `write_to`: String path or IO object to write the parquet file to
60
92
  sig do
61
93
  params(
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko