parquet 0.2.5 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
4
- data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
3
+ metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
4
+ data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
5
5
  SHA512:
6
- metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
7
- data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
6
+ metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
7
+ data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
@@ -40,18 +40,20 @@ impl std::fmt::Display for ParserResultType {
40
40
  }
41
41
 
42
42
  #[derive(Debug, Clone)]
43
- pub struct ListField {
44
- pub item_type: ParquetSchemaType,
43
+ pub struct ListField<'a> {
44
+ pub item_type: ParquetSchemaType<'a>,
45
+ pub format: Option<&'a str>,
45
46
  }
46
47
 
47
48
  #[derive(Debug, Clone)]
48
- pub struct MapField {
49
- pub key_type: ParquetSchemaType,
50
- pub value_type: ParquetSchemaType,
49
+ pub struct MapField<'a> {
50
+ pub key_type: ParquetSchemaType<'a>,
51
+ pub value_type: ParquetSchemaType<'a>,
52
+ pub format: Option<&'a str>,
51
53
  }
52
54
 
53
55
  #[derive(Debug, Clone)]
54
- pub enum ParquetSchemaType {
56
+ pub enum ParquetSchemaType<'a> {
55
57
  Int8,
56
58
  Int16,
57
59
  Int32,
@@ -68,6 +70,6 @@ pub enum ParquetSchemaType {
68
70
  Date32,
69
71
  TimestampMillis,
70
72
  TimestampMicros,
71
- List(Box<ListField>),
72
- Map(Box<MapField>),
73
+ List(Box<ListField<'a>>),
74
+ Map(Box<MapField<'a>>),
73
75
  }
@@ -157,6 +157,10 @@ impl IntoValue for ParquetValue {
157
157
 
158
158
  impl ParquetValue {
159
159
  pub fn from_value(value: Value, type_: &ParquetSchemaType) -> Result<Self, MagnusError> {
160
+ if value.is_nil() {
161
+ return Ok(ParquetValue::Null);
162
+ }
163
+
160
164
  match type_ {
161
165
  ParquetSchemaType::Int8 => {
162
166
  let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -211,15 +215,15 @@ impl ParquetValue {
211
215
  Ok(ParquetValue::Boolean(v))
212
216
  }
213
217
  ParquetSchemaType::Date32 => {
214
- let v = convert_to_date32(value)?;
218
+ let v = convert_to_date32(value, None)?;
215
219
  Ok(ParquetValue::Date32(v))
216
220
  }
217
221
  ParquetSchemaType::TimestampMillis => {
218
- let v = convert_to_timestamp_millis(value)?;
222
+ let v = convert_to_timestamp_millis(value, None)?;
219
223
  Ok(ParquetValue::TimestampMillis(v, None))
220
224
  }
221
225
  ParquetSchemaType::TimestampMicros => {
222
- let v = convert_to_timestamp_micros(value)?;
226
+ let v = convert_to_timestamp_micros(value, None)?;
223
227
  Ok(ParquetValue::TimestampMicros(v, None))
224
228
  }
225
229
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => Err(MagnusError::new(
@@ -30,17 +30,35 @@ where
30
30
  }
31
31
  }
32
32
 
33
- pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
33
+ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, MagnusError> {
34
34
  let ruby = unsafe { Ruby::get_unchecked() };
35
35
  if value.is_kind_of(ruby.class_string()) {
36
36
  let s = String::try_convert(value)?;
37
- // Parse string into Timestamp using jiff
38
- let date: jiff::civil::Date = s.parse().map_err(|e| {
39
- MagnusError::new(
40
- magnus::exception::type_error(),
41
- format!("Failed to parse '{}' as date32: {}", s, e),
42
- )
43
- })?;
37
+ // Parse string into Date using jiff
38
+ let date = if let Some(fmt) = format {
39
+ jiff::civil::Date::strptime(&fmt, &s).or_else(|e1| {
40
+ // Try parsing as DateTime and convert to Date with zero offset
41
+ jiff::civil::DateTime::strptime(&fmt, &s)
42
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
43
+ .map(|dt| dt.date())
44
+ .map_err(|e2| {
45
+ MagnusError::new(
46
+ magnus::exception::type_error(),
47
+ format!(
48
+ "Failed to parse '{}' with format '{}' as date32: {} (and as datetime: {})",
49
+ s, fmt, e1, e2
50
+ ),
51
+ )
52
+ })
53
+ })?
54
+ } else {
55
+ s.parse().map_err(|e| {
56
+ MagnusError::new(
57
+ magnus::exception::type_error(),
58
+ format!("Failed to parse '{}' as date32: {}", s, e),
59
+ )
60
+ })?
61
+ };
44
62
 
45
63
  let timestamp = date.at(0, 0, 0, 0);
46
64
 
@@ -63,17 +81,36 @@ pub fn convert_to_date32(value: Value) -> Result<i32, MagnusError> {
63
81
  }
64
82
  }
65
83
 
66
- pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
84
+ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
67
85
  let ruby = unsafe { Ruby::get_unchecked() };
68
86
  if value.is_kind_of(ruby.class_string()) {
69
87
  let s = String::try_convert(value)?;
70
88
  // Parse string into Timestamp using jiff
71
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
72
- MagnusError::new(
73
- magnus::exception::type_error(),
74
- format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
75
- )
76
- })?;
89
+ let timestamp = if let Some(fmt) = format {
90
+ jiff::Timestamp::strptime(&fmt, &s)
91
+ .or_else(|e1| {
92
+ // Try parsing as DateTime and convert to Timestamp with zero offset
93
+ jiff::civil::DateTime::strptime(&fmt, &s)
94
+ .and_then(|dt| dt.to_zoned(TimeZone::fixed(Offset::constant(0))))
95
+ .map(|dt| dt.timestamp())
96
+ .map_err(|e2| {
97
+ MagnusError::new(
98
+ magnus::exception::type_error(),
99
+ format!(
100
+ "Failed to parse '{}' with format '{}' as timestamp_millis: {} (and as datetime: {})",
101
+ s, fmt, e1, e2
102
+ ),
103
+ )
104
+ })
105
+ })?
106
+ } else {
107
+ s.parse().map_err(|e| {
108
+ MagnusError::new(
109
+ magnus::exception::type_error(),
110
+ format!("Failed to parse '{}' as timestamp_millis: {}", s, e),
111
+ )
112
+ })?
113
+ };
77
114
  // Convert to milliseconds
78
115
  Ok(timestamp.as_millisecond())
79
116
  } else if value.is_kind_of(ruby.class_time()) {
@@ -91,17 +128,36 @@ pub fn convert_to_timestamp_millis(value: Value) -> Result<i64, MagnusError> {
91
128
  }
92
129
  }
93
130
 
94
- pub fn convert_to_timestamp_micros(value: Value) -> Result<i64, MagnusError> {
131
+ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result<i64, MagnusError> {
95
132
  let ruby = unsafe { Ruby::get_unchecked() };
96
133
  if value.is_kind_of(ruby.class_string()) {
97
134
  let s = String::try_convert(value)?;
98
135
  // Parse string into Timestamp using jiff
99
- let timestamp: jiff::Timestamp = s.parse().map_err(|e| {
100
- MagnusError::new(
101
- magnus::exception::type_error(),
102
- format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
103
- )
104
- })?;
136
+ let timestamp = if let Some(fmt) = format {
137
+ jiff::Timestamp::strptime(&fmt, &s).or_else(|e1| {
138
+ // Try parsing as DateTime and convert to Timestamp with zero offset
139
+ jiff::civil::DateTime::strptime(&fmt, &s).and_then(|dt| {
140
+ dt.to_zoned(TimeZone::fixed(Offset::constant(0)))
141
+ })
142
+ .map(|dt| dt.timestamp())
143
+ .map_err(|e2| {
144
+ MagnusError::new(
145
+ magnus::exception::type_error(),
146
+ format!(
147
+ "Failed to parse '{}' with format '{}' as timestamp_micros: {} (and as datetime: {})",
148
+ s, fmt, e1, e2
149
+ ),
150
+ )
151
+ })
152
+ })?
153
+ } else {
154
+ s.parse().map_err(|e| {
155
+ MagnusError::new(
156
+ magnus::exception::type_error(),
157
+ format!("Failed to parse '{}' as timestamp_micros: {}", s, e),
158
+ )
159
+ })?
160
+ };
105
161
  // Convert to microseconds
106
162
  Ok(timestamp.as_microsecond())
107
163
  } else if value.is_kind_of(ruby.class_time()) {
@@ -204,15 +260,15 @@ pub fn convert_to_list(
204
260
  ParquetValue::Boolean(v)
205
261
  }
206
262
  ParquetSchemaType::Date32 => {
207
- let v = convert_to_date32(item_value)?;
263
+ let v = convert_to_date32(item_value, list_field.format)?;
208
264
  ParquetValue::Date32(v)
209
265
  }
210
266
  ParquetSchemaType::TimestampMillis => {
211
- let v = convert_to_timestamp_millis(item_value)?;
267
+ let v = convert_to_timestamp_millis(item_value, list_field.format)?;
212
268
  ParquetValue::TimestampMillis(v, None)
213
269
  }
214
270
  ParquetSchemaType::TimestampMicros => {
215
- let v = convert_to_timestamp_micros(item_value)?;
271
+ let v = convert_to_timestamp_micros(item_value, list_field.format)?;
216
272
  ParquetValue::TimestampMicros(v, None)
217
273
  }
218
274
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -310,15 +366,15 @@ pub fn convert_to_map(
310
366
  ParquetValue::Boolean(v)
311
367
  }
312
368
  ParquetSchemaType::Date32 => {
313
- let v = convert_to_date32(value)?;
369
+ let v = convert_to_date32(value, map_field.format)?;
314
370
  ParquetValue::Date32(v)
315
371
  }
316
372
  ParquetSchemaType::TimestampMillis => {
317
- let v = convert_to_timestamp_millis(value)?;
373
+ let v = convert_to_timestamp_millis(value, map_field.format)?;
318
374
  ParquetValue::TimestampMillis(v, None)
319
375
  }
320
376
  ParquetSchemaType::TimestampMicros => {
321
- let v = convert_to_timestamp_micros(value)?;
377
+ let v = convert_to_timestamp_micros(value, map_field.format)?;
322
378
  ParquetValue::TimestampMicros(v, None)
323
379
  }
324
380
  ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
@@ -592,12 +648,12 @@ pub fn convert_parquet_values_to_arrow(
592
648
  };
593
649
 
594
650
  let mut list_builder = ListBuilder::new(value_builder);
651
+
595
652
  for value in values {
596
653
  match value {
597
654
  ParquetValue::List(items) => {
598
- list_builder.append(true);
599
655
  for item in items {
600
- match list_field.item_type {
656
+ match &list_field.item_type {
601
657
  ParquetSchemaType::Int8 => append_list_value_copy!(
602
658
  list_builder,
603
659
  ParquetSchemaType::Int8,
@@ -12,16 +12,17 @@ use tempfile::NamedTempFile;
12
12
  use crate::types::{ListField, MapField, ParquetSchemaType};
13
13
 
14
14
  #[derive(Debug)]
15
- pub struct SchemaField {
15
+ pub struct SchemaField<'a> {
16
16
  pub name: String,
17
- pub type_: ParquetSchemaType,
17
+ pub type_: ParquetSchemaType<'a>,
18
+ pub format: Option<String>,
18
19
  }
19
20
 
20
21
  #[derive(Debug)]
21
- pub struct ParquetWriteArgs {
22
+ pub struct ParquetWriteArgs<'a> {
22
23
  pub read_from: Value,
23
24
  pub write_to: Value,
24
- pub schema: Vec<SchemaField>,
25
+ pub schema: Vec<SchemaField<'a>>,
25
26
  pub batch_size: Option<usize>,
26
27
  }
27
28
 
@@ -51,7 +52,7 @@ impl Write for IoLikeValue {
51
52
  }
52
53
  }
53
54
 
54
- impl FromStr for ParquetSchemaType {
55
+ impl<'a> FromStr for ParquetSchemaType<'a> {
55
56
  type Err = MagnusError;
56
57
 
57
58
  fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -74,10 +75,12 @@ impl FromStr for ParquetSchemaType {
74
75
  "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
75
76
  "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
76
77
  item_type: ParquetSchemaType::Int8,
78
+ format: None,
77
79
  }))),
78
80
  "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
79
81
  key_type: ParquetSchemaType::String,
80
82
  value_type: ParquetSchemaType::Int8,
83
+ format: None,
81
84
  }))),
82
85
  _ => Err(MagnusError::new(
83
86
  magnus::exception::runtime_error(),
@@ -87,7 +90,7 @@ impl FromStr for ParquetSchemaType {
87
90
  }
88
91
  }
89
92
 
90
- impl TryConvert for ParquetSchemaType {
93
+ impl<'a> TryConvert for ParquetSchemaType<'a> {
91
94
  fn try_convert(value: Value) -> Result<Self, MagnusError> {
92
95
  let ruby = unsafe { Ruby::get_unchecked() };
93
96
  let schema_type = parse_string_or_symbol(&ruby, value)?;
@@ -98,7 +101,7 @@ impl TryConvert for ParquetSchemaType {
98
101
 
99
102
  // We know this type is safe to move between threads because it's just an enum
100
103
  // with simple primitive types and strings
101
- unsafe impl Send for ParquetSchemaType {}
104
+ unsafe impl<'a> Send for ParquetSchemaType<'a> {}
102
105
 
103
106
  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
104
107
  if value.is_nil() {
@@ -162,17 +165,19 @@ impl From<ParquetErrorWrapper> for MagnusError {
162
165
  }
163
166
  }
164
167
 
165
- pub struct ColumnCollector {
168
+ pub struct ColumnCollector<'a> {
166
169
  pub name: String,
167
- pub type_: ParquetSchemaType,
170
+ pub type_: ParquetSchemaType<'a>,
171
+ pub format: Option<String>,
168
172
  pub values: Vec<crate::types::ParquetValue>,
169
173
  }
170
174
 
171
- impl ColumnCollector {
172
- pub fn new(name: String, type_: ParquetSchemaType) -> Self {
175
+ impl<'a> ColumnCollector<'a> {
176
+ pub fn new(name: String, type_: ParquetSchemaType<'a>, format: Option<String>) -> Self {
173
177
  Self {
174
178
  name,
175
179
  type_,
180
+ format,
176
181
  values: Vec::new(),
177
182
  }
178
183
  }
@@ -185,6 +190,11 @@ impl ColumnCollector {
185
190
  NumericConverter,
186
191
  };
187
192
 
193
+ if value.is_nil() {
194
+ self.values.push(ParquetValue::Null);
195
+ return Ok(());
196
+ }
197
+
188
198
  let parquet_value = match &self.type_ {
189
199
  ParquetSchemaType::Int8 => {
190
200
  let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
@@ -239,15 +249,15 @@ impl ColumnCollector {
239
249
  ParquetValue::Boolean(v)
240
250
  }
241
251
  ParquetSchemaType::Date32 => {
242
- let v = convert_to_date32(value)?;
252
+ let v = convert_to_date32(value, self.format.as_deref())?;
243
253
  ParquetValue::Date32(v)
244
254
  }
245
255
  ParquetSchemaType::TimestampMillis => {
246
- let v = convert_to_timestamp_millis(value)?;
256
+ let v = convert_to_timestamp_millis(value, self.format.as_deref())?;
247
257
  ParquetValue::TimestampMillis(v, None)
248
258
  }
249
259
  ParquetSchemaType::TimestampMicros => {
250
- let v = convert_to_timestamp_micros(value)?;
260
+ let v = convert_to_timestamp_micros(value, self.format.as_deref())?;
251
261
  ParquetValue::TimestampMicros(v, None)
252
262
  }
253
263
  ParquetSchemaType::List(list_field) => {
@@ -39,7 +39,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
39
39
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
40
40
  let (to_read,) = parsed_args.required;
41
41
 
42
- let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
42
+ let kwargs = get_kwargs::<_, (), (Option<Option<Value>>, Option<Option<Vec<String>>>), ()>(
43
43
  parsed_args.keywords,
44
44
  &[],
45
45
  &["result_type", "columns"],
@@ -48,6 +48,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
48
48
  let result_type: ParserResultType = match kwargs
49
49
  .optional
50
50
  .0
51
+ .flatten()
51
52
  .map(|value| parse_string_or_symbol(ruby, value))
52
53
  {
53
54
  Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -75,7 +76,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
75
76
  Ok(ParquetRowsArgs {
76
77
  to_read,
77
78
  result_type,
78
- columns: kwargs.optional.1,
79
+ columns: kwargs.optional.1.flatten(),
79
80
  })
80
81
  }
81
82
 
@@ -95,7 +96,16 @@ pub fn parse_parquet_columns_args(
95
96
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
96
97
  let (to_read,) = parsed_args.required;
97
98
 
98
- let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
99
+ let kwargs = get_kwargs::<
100
+ _,
101
+ (),
102
+ (
103
+ Option<Option<Value>>,
104
+ Option<Option<Vec<String>>>,
105
+ Option<Option<usize>>,
106
+ ),
107
+ (),
108
+ >(
99
109
  parsed_args.keywords,
100
110
  &[],
101
111
  &["result_type", "columns", "batch_size"],
@@ -104,6 +114,7 @@ pub fn parse_parquet_columns_args(
104
114
  let result_type: ParserResultType = match kwargs
105
115
  .optional
106
116
  .0
117
+ .flatten()
107
118
  .map(|value| parse_string_or_symbol(ruby, value))
108
119
  {
109
120
  Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
@@ -131,7 +142,7 @@ pub fn parse_parquet_columns_args(
131
142
  Ok(ParquetColumnsArgs {
132
143
  to_read,
133
144
  result_type,
134
- columns: kwargs.optional.1,
135
- batch_size: kwargs.optional.2,
145
+ columns: kwargs.optional.1.flatten(),
146
+ batch_size: kwargs.optional.2.flatten(),
136
147
  })
137
148
  }
@@ -28,7 +28,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
28
28
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
29
29
  let (read_from,) = parsed_args.required;
30
30
 
31
- let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
31
+ let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
32
32
  parsed_args.keywords,
33
33
  &["schema", "write_to"],
34
34
  &["batch_size"],
@@ -59,18 +59,52 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
59
59
  ));
60
60
  }
61
61
 
62
- let (name, type_str) = &entries[0];
62
+ let (name, type_value) = &entries[0];
63
63
  let name = String::try_convert(name.clone())?;
64
- let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
65
64
 
66
- schema.push(SchemaField { name, type_ });
65
+ let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
66
+ let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
67
+ let mut type_str = None;
68
+ let mut format_str = None;
69
+
70
+ for (key, value) in type_hash {
71
+ let key = String::try_convert(key)?;
72
+ match key.as_str() {
73
+ "type" => type_str = Some(value),
74
+ "format" => format_str = Some(String::try_convert(value)?),
75
+ _ => {
76
+ return Err(MagnusError::new(
77
+ magnus::exception::type_error(),
78
+ format!("Unknown key '{}' in type definition", key),
79
+ ))
80
+ }
81
+ }
82
+ }
83
+
84
+ let type_str = type_str.ok_or_else(|| {
85
+ MagnusError::new(
86
+ magnus::exception::type_error(),
87
+ "Missing 'type' in type definition",
88
+ )
89
+ })?;
90
+
91
+ (ParquetSchemaType::try_convert(type_str)?, format_str)
92
+ } else {
93
+ (ParquetSchemaType::try_convert(type_value.clone())?, None)
94
+ };
95
+
96
+ schema.push(SchemaField {
97
+ name,
98
+ type_,
99
+ format,
100
+ });
67
101
  }
68
102
 
69
103
  Ok(ParquetWriteArgs {
70
104
  read_from,
71
105
  write_to: kwargs.required.1,
72
106
  schema,
73
- batch_size: kwargs.optional.0,
107
+ batch_size: kwargs.optional.0.flatten(),
74
108
  })
75
109
  }
76
110
 
@@ -130,7 +164,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
130
164
  // Create collectors for each column
131
165
  let mut column_collectors: Vec<ColumnCollector> = schema
132
166
  .into_iter()
133
- .map(|field| ColumnCollector::new(field.name, field.type_))
167
+ .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
134
168
  .collect();
135
169
 
136
170
  let mut rows_in_batch = 0;
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.7"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,4 @@
1
- # typed: strict
1
+ # typed: true
2
2
 
3
3
  module Parquet
4
4
  # Options:
@@ -7,13 +7,20 @@ module Parquet
7
7
  # ("hash" or "array" or :hash or :array)
8
8
  # - `columns`: When present, only the specified columns will be included in the output.
9
9
  # This is useful for reducing how much data is read and improving performance.
10
+ sig do
11
+ params(
12
+ input: T.any(String, File, StringIO, IO),
13
+ result_type: T.nilable(T.any(String, Symbol)),
14
+ columns: T.nilable(T::Array[String])
15
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
16
+ end
10
17
  sig do
11
18
  params(
12
19
  input: T.any(String, File, StringIO, IO),
13
20
  result_type: T.nilable(T.any(String, Symbol)),
14
21
  columns: T.nilable(T::Array[String]),
15
22
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
16
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
23
+ ).returns(NilClass)
17
24
  end
18
25
  def self.each_row(input, result_type: nil, columns: nil, &blk)
19
26
  end
@@ -24,6 +31,14 @@ module Parquet
24
31
  # ("hash" or "array" or :hash or :array)
25
32
  # - `columns`: When present, only the specified columns will be included in the output.
26
33
  # - `batch_size`: When present, specifies the number of rows per batch
34
+ sig do
35
+ params(
36
+ input: T.any(String, File, StringIO, IO),
37
+ result_type: T.nilable(T.any(String, Symbol)),
38
+ columns: T.nilable(T::Array[String]),
39
+ batch_size: T.nilable(Integer)
40
+ ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
41
+ end
27
42
  sig do
28
43
  params(
29
44
  input: T.any(String, File, StringIO, IO),
@@ -32,14 +47,22 @@ module Parquet
32
47
  batch_size: T.nilable(Integer),
33
48
  blk:
34
49
  T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
35
- ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
50
+ ).returns(NilClass)
36
51
  end
37
52
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
38
53
  end
39
54
 
40
55
  # Options:
41
56
  # - `read_from`: An Enumerator yielding arrays of values representing each row
42
- # - `schema`: Array of hashes specifying column names and types
57
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
58
+ # - `int8`, `int16`, `int32`, `int64`
59
+ # - `uint8`, `uint16`, `uint32`, `uint64`
60
+ # - `float`, `double`
61
+ # - `string`
62
+ # - `binary`
63
+ # - `boolean`
64
+ # - `date32`
65
+ # - `timestamp_millis`, `timestamp_micros`
43
66
  # - `write_to`: String path or IO object to write the parquet file to
44
67
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
45
68
  sig do
@@ -55,7 +78,16 @@ module Parquet
55
78
 
56
79
  # Options:
57
80
  # - `read_from`: An Enumerator yielding arrays of column batches
58
- # - `schema`: Array of hashes specifying column names and types
81
+ # - `schema`: Array of hashes specifying column names and types. Supported types:
82
+ # - `int8`, `int16`, `int32`, `int64`
83
+ # - `uint8`, `uint16`, `uint32`, `uint64`
84
+ # - `float`, `double`
85
+ # - `string`
86
+ # - `binary`
87
+ # - `boolean`
88
+ # - `date32`
89
+ # - `timestamp_millis`, `timestamp_micros`
90
+ # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
59
91
  # - `write_to`: String path or IO object to write the parquet file to
60
92
  sig do
61
93
  params(
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko