parquet 0.5.10 → 0.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 114891cfa5fa190e1f00d44803327f1c90cc11f64ba23f7f2a9cc9f9379da787
4
- data.tar.gz: 9168b2be960faa93ce9c84d170c6e8f73819535bcedbf3d3b26869ff9829ecc6
3
+ metadata.gz: 4f2474bf56190257826281d5135739d010ad3a8e51a30eea807d03fc147f7300
4
+ data.tar.gz: 880084ad0ceb3836195588ce834583359ecf65304e826fd3b025590b960fed37
5
5
  SHA512:
6
- metadata.gz: f07f99a188ac5fa0663616fba00b1990a2cbd6bb14462383915f0e1617c26c5ca481840c16179958f2b3760b334f176e2e4542d95e3cc922379948ac2b0bfa61
7
- data.tar.gz: 42c7b0779d6e3fa46addc5fa92420f326418a54962d391e9b063db8378f8a5f8c2916b43f356649fc127e8fc582aa1e98d7afd71f0bc5f9700a0664ed46313f6
6
+ metadata.gz: 71348a0d7a46fdb32467a15466201898f8752ec232fe279d30e30631a0d876a639474cfc492c92316c131d7ed057ded4eb9d8a2cbb4b13fb81ebd46de806aa51
7
+ data.tar.gz: 476b7f307813c3163088557b0b5af62117a9ccb3bb964d5d5e072d2634a9ad11eab3712b4d6585f7007fbbd5f872ebf4d2d5b296de885048a04f6313209dd179
data/README.md CHANGED
@@ -265,6 +265,7 @@ The following data types are supported in the schema:
265
265
  - `boolean`
266
266
  - `date32`
267
267
  - `timestamp_millis`, `timestamp_micros`
268
+ - `time_millis`, `time_micros`
268
269
 
269
270
  ### Schema DSL for Complex Data Types
270
271
 
@@ -12,81 +12,27 @@ use magnus::value::ReprValue;
12
12
  use magnus::{Error as MagnusError, Ruby, Value};
13
13
 
14
14
  use crate::header_cache::StringCache;
15
- use crate::logger::RubyLogger;
16
15
  use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
17
16
  use crate::types::{ParquetGemError, TryIntoValue};
18
17
  use crate::ColumnRecord;
19
18
 
20
- use super::format_detector::{detect_file_format, detect_format_from_extension, FileFormat};
21
-
22
- /// Represents the different data sources we can open
23
- pub enum DataSource {
24
- Parquet(Either<File, ThreadSafeRubyReader>),
25
- Arrow(Either<File, ThreadSafeRubyReader>),
26
- }
27
-
28
- /// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
29
- pub fn open_data_source(
19
+ /// Opens a parquet file or IO-like object for reading
20
+ ///
21
+ /// This function handles both file paths (as strings) and IO-like objects,
22
+ /// returning either a File or a ThreadSafeRubyReader that can be used with
23
+ /// parquet readers.
24
+ pub fn open_parquet_source(
30
25
  ruby: Rc<Ruby>,
31
26
  to_read: Value,
32
- ruby_logger: &RubyLogger,
33
- ) -> Result<DataSource, ParquetGemError> {
27
+ ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
34
28
  if to_read.is_kind_of(ruby.class_string()) {
35
29
  let path_string = to_read.to_r_string()?;
36
30
  let file_path = unsafe { path_string.as_str()? };
37
-
38
- // Try to detect format from extension first
39
- let format_hint = detect_format_from_extension(file_path);
40
-
41
- let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
42
-
43
- // Detect actual format from file content
44
- let format = detect_file_format(&mut file)?;
45
-
46
- // Warn if extension doesn't match content
47
- if let Some(hint) = format_hint {
48
- if hint != format {
49
- ruby_logger.warn(|| {
50
- format!(
51
- "Extension implied format {:?} but actual format is {:?}",
52
- hint, format
53
- )
54
- })?;
55
- }
56
- }
57
-
58
- match format {
59
- FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
60
- FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
61
- }
31
+ let file = File::open(file_path).map_err(ParquetGemError::from)?;
32
+ Ok(Either::Left(file))
62
33
  } else {
63
- // For IO-like objects, we need to use a temporary file
64
- use std::io::{Read, Write};
65
- use tempfile::NamedTempFile;
66
-
67
- let mut readable = RubyReader::new(ruby.clone(), to_read)?;
68
- let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
69
-
70
- // Copy the entire content to the temporary file
71
- let mut buffer = vec![0u8; 8192];
72
- loop {
73
- let bytes_read = readable.read(&mut buffer)?;
74
- if bytes_read == 0 {
75
- break;
76
- }
77
- temp_file.write_all(&buffer[..bytes_read])?;
78
- }
79
- temp_file.flush()?;
80
-
81
- // Detect format from the temporary file
82
- let mut file = temp_file.reopen()?;
83
- let format = detect_file_format(&mut file)?;
84
-
85
- // Use the temporary file as the source
86
- match format {
87
- FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
88
- FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
89
- }
34
+ let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
35
+ Ok(Either::Right(readable))
90
36
  }
91
37
  }
92
38
 
@@ -1,6 +1,4 @@
1
- mod arrow_reader;
2
1
  mod common;
3
- mod format_detector;
4
2
  mod parquet_column_reader;
5
3
  mod parquet_row_reader;
6
4
  mod unified;
@@ -190,10 +188,7 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
190
188
  if args.len() != 1 {
191
189
  return Err(MagnusError::new(
192
190
  magnus::exception::arg_error(),
193
- format!(
194
- "metadata expects exactly 1 argument (file path or IO-like object), got {}",
195
- args.len()
196
- ),
191
+ format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
197
192
  ));
198
193
  }
199
194
 
@@ -213,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
213
208
  let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
214
209
 
215
210
  Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
216
- }
211
+ }
@@ -13,11 +13,8 @@ use std::collections::HashMap;
13
13
  use std::rc::Rc;
14
14
  use std::sync::OnceLock;
15
15
 
16
- use super::arrow_reader::{
17
- process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
18
- };
19
16
  use super::common::{
20
- create_batch_reader, handle_block_or_enum, handle_empty_file, open_data_source, DataSource,
17
+ create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
21
18
  };
22
19
  use crate::types::ArrayWrapper;
23
20
 
@@ -103,99 +100,34 @@ pub fn parse_parquet_unified(
103
100
  }
104
101
  }
105
102
 
106
- // Open the data source and detect format
107
- let source = open_data_source(ruby.clone(), to_read, &ruby_logger)?;
103
+ // Open the Parquet source
104
+ let source = open_parquet_source(ruby.clone(), to_read)?;
108
105
 
109
- // Based on the source format and parser type, handle the data differently
110
- match (source, &parser_type) {
111
- (DataSource::Parquet(reader), ParserType::Row { strict }) => {
112
- // Handle Parquet row-based parsing
106
+ // Based on the parser type, handle the data differently
107
+ match parser_type {
108
+ ParserType::Row { strict } => {
109
+ // Handle row-based parsing
113
110
  process_row_data(
114
111
  ruby.clone(),
115
- reader,
112
+ source,
116
113
  &columns,
117
114
  result_type,
118
- *strict,
115
+ strict,
119
116
  &ruby_logger,
120
117
  )?;
121
118
  }
122
- (DataSource::Parquet(reader), ParserType::Column { batch_size, strict }) => {
123
- // Handle Parquet column-based parsing
119
+ ParserType::Column { batch_size, strict } => {
120
+ // Handle column-based parsing
124
121
  process_column_data(
125
122
  ruby.clone(),
126
- reader,
123
+ source,
127
124
  &columns,
128
125
  result_type,
129
- *batch_size,
130
- *strict,
126
+ batch_size,
127
+ strict,
131
128
  &ruby_logger,
132
129
  )?;
133
130
  }
134
- (DataSource::Arrow(reader), ParserType::Row { strict }) => {
135
- // Handle Arrow row-based parsing
136
- match reader {
137
- Either::Left(file) => {
138
- // For seekable files, use FileReader which handles IPC file format
139
- use arrow_ipc::reader::FileReader;
140
- let file_reader = FileReader::try_new(file, None)
141
- .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
142
-
143
- use super::arrow_reader::process_arrow_file_row_data;
144
- process_arrow_file_row_data(
145
- ruby.clone(),
146
- file_reader,
147
- &columns,
148
- result_type,
149
- *strict,
150
- &ruby_logger,
151
- )?;
152
- }
153
- Either::Right(readable) => {
154
- use arrow_ipc::reader::StreamReader;
155
- let stream_reader = StreamReader::try_new(readable, None)
156
- .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
157
- process_arrow_row_data(
158
- ruby.clone(),
159
- stream_reader,
160
- &columns,
161
- result_type,
162
- *strict,
163
- &ruby_logger,
164
- )?;
165
- }
166
- }
167
- }
168
- (DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
169
- // Handle Arrow column-based parsing
170
- match reader {
171
- Either::Left(file) => {
172
- // For seekable files, we can use the optimized FileReader
173
- process_arrow_file_column_data(
174
- ruby.clone(),
175
- file,
176
- &columns,
177
- result_type,
178
- *batch_size,
179
- *strict,
180
- &ruby_logger,
181
- )?;
182
- }
183
- Either::Right(readable) => {
184
- use arrow_ipc::reader::StreamReader;
185
- let stream_reader = StreamReader::try_new(readable, None)
186
- .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
187
- process_arrow_column_data(
188
- ruby.clone(),
189
- stream_reader,
190
- &columns,
191
- result_type,
192
- *batch_size,
193
- *strict,
194
- &ruby_logger,
195
- )?;
196
- }
197
- }
198
- }
199
131
  }
200
132
 
201
133
  Ok(ruby.qnil().into_value_with(&ruby))
@@ -115,4 +115,6 @@ pub enum PrimitiveType {
115
115
  Date32,
116
116
  TimestampMillis,
117
117
  TimestampMicros,
118
+ TimeMillis,
119
+ TimeMicros,
118
120
  }
@@ -25,9 +25,9 @@ use arrow_array::cast::downcast_array;
25
25
  use arrow_array::{
26
26
  Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
27
27
  Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
28
- ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
29
- TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
30
- UInt32Array, UInt64Array, UInt8Array,
28
+ ListArray, NullArray, StringArray, StructArray, Time32MillisecondArray, Time64MicrosecondArray,
29
+ TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
30
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
31
31
  };
32
32
  use arrow_schema::{DataType, TimeUnit};
33
33
  use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
@@ -55,10 +55,6 @@ pub enum ParquetGemError {
55
55
  Parquet(#[from] parquet::errors::ParquetError),
56
56
  #[error("Arrow error: {0}")]
57
57
  Arrow(#[from] arrow_schema::ArrowError),
58
- #[error("Arrow IPC error: {0}")]
59
- ArrowIpc(String),
60
- #[error("Unknown file format")]
61
- UnknownFormat,
62
58
  #[error("UTF-8 error: {0}")]
63
59
  Utf8Error(#[from] simdutf8::basic::Utf8Error),
64
60
  #[error("Jiff error: {0}")]
@@ -29,6 +29,8 @@ pub enum ParquetValue {
29
29
  TimestampMillis(i64, Option<Arc<str>>),
30
30
  TimestampMicros(i64, Option<Arc<str>>),
31
31
  TimestampNanos(i64, Option<Arc<str>>),
32
+ TimeMillis(i32), // Time of day in milliseconds since midnight
33
+ TimeMicros(i64), // Time of day in microseconds since midnight
32
34
  List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
33
35
  // We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
34
36
  Map(HashMap<ParquetValue, ParquetValue>),
@@ -108,6 +110,8 @@ impl PartialEq for ParquetValue {
108
110
  (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
109
111
  (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
110
112
  (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
113
+ (ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
114
+ (ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
111
115
  (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
112
116
  (ParquetValue::Null, ParquetValue::Null) => true,
113
117
  _ => false,
@@ -160,6 +164,8 @@ impl std::hash::Hash for ParquetValue {
160
164
  ts.hash(state);
161
165
  tz.hash(state);
162
166
  }
167
+ ParquetValue::TimeMillis(t) => t.hash(state),
168
+ ParquetValue::TimeMicros(t) => t.hash(state),
163
169
  ParquetValue::List(l) => l.hash(state),
164
170
  ParquetValue::Map(m) => {
165
171
  for (k, v) in m {
@@ -224,6 +230,38 @@ impl TryIntoValue for ParquetValue {
224
230
  timestamp @ ParquetValue::TimestampNanos(_, _) => {
225
231
  impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
226
232
  }
233
+ ParquetValue::TimeMillis(millis) => {
234
+ // Convert time of day in milliseconds to a Ruby Time object
235
+ // Use epoch date (1970-01-01) with the given time
236
+ let total_seconds = millis / 1000;
237
+ let ms = millis % 1000;
238
+ let hours = total_seconds / 3600;
239
+ let minutes = (total_seconds % 3600) / 60;
240
+ let seconds = total_seconds % 60;
241
+
242
+ // Create a Time object for 1970-01-01 with the given time
243
+ let time_class = handle.class_time();
244
+ let time = time_class.funcall::<_, _, Value>(
245
+ "new",
246
+ (1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
247
+ )?;
248
+ Ok(time.into_value_with(handle))
249
+ }
250
+ ParquetValue::TimeMicros(micros) => {
251
+ // Convert time of day in microseconds to a Ruby Time object
252
+ // Use epoch date (1970-01-01) with the given time
253
+ let total_seconds = micros / 1_000_000;
254
+ let us = micros % 1_000_000;
255
+ let hours = total_seconds / 3600;
256
+ let minutes = (total_seconds % 3600) / 60;
257
+ let seconds = total_seconds % 60;
258
+
259
+ // Create a Time object for 1970-01-01 with the given time
260
+ let time_class = handle.class_time();
261
+ let time = time_class
262
+ .funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
263
+ Ok(time.into_value_with(handle))
264
+ }
227
265
  ParquetValue::List(l) => {
228
266
  // For lists, convert to Ruby array and check for specific cases
229
267
  // when we might need to return nil instead of an empty array
@@ -356,12 +394,32 @@ impl ParquetValue {
356
394
  Ok(ParquetValue::Date32(v))
357
395
  }
358
396
  PrimitiveType::TimestampMillis => {
359
- let v = convert_to_timestamp_millis(ruby, value, format)?;
360
- Ok(ParquetValue::TimestampMillis(v, None))
397
+ if value.is_kind_of(ruby.class_time()) {
398
+ use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
399
+ let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
400
+ Ok(ParquetValue::TimestampMillis(v, tz))
401
+ } else {
402
+ let v = convert_to_timestamp_millis(ruby, value, format)?;
403
+ Ok(ParquetValue::TimestampMillis(v, None))
404
+ }
361
405
  }
362
406
  PrimitiveType::TimestampMicros => {
363
- let v = convert_to_timestamp_micros(ruby, value, format)?;
364
- Ok(ParquetValue::TimestampMicros(v, None))
407
+ if value.is_kind_of(ruby.class_time()) {
408
+ use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
409
+ let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
410
+ Ok(ParquetValue::TimestampMicros(v, tz))
411
+ } else {
412
+ let v = convert_to_timestamp_micros(ruby, value, format)?;
413
+ Ok(ParquetValue::TimestampMicros(v, None))
414
+ }
415
+ }
416
+ PrimitiveType::TimeMillis => {
417
+ let v = convert_to_time_millis(ruby, value, format)?;
418
+ Ok(ParquetValue::TimeMillis(v))
419
+ }
420
+ PrimitiveType::TimeMicros => {
421
+ let v = convert_to_time_micros(ruby, value, format)?;
422
+ Ok(ParquetValue::TimeMicros(v))
365
423
  }
366
424
  },
367
425
  ParquetSchemaType::List(list_field) => {
@@ -980,6 +1038,52 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
980
1038
  tz
981
1039
  )
982
1040
  }
1041
+ DataType::Time32(TimeUnit::Millisecond) => {
1042
+ let array = downcast_array::<Time32MillisecondArray>(column.array);
1043
+ Ok(ParquetValueVec(if array.is_nullable() {
1044
+ array
1045
+ .values()
1046
+ .iter()
1047
+ .enumerate()
1048
+ .map(|(i, x)| {
1049
+ if array.is_null(i) {
1050
+ ParquetValue::Null
1051
+ } else {
1052
+ ParquetValue::TimeMillis(*x)
1053
+ }
1054
+ })
1055
+ .collect()
1056
+ } else {
1057
+ array
1058
+ .values()
1059
+ .iter()
1060
+ .map(|x| ParquetValue::TimeMillis(*x))
1061
+ .collect()
1062
+ }))
1063
+ }
1064
+ DataType::Time64(TimeUnit::Microsecond) => {
1065
+ let array = downcast_array::<Time64MicrosecondArray>(column.array);
1066
+ Ok(ParquetValueVec(if array.is_nullable() {
1067
+ array
1068
+ .values()
1069
+ .iter()
1070
+ .enumerate()
1071
+ .map(|(i, x)| {
1072
+ if array.is_null(i) {
1073
+ ParquetValue::Null
1074
+ } else {
1075
+ ParquetValue::TimeMicros(*x)
1076
+ }
1077
+ })
1078
+ .collect()
1079
+ } else {
1080
+ array
1081
+ .values()
1082
+ .iter()
1083
+ .map(|x| ParquetValue::TimeMicros(*x))
1084
+ .collect()
1085
+ }))
1086
+ }
983
1087
  DataType::Float16 => {
984
1088
  let array = downcast_array::<Float16Array>(column.array);
985
1089
  if array.is_nullable() {
@@ -295,6 +295,8 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
295
295
  "date" | "date32" => Some(PrimitiveType::Date32),
296
296
  "timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
297
297
  "timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
298
+ "time_millis" | "time_ms" => Some(PrimitiveType::TimeMillis),
299
+ "time_micros" | "time_us" => Some(PrimitiveType::TimeMicros),
298
300
  "decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
299
301
  "decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
300
302
  _ => None,
@@ -337,6 +339,12 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
337
339
  PrimitiveType::TimestampMicros => {
338
340
  ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
339
341
  }
342
+ PrimitiveType::TimeMillis => {
343
+ ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond)
344
+ }
345
+ PrimitiveType::TimeMicros => {
346
+ ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond)
347
+ }
340
348
  };
341
349
  ArrowField::new(name, dt, *nullable)
342
350
  }