parquet 0.0.4 → 0.2.5

@@ -0,0 +1,152 @@
+ use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+ use crate::{
+     create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+     ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+ };
+ use ahash::RandomState;
+ use magnus::rb_sys::AsRawValue;
+ use magnus::value::{Opaque, ReprValue};
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+ use parquet::file::reader::{FileReader, SerializedFileReader};
+ use parquet::record::reader::RowIter as ParquetRowIter;
+ use parquet::schema::types::{Type as SchemaType, TypePtr};
+ use std::collections::HashMap;
+ use std::fs::File;
+ use std::mem::ManuallyDrop;
+ use std::os::fd::FromRawFd;
+ use std::sync::OnceLock;
+
+ #[inline]
+ pub fn parse_parquet_rows<'a>(
+     rb_self: Value,
+     args: &[Value],
+ ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetRowsArgs {
+         to_read,
+         result_type,
+         columns,
+     } = parse_parquet_rows_args(&ruby, args)?;
+
+     if !ruby.block_given() {
+         return create_row_enumerator(RowEnumeratorArgs {
+             rb_self,
+             to_read,
+             result_type,
+             columns,
+         });
+     }
+
+     let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+         let path_string = to_read.to_r_string()?;
+         let file_path = unsafe { path_string.as_str()? };
+         let file = File::open(file_path).unwrap();
+         let reader = SerializedFileReader::new(file).unwrap();
+         let schema = reader.metadata().file_metadata().schema().clone();
+
+         (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+     } else if to_read.is_kind_of(ruby.class_io()) {
+         let raw_value = to_read.as_raw();
+         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+             .map_err(|_| {
+                 ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+             })?;
+
+         if fd < 0 {
+             return Err(ReaderError::InvalidFileDescriptor.into());
+         }
+
+         let file = unsafe { File::from_raw_fd(fd) };
+         let file = ForgottenFileHandle(ManuallyDrop::new(file));
+         let reader = SerializedFileReader::new(file).unwrap();
+         let schema = reader.metadata().file_metadata().schema().clone();
+
+         (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+     } else {
+         let readable = SeekableRubyValue(Opaque::from(to_read));
+         let reader = SerializedFileReader::new(readable).unwrap();
+         let schema = reader.metadata().file_metadata().schema().clone();
+
+         (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+     };
+
+     if let Some(cols) = columns {
+         let projection = create_projection_schema(&schema, &cols);
+         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+             MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!("Failed to create projection: {}", e),
+             )
+         })?;
+     }
+
+     let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+             let headers_clone = headers.clone();
+             let iter = iter
+                 .filter_map(move |row| {
+                     row.ok().map(|row| {
+                         let headers = headers_clone.get_or_init(|| {
+                             let column_count = row.get_column_iter().count();
+
+                             let mut header_string = Vec::with_capacity(column_count);
+                             for (k, _) in row.get_column_iter() {
+                                 header_string.push(k.to_owned());
+                             }
+
+                             let headers = StringCache::intern_many(&header_string).unwrap();
+
+                             headers
+                         });
+
+                         let mut map =
+                             HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                         row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                             map.insert(headers[i], ParquetField(v.clone()));
+                         });
+                         map
+                     })
+                 })
+                 .map(RowRecord::Map);
+
+             Box::new(HeaderCacheCleanupIter {
+                 inner: iter,
+                 headers,
+             })
+         }
+         ParserResultType::Array => Box::new(
+             iter.filter_map(|row| {
+                 row.ok().map(|row| {
+                     let column_count = row.get_column_iter().count();
+                     let mut vec = Vec::with_capacity(column_count);
+                     row.get_column_iter()
+                         .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                     vec
+                 })
+             })
+             .map(RowRecord::Vec),
+         ),
+     };
+
+     Ok(Yield::Iter(iter))
+ }
+
+ fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+     if let SchemaType::GroupType { fields, .. } = schema {
+         let projected_fields: Vec<TypePtr> = fields
+             .iter()
+             .filter(|field| columns.contains(&field.name().to_string()))
+             .cloned()
+             .collect();
+
+         SchemaType::GroupType {
+             basic_info: schema.get_basic_info().clone(),
+             fields: projected_fields,
+         }
+     } else {
+         // Return original schema if not a group type
+         schema.clone()
+     }
+ }
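
This new row parser dispatches on the input's Ruby class (path string, IO with a real file descriptor, or any other readable object) and then prunes the schema to the requested columns before decoding. The following is a minimal standalone sketch of the same projection technique against the upstream parquet crate directly, outside the gem; the file name "data.parquet" and the column name "id" are placeholders.

use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::reader::RowIter;
use parquet::schema::types::{Type as SchemaType, TypePtr};
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    let schema = reader.metadata().file_metadata().schema().clone();

    // Keep only the "id" column, mirroring create_projection_schema above.
    let projection = match &schema {
        SchemaType::GroupType { fields, .. } => {
            let projected: Vec<TypePtr> =
                fields.iter().filter(|f| f.name() == "id").cloned().collect();
            SchemaType::GroupType {
                basic_info: schema.get_basic_info().clone(),
                fields: projected,
            }
        }
        _ => schema.clone(),
    };

    // Only the projected column is decoded; rows arrive as Result<Row, _>.
    let iter = RowIter::from_file_into(Box::new(reader)).project(Some(projection))?;
    for row in iter {
        println!("{:?}", row?);
    }
    Ok(())
}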
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
      offset: usize,
  }
 
- pub trait SeekableRead: std::io::Read + Seek {}
- impl SeekableRead for RubyReader<Value> {}
- impl SeekableRead for RubyReader<RString> {}
+ pub trait SeekableRead: Read + Seek {}
+ impl<T: Read + Seek> SeekableRead for T {}
 
  pub fn build_ruby_reader(
      ruby: &Ruby,
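
This hunk swaps two hand-written impls for a blanket impl, so every type that is Read + Seek automatically satisfies SeekableRead. A small self-contained sketch of the consequence (separate from the gem's code):

use std::io::{Cursor, Read, Seek};

pub trait SeekableRead: Read + Seek {}
impl<T: Read + Seek> SeekableRead for T {}

fn boxed_reader() -> Box<dyn SeekableRead> {
    // std::io::Cursor gets the impl for free; previously each concrete
    // reader type needed its own `impl SeekableRead for ...` line.
    Box::new(Cursor::new(vec![1u8, 2, 3]))
}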
@@ -0,0 +1,73 @@
+ #[derive(Copy, Clone, Debug, PartialEq, Eq)]
+ pub enum ParserResultType {
+     Hash,
+     Array,
+ }
+
+ impl ParserResultType {
+     pub fn iter() -> impl Iterator<Item = Self> {
+         [Self::Hash, Self::Array].into_iter()
+     }
+ }
+
+ impl TryFrom<&str> for ParserResultType {
+     type Error = String;
+
+     fn try_from(value: &str) -> Result<Self, Self::Error> {
+         match value {
+             "hash" => Ok(ParserResultType::Hash),
+             "array" => Ok(ParserResultType::Array),
+             _ => Err(format!("Invalid parser result type: {}", value)),
+         }
+     }
+ }
+
+ impl TryFrom<String> for ParserResultType {
+     type Error = String;
+
+     fn try_from(value: String) -> Result<Self, Self::Error> {
+         Self::try_from(value.as_str())
+     }
+ }
+
+ impl std::fmt::Display for ParserResultType {
+     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+         match self {
+             ParserResultType::Hash => write!(f, "hash"),
+             ParserResultType::Array => write!(f, "array"),
+         }
+     }
+ }
+
+ #[derive(Debug, Clone)]
+ pub struct ListField {
+     pub item_type: ParquetSchemaType,
+ }
+
+ #[derive(Debug, Clone)]
+ pub struct MapField {
+     pub key_type: ParquetSchemaType,
+     pub value_type: ParquetSchemaType,
+ }
+
+ #[derive(Debug, Clone)]
+ pub enum ParquetSchemaType {
+     Int8,
+     Int16,
+     Int32,
+     Int64,
+     UInt8,
+     UInt16,
+     UInt32,
+     UInt64,
+     Float,
+     Double,
+     String,
+     Binary,
+     Boolean,
+     Date32,
+     TimestampMillis,
+     TimestampMicros,
+     List(Box<ListField>),
+     Map(Box<MapField>),
+ }
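
The two TryFrom impls give the gem one validation path for the Ruby-supplied result-type keyword, with the String impl deferring to the &str impl. A hypothetical test (not from the gem) showing the intended call site; the derived PartialEq and Debug make the assertions possible:

#[test]
fn parses_result_type() {
    assert_eq!(ParserResultType::try_from("hash"), Ok(ParserResultType::Hash));
    assert_eq!(
        ParserResultType::try_from("array".to_string()),
        Ok(ParserResultType::Array)
    );
    // Anything else surfaces the formatted error string.
    assert!(ParserResultType::try_from("tuple").is_err());
}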
@@ -0,0 +1,30 @@
+ // Re-export all public items from submodules
+ mod core_types;
+ mod parquet_value;
+ mod record_types;
+ mod timestamp;
+ mod type_conversion;
+ mod writer_types;
+
+ pub use core_types::*;
+ pub use parquet_value::*;
+ pub use record_types::*;
+ pub use timestamp::*;
+ pub use type_conversion::*;
+ pub use writer_types::*;
+
+ // Common imports used across the module
+ use arrow_array::cast::downcast_array;
+ use arrow_array::{
+     Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
+     Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
+     StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+     TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+ };
+ use arrow_schema::{DataType, TimeUnit};
+ use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
+ use parquet::data_type::Decimal;
+ use parquet::record::Field;
+ use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
+
+ use crate::header_cache::StringCacheKey;
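
With the glob re-exports above, downstream code imports everything through the parent module rather than naming each submodule. Assuming this file lives at types/mod.rs (the module path is not shown in the diff), a caller would write:

// Hypothetical downstream import; submodule names stay private.
use crate::types::{MapField, ParquetSchemaType, ParserResultType};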