parquet 0.0.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
2
+ use crate::{
3
+ create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
4
+ ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
5
+ };
6
+ use ahash::RandomState;
7
+ use magnus::rb_sys::AsRawValue;
8
+ use magnus::value::{Opaque, ReprValue};
9
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
10
+ use parquet::file::reader::{FileReader, SerializedFileReader};
11
+ use parquet::record::reader::RowIter as ParquetRowIter;
12
+ use parquet::schema::types::{Type as SchemaType, TypePtr};
13
+ use std::collections::HashMap;
14
+ use std::fs::File;
15
+ use std::mem::ManuallyDrop;
16
+ use std::os::fd::FromRawFd;
17
+ use std::sync::OnceLock;
18
+
19
+ #[inline]
20
+ pub fn parse_parquet_rows<'a>(
21
+ rb_self: Value,
22
+ args: &[Value],
23
+ ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
24
+ let ruby = unsafe { Ruby::get_unchecked() };
25
+
26
+ let ParquetRowsArgs {
27
+ to_read,
28
+ result_type,
29
+ columns,
30
+ } = parse_parquet_rows_args(&ruby, args)?;
31
+
32
+ if !ruby.block_given() {
33
+ return create_row_enumerator(RowEnumeratorArgs {
34
+ rb_self,
35
+ to_read,
36
+ result_type,
37
+ columns,
38
+ });
39
+ }
40
+
41
+ let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
42
+ let path_string = to_read.to_r_string()?;
43
+ let file_path = unsafe { path_string.as_str()? };
44
+ let file = File::open(file_path).unwrap();
45
+ let reader = SerializedFileReader::new(file).unwrap();
46
+ let schema = reader.metadata().file_metadata().schema().clone();
47
+
48
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
49
+ } else if to_read.is_kind_of(ruby.class_io()) {
50
+ let raw_value = to_read.as_raw();
51
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
52
+ .map_err(|_| {
53
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
54
+ })?;
55
+
56
+ if fd < 0 {
57
+ return Err(ReaderError::InvalidFileDescriptor.into());
58
+ }
59
+
60
+ let file = unsafe { File::from_raw_fd(fd) };
61
+ let file = ForgottenFileHandle(ManuallyDrop::new(file));
62
+ let reader = SerializedFileReader::new(file).unwrap();
63
+ let schema = reader.metadata().file_metadata().schema().clone();
64
+
65
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
66
+ } else {
67
+ let readable = SeekableRubyValue(Opaque::from(to_read));
68
+ let reader = SerializedFileReader::new(readable).unwrap();
69
+ let schema = reader.metadata().file_metadata().schema().clone();
70
+
71
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
72
+ };
73
+
74
+ if let Some(cols) = columns {
75
+ let projection = create_projection_schema(&schema, &cols);
76
+ iter = iter.project(Some(projection.to_owned())).map_err(|e| {
77
+ MagnusError::new(
78
+ ruby.exception_runtime_error(),
79
+ format!("Failed to create projection: {}", e),
80
+ )
81
+ })?;
82
+ }
83
+
84
+ let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
85
+ ParserResultType::Hash => {
86
+ let headers = OnceLock::new();
87
+ let headers_clone = headers.clone();
88
+ let iter = iter
89
+ .filter_map(move |row| {
90
+ row.ok().map(|row| {
91
+ let headers = headers_clone.get_or_init(|| {
92
+ let column_count = row.get_column_iter().count();
93
+
94
+ let mut header_string = Vec::with_capacity(column_count);
95
+ for (k, _) in row.get_column_iter() {
96
+ header_string.push(k.to_owned());
97
+ }
98
+
99
+ let headers = StringCache::intern_many(&header_string).unwrap();
100
+
101
+ headers
102
+ });
103
+
104
+ let mut map =
105
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
106
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
107
+ map.insert(headers[i], ParquetField(v.clone()));
108
+ });
109
+ map
110
+ })
111
+ })
112
+ .map(RowRecord::Map);
113
+
114
+ Box::new(HeaderCacheCleanupIter {
115
+ inner: iter,
116
+ headers,
117
+ })
118
+ }
119
+ ParserResultType::Array => Box::new(
120
+ iter.filter_map(|row| {
121
+ row.ok().map(|row| {
122
+ let column_count = row.get_column_iter().count();
123
+ let mut vec = Vec::with_capacity(column_count);
124
+ row.get_column_iter()
125
+ .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
126
+ vec
127
+ })
128
+ })
129
+ .map(RowRecord::Vec),
130
+ ),
131
+ };
132
+
133
+ Ok(Yield::Iter(iter))
134
+ }
135
+
136
+ fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
137
+ if let SchemaType::GroupType { fields, .. } = schema {
138
+ let projected_fields: Vec<TypePtr> = fields
139
+ .iter()
140
+ .filter(|field| columns.contains(&field.name().to_string()))
141
+ .cloned()
142
+ .collect();
143
+
144
+ SchemaType::GroupType {
145
+ basic_info: schema.get_basic_info().clone(),
146
+ fields: projected_fields,
147
+ }
148
+ } else {
149
+ // Return original schema if not a group type
150
+ schema.clone()
151
+ }
152
+ }
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
14
14
  offset: usize,
15
15
  }
16
16
 
17
- pub trait SeekableRead: std::io::Read + Seek {}
18
- impl SeekableRead for RubyReader<Value> {}
19
- impl SeekableRead for RubyReader<RString> {}
17
/// Marker trait for readers that implement both `Read` and `Seek`.
pub trait SeekableRead: Read + Seek {}

/// Blanket implementation: any type that is `Read + Seek` is a `SeekableRead`.
impl<T> SeekableRead for T where T: Read + Seek {}
20
19
 
21
20
  pub fn build_ruby_reader(
22
21
  ruby: &Ruby,
@@ -0,0 +1,73 @@
1
/// Selects how each parsed row is represented.
///
/// The textual forms accepted by `TryFrom` and produced by `Display` are
/// `"hash"` and `"array"`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    /// Textual form: `"hash"`.
    Hash,
    /// Textual form: `"array"`.
    Array,
}

impl ParserResultType {
    /// Yields every variant exactly once: `Hash`, then `Array`.
    pub fn iter() -> impl Iterator<Item = Self> {
        IntoIterator::into_iter([Self::Hash, Self::Array])
    }
}

impl TryFrom<&str> for ParserResultType {
    type Error = String;

    /// Parses the exact (case-sensitive) strings `"hash"` and `"array"`.
    ///
    /// Any other input produces an error message that echoes the input.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        if value == "hash" {
            return Ok(Self::Hash);
        }
        if value == "array" {
            return Ok(Self::Array);
        }
        Err(format!("Invalid parser result type: {}", value))
    }
}

impl TryFrom<String> for ParserResultType {
    type Error = String;

    /// Delegates to the `&str` conversion.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        <Self as TryFrom<&str>>::try_from(&value)
    }
}

impl std::fmt::Display for ParserResultType {
    /// Writes the same lowercase name that `TryFrom<&str>` accepts.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Hash => "hash",
            Self::Array => "array",
        };
        f.write_str(label)
    }
}
41
+
42
/// Column type descriptor.
///
/// Most variants carry no payload. `List` and `Map` are recursive: they hold a
/// boxed [`ListField`] / [`MapField`], which in turn contain further
/// `ParquetSchemaType` values. The `Box` provides the indirection a recursive
/// type requires.
#[derive(Debug, Clone)]
pub enum ParquetSchemaType {
    // Payload-free variants.
    Int8,
    Int16,
    Int32,
    Int64,
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Float,
    Double,
    String,
    Binary,
    Boolean,
    Date32,
    TimestampMillis,
    TimestampMicros,
    // Nested variants.
    /// A list; the element type is described by the boxed [`ListField`].
    List(Box<ListField>),
    /// A map; key and value types are described by the boxed [`MapField`].
    Map(Box<MapField>),
}

/// Payload of [`ParquetSchemaType::List`].
#[derive(Debug, Clone)]
pub struct ListField {
    /// Type of each list element.
    pub item_type: ParquetSchemaType,
}

/// Payload of [`ParquetSchemaType::Map`].
#[derive(Debug, Clone)]
pub struct MapField {
    /// Type of the map keys.
    pub key_type: ParquetSchemaType,
    /// Type of the map values.
    pub value_type: ParquetSchemaType,
}
@@ -0,0 +1,30 @@
1
+ // Re-export all public items from submodules
2
+ mod core_types;
3
+ mod parquet_value;
4
+ mod record_types;
5
+ mod timestamp;
6
+ mod type_conversion;
7
+ mod writer_types;
8
+
9
+ pub use core_types::*;
10
+ pub use parquet_value::*;
11
+ pub use record_types::*;
12
+ pub use timestamp::*;
13
+ pub use type_conversion::*;
14
+ pub use writer_types::*;
15
+
16
+ // Common imports used across the module
17
+ use arrow_array::cast::downcast_array;
18
+ use arrow_array::{
19
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
20
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
21
+ StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
22
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
23
+ };
24
+ use arrow_schema::{DataType, TimeUnit};
25
+ use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
26
+ use parquet::data_type::Decimal;
27
+ use parquet::record::Field;
28
+ use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
29
+
30
+ use crate::header_cache::StringCacheKey;