parquet 0.0.4 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
-  data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
+  metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
+  data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
 SHA512:
-  metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
-  data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
+  metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
+  data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
data/Cargo.lock CHANGED
@@ -826,16 +826,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "kanal"
-version = "0.1.0-pre8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
-dependencies = [
- "futures-core",
- "lock_api",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -975,18 +965,6 @@ dependencies = [
  "twox-hash",
 ]
 
-[[package]]
-name = "magnus"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
-dependencies = [
- "magnus-macros",
- "rb-sys",
- "rb-sys-env",
- "seq-macro",
-]
-
 [[package]]
 name = "magnus"
 version = "0.7.1"
@@ -1203,13 +1181,10 @@ dependencies = [
  "itertools 0.14.0",
  "jemallocator",
  "jiff",
- "kanal",
- "magnus 0.7.1",
+ "magnus",
  "mimalloc",
  "parquet 54.0.0",
  "rb-sys",
- "serde",
- "serde_magnus",
  "thiserror",
 ]
 
@@ -1467,17 +1442,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "serde_magnus"
-version = "0.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
-dependencies = [
- "magnus 0.6.4",
- "serde",
- "tap",
-]
-
 [[package]]
 name = "shell-words"
 version = "1.1.0"
@@ -1566,12 +1530,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
 [[package]]
 name = "thiserror"
 version = "2.0.9"
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ gemspec
 
 group :development do
   gem "benchmark-ips", "~> 2.12"
-  # gem "polars-df"
+  gem "polars-df"
   gem "duckdb"
 end
 
data/ext/parquet/Cargo.toml CHANGED
@@ -8,19 +8,15 @@ crate-type = ["cdylib"]
 
 [dependencies]
 ahash = "0.8"
-parquet = { version = "^54.0", features = ["json", "object_store"] }
-arrow-schema = "54.0.0"
 arrow-array = "54.0.0"
+arrow-schema = "54.0.0"
 bytes = "^1.9"
-kanal = "0.1.0-pre8"
+itertools = "^0.14"
+jiff = "0.1.19"
 magnus = { version = "0.7", features = ["rb-sys"] }
+parquet = { version = "^54.0", features = ["json", "object_store"] }
 rb-sys = "^0.9"
-serde = { version = "1.0", features = ["derive"] }
-serde_magnus = "0.8.1"
 thiserror = "2.0"
-itertools = "^0.14"
-jiff = "0.1.19"
-
 
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
data/ext/parquet/src/enumerator.rs CHANGED
@@ -3,12 +3,12 @@ use magnus::{
     block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
 };
 
-use crate::{ColumnRecord, RowRecord};
+use crate::{ColumnRecord, ParserResultType, RowRecord};
 
 pub struct RowEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type: String,
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }
 
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
     args: RowEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
 pub struct ColumnEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type: String,
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
     args: ColumnEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
data/ext/parquet/src/lib.rs CHANGED
@@ -7,6 +7,9 @@ mod ruby_reader;
 mod types;
 mod utils;
 
+mod parquet_column_reader;
+mod parquet_row_reader;
+
 use crate::enumerator::*;
 use crate::reader::*;
 use crate::ruby_integration::*;
data/ext/parquet/src/parquet_column_reader.rs ADDED
@@ -0,0 +1,238 @@
+use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
+    ParquetValueVec, ParserResultType, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
+use parquet::errors::ParquetError;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+use thiserror::Error;
+
+#[inline]
+pub fn parse_parquet_columns<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
+
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
+    }
+
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
+                    })
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
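
The heart of the new column reader is the builder flow above: open a source, optionally mask it down to named columns, cap the batch size, then iterate Arrow record batches. Below is a minimal standalone sketch of that same flow against a plain file, assuming the parquet crate's arrow reader APIs as used in the diff; the "data.parquet" path and the "id" column name are placeholders, not part of the gem.

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?; // assumed sample file
    let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    println!("total rows: {}", builder.metadata().file_metadata().num_rows());

    // Same calls the new reader makes: mask the reader down to named
    // columns, then cap the batch size before building.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["id"]);
    builder = builder.with_projection(mask).with_batch_size(1024);

    for batch in builder.build()? {
        println!("rows in batch: {}", batch?.num_rows());
    }
    Ok(())
}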
data/ext/parquet/src/parquet_row_reader.rs ADDED
@@ -0,0 +1,152 @@
+use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+#[inline]
+pub fn parse_parquet_rows<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetRowsArgs {
+        to_read,
+        result_type,
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+        });
+    }
+
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    };
+
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
+                    })
+                })
+                .map(RowRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
+                    row.get_column_iter()
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    vec
+                })
+            })
+            .map(RowRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
+    }
+}
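
The row reader prunes the file schema with create_projection_schema before iterating. A small sketch of the same pruning, re-declaring the helper (it is private to the module above) and applying it to a schema parsed from a message-type string; the example schema and column name are assumptions.

use parquet::schema::parser::parse_message_type;
use parquet::schema::types::{Type as SchemaType, TypePtr};

// Mirror of create_projection_schema from the new row reader.
fn project(schema: &SchemaType, columns: &[String]) -> SchemaType {
    if let SchemaType::GroupType { fields, .. } = schema {
        // Keep only the requested top-level fields, preserving group info.
        let projected_fields: Vec<TypePtr> = fields
            .iter()
            .filter(|field| columns.contains(&field.name().to_string()))
            .cloned()
            .collect();

        SchemaType::GroupType {
            basic_info: schema.get_basic_info().clone(),
            fields: projected_fields,
        }
    } else {
        schema.clone()
    }
}

fn main() -> Result<(), parquet::errors::ParquetError> {
    let schema = parse_message_type(
        "message schema { required int64 id; optional binary name (UTF8); }",
    )?;
    let projected = project(&schema, &["id".to_string()]);
    // Only the `id` field survives the projection.
    assert_eq!(projected.get_fields().len(), 1);
    Ok(())
}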
data/ext/parquet/src/reader.rs CHANGED
@@ -1,367 +1,11 @@
-// =============================================================================
-// Imports and Dependencies
-// =============================================================================
-use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
-use crate::{
-    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
-    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
-    SeekableRubyValue,
-};
-use ahash::RandomState;
-use magnus::rb_sys::AsRawValue;
-use magnus::value::{Opaque, ReprValue};
-use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
-use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::arrow::ProjectionMask;
-use parquet::errors::ParquetError;
-use parquet::file::reader::FileReader;
-use parquet::file::reader::SerializedFileReader;
-use parquet::record::reader::RowIter as ParquetRowIter;
-use parquet::schema::types::{Type as SchemaType, TypePtr};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{self};
-use std::mem::ManuallyDrop;
-use std::os::fd::FromRawFd;
-use std::sync::OnceLock;
-use thiserror::Error;
-
-#[inline]
-pub fn parse_parquet_rows<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetRowsArgs {
-        to_read,
-        result_type,
-        columns,
-    } = parse_parquet_rows_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_row_enumerator(RowEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-        });
-    }
-
-    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).unwrap();
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-        let reader = SerializedFileReader::new(readable).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    };
-
-    if let Some(cols) = columns {
-        let projection = create_projection_schema(&schema, &cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter
-                .filter_map(move |row| {
-                    row.ok().map(|row| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let column_count = row.get_column_iter().count();
-
-                            let mut header_string = Vec::with_capacity(column_count);
-                            for (k, _) in row.get_column_iter() {
-                                header_string.push(k.to_owned());
-                            }
-
-                            let headers = StringCache::intern_many(&header_string).unwrap();
-
-                            headers
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                            map.insert(headers[i], ParquetField(v.clone()));
-                        });
-                        map
-                    })
-                })
-                .map(RowRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            iter.filter_map(|row| {
-                row.ok().map(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
-                    vec
-                })
-            })
-            .map(RowRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-#[inline]
-pub fn parse_parquet_columns<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetColumnsArgs {
-        to_read,
-        result_type,
-        columns,
-        batch_size,
-    } = parse_parquet_columns_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_column_enumerator(ColumnEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-            batch_size,
-        });
-    }
-
-    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
+use std::io;
 
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    };
-
-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        for field in schema.fields() {
-            map.insert(
-                StringCache::intern(field.name().to_string()).unwrap(),
-                vec![],
-            );
-        }
-        let column_record = vec![ColumnRecord::Map(map)];
-        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
-    }
-
-    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader
-                .filter_map(move |batch| {
-                    batch.ok().map(|batch| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let schema = batch.schema();
-                            let fields = schema.fields();
-                            let mut header_string = Vec::with_capacity(fields.len());
-                            for field in fields {
-                                header_string.push(field.name().to_owned());
-                            }
-                            StringCache::intern_many(&header_string).unwrap()
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-
-                        batch.columns().iter().enumerate().for_each(|(i, column)| {
-                            let header = headers[i];
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            map.insert(header, values.into_inner());
-                        });
-
-                        map
-                    })
-                })
-                .map(ColumnRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            batch_reader
-                .filter_map(|batch| {
-                    batch.ok().map(|batch| {
-                        batch
-                            .columns()
-                            .into_iter()
-                            .map(|column| {
-                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                                values.into_inner()
-                            })
-                            .collect()
-                    })
-                })
-                .map(ColumnRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-    if let SchemaType::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<TypePtr> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
+use magnus::{Error as MagnusError, Ruby};
+use thiserror::Error;
 
-        SchemaType::GroupType {
-            basic_info: schema.get_basic_info().clone(),
-            fields: projected_fields,
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
+use crate::header_cache::CacheError;
+pub use crate::parquet_column_reader::parse_parquet_columns;
+pub use crate::parquet_row_reader::parse_parquet_rows;
 
 #[derive(Error, Debug)]
 pub enum ReaderError {
@@ -376,7 +20,7 @@ pub enum ReaderError {
     #[error("Ruby error: {0}")]
     Ruby(String),
     #[error("Parquet error: {0}")]
-    Parquet(#[from] ParquetError),
+    Parquet(#[from] parquet::errors::ParquetError),
 }
 
 impl From<MagnusError> for ReaderError {
data/ext/parquet/src/ruby_reader.rs CHANGED
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
     offset: usize,
 }
 
-pub trait SeekableRead: std::io::Read + Seek {}
-impl SeekableRead for RubyReader<Value> {}
-impl SeekableRead for RubyReader<RString> {}
+pub trait SeekableRead: Read + Seek {}
+impl<T: Read + Seek> SeekableRead for T {}
 
 pub fn build_ruby_reader(
     ruby: &Ruby,
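
The blanket impl above replaces the two hand-written impls: any type that is Read + Seek now satisfies SeekableRead automatically. A self-contained sketch of the pattern with std::io::Cursor standing in for the Ruby-backed readers (trait name copied from the diff, everything else illustrative):

use std::io::{Cursor, Read, Seek, SeekFrom};

pub trait SeekableRead: Read + Seek {}
impl<T: Read + Seek> SeekableRead for T {}

fn read_magic(reader: &mut impl SeekableRead) -> std::io::Result<[u8; 4]> {
    // Parquet readers seek around the file; here we just seek to the start
    // and read four bytes to show that Cursor qualifies automatically.
    reader.seek(SeekFrom::Start(0))?;
    let mut buf = [0u8; 4];
    reader.read_exact(&mut buf)?;
    Ok(buf)
}

fn main() -> std::io::Result<()> {
    // Cursor<Vec<u8>> is Read + Seek, so no per-type impl is needed,
    // unlike the old `impl SeekableRead for RubyReader<...>` lines.
    let mut cursor = Cursor::new(b"PAR1....".to_vec());
    assert_eq!(&read_magic(&mut cursor)?, b"PAR1");
    Ok(())
}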
data/ext/parquet/src/types.rs CHANGED
@@ -15,6 +15,47 @@ use parquet::record::Field;
 
 use crate::header_cache::StringCacheKey;
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum RowRecord<S: BuildHasher + Default> {
     Vec(Vec<ParquetField>),
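
The Display and TryFrom impls above are deliberate inverses: the enumerator serializes a ParserResultType back to a Ruby symbol name via to_string, and the argument parser accepts the same strings via try_into. A test-style round-trip sketch, assuming it sits in the same module as the new enum:

#[cfg(test)]
mod parser_result_type_round_trip {
    use super::ParserResultType;

    #[test]
    fn display_and_try_from_round_trip() {
        for t in ParserResultType::iter() {
            // "hash" <-> Hash, "array" <-> Array; to_string comes from Display.
            assert_eq!(ParserResultType::try_from(t.to_string()).unwrap(), t);
        }
        // Anything else is rejected with a descriptive message.
        assert!(ParserResultType::try_from("tuple").is_err());
    }
}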
data/ext/parquet/src/utils.rs CHANGED
@@ -4,6 +4,8 @@ use magnus::{
     Error, RString, Ruby, Symbol, Value,
 };
 
+use crate::ParserResultType;
+
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
     if value.is_nil() {
         Ok(None)
@@ -28,7 +30,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
 #[derive(Debug)]
 pub struct ParquetRowsArgs {
     pub to_read: Value,
-    pub result_type: String,
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }
 
@@ -43,28 +45,31 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         &["result_type", "columns"],
     )?;
 
-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
         .optional
        .0
        .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) => match parsed.as_str() {
-            "hash" | "array" => parsed,
-            _ => {
-                return Err(Error::new(
-                    magnus::exception::runtime_error(),
-                    "result_type must be either 'hash' or 'array'",
-                ))
-            }
-        },
-        Some(Ok(None)) => String::from("hash"),
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
             return Err(Error::new(
                 magnus::exception::type_error(),
                 "result_type must be a String or Symbol",
             ))
         }
-        None => String::from("hash"),
+        None => ParserResultType::Hash,
     };
 
     Ok(ParquetRowsArgs {
@@ -77,7 +82,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
 #[derive(Debug)]
 pub struct ParquetColumnsArgs {
     pub to_read: Value,
-    pub result_type: String,
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }
@@ -96,28 +101,31 @@ pub fn parse_parquet_columns_args(
         &["result_type", "columns", "batch_size"],
     )?;
 
-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
        .optional
        .0
        .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) => match parsed.as_str() {
-            "hash" | "array" => parsed,
-            _ => {
-                return Err(Error::new(
-                    magnus::exception::runtime_error(),
-                    "result_type must be either 'hash' or 'array'",
-                ))
-            }
-        },
-        Some(Ok(None)) => String::from("hash"),
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
            return Err(Error::new(
                magnus::exception::type_error(),
                "result_type must be a String or Symbol",
            ))
        }
-        None => String::from("hash"),
+        None => ParserResultType::Hash,
     };
 
     Ok(ParquetColumnsArgs {
data/lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-05 00:00:00.000000000 Z
+date: 2025-01-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -60,6 +60,8 @@ files:
 - ext/parquet/src/enumerator.rs
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
+- ext/parquet/src/parquet_column_reader.rs
+- ext/parquet/src/parquet_row_reader.rs
 - ext/parquet/src/reader.rs
 - ext/parquet/src/ruby_integration.rs
 - ext/parquet/src/ruby_reader.rs