parquet 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
4
- data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
3
+ metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
4
+ data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
5
5
  SHA512:
6
- metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
7
- data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
6
+ metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
7
+ data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
data/Cargo.lock CHANGED
@@ -826,16 +826,6 @@ dependencies = [
826
826
  "wasm-bindgen",
827
827
  ]
828
828
 
829
- [[package]]
830
- name = "kanal"
831
- version = "0.1.0-pre8"
832
- source = "registry+https://github.com/rust-lang/crates.io-index"
833
- checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
834
- dependencies = [
835
- "futures-core",
836
- "lock_api",
837
- ]
838
-
839
829
  [[package]]
840
830
  name = "lazy_static"
841
831
  version = "1.5.0"
@@ -975,18 +965,6 @@ dependencies = [
975
965
  "twox-hash",
976
966
  ]
977
967
 
978
- [[package]]
979
- name = "magnus"
980
- version = "0.6.4"
981
- source = "registry+https://github.com/rust-lang/crates.io-index"
982
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
983
- dependencies = [
984
- "magnus-macros",
985
- "rb-sys",
986
- "rb-sys-env",
987
- "seq-macro",
988
- ]
989
-
990
968
  [[package]]
991
969
  name = "magnus"
992
970
  version = "0.7.1"
@@ -1203,13 +1181,10 @@ dependencies = [
1203
1181
  "itertools 0.14.0",
1204
1182
  "jemallocator",
1205
1183
  "jiff",
1206
- "kanal",
1207
- "magnus 0.7.1",
1184
+ "magnus",
1208
1185
  "mimalloc",
1209
1186
  "parquet 54.0.0",
1210
1187
  "rb-sys",
1211
- "serde",
1212
- "serde_magnus",
1213
1188
  "thiserror",
1214
1189
  ]
1215
1190
 
@@ -1467,17 +1442,6 @@ dependencies = [
1467
1442
  "serde",
1468
1443
  ]
1469
1444
 
1470
- [[package]]
1471
- name = "serde_magnus"
1472
- version = "0.8.1"
1473
- source = "registry+https://github.com/rust-lang/crates.io-index"
1474
- checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
1475
- dependencies = [
1476
- "magnus 0.6.4",
1477
- "serde",
1478
- "tap",
1479
- ]
1480
-
1481
1445
  [[package]]
1482
1446
  name = "shell-words"
1483
1447
  version = "1.1.0"
@@ -1566,12 +1530,6 @@ dependencies = [
1566
1530
  "syn",
1567
1531
  ]
1568
1532
 
1569
- [[package]]
1570
- name = "tap"
1571
- version = "1.0.1"
1572
- source = "registry+https://github.com/rust-lang/crates.io-index"
1573
- checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
1574
-
1575
1533
  [[package]]
1576
1534
  name = "thiserror"
1577
1535
  version = "2.0.9"
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ gemspec
8
8
 
9
9
  group :development do
10
10
  gem "benchmark-ips", "~> 2.12"
11
- # gem "polars-df"
11
+ gem "polars-df"
12
12
  gem "duckdb"
13
13
  end
14
14
 
@@ -8,19 +8,15 @@ crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
10
  ahash = "0.8"
11
- parquet = { version = "^54.0", features = ["json", "object_store"] }
12
- arrow-schema = "54.0.0"
13
11
  arrow-array = "54.0.0"
12
+ arrow-schema = "54.0.0"
14
13
  bytes = "^1.9"
15
- kanal = "0.1.0-pre8"
14
+ itertools = "^0.14"
15
+ jiff = "0.1.19"
16
16
  magnus = { version = "0.7", features = ["rb-sys"] }
17
+ parquet = { version = "^54.0", features = ["json", "object_store"] }
17
18
  rb-sys = "^0.9"
18
- serde = { version = "1.0", features = ["derive"] }
19
- serde_magnus = "0.8.1"
20
19
  thiserror = "2.0"
21
- itertools = "^0.14"
22
- jiff = "0.1.19"
23
-
24
20
 
25
21
  [target.'cfg(target_os = "linux")'.dependencies]
26
22
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -3,12 +3,12 @@ use magnus::{
3
3
  block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
4
  };
5
5
 
6
- use crate::{ColumnRecord, RowRecord};
6
+ use crate::{ColumnRecord, ParserResultType, RowRecord};
7
7
 
8
8
  pub struct RowEnumeratorArgs {
9
9
  pub rb_self: Value,
10
10
  pub to_read: Value,
11
- pub result_type: String,
11
+ pub result_type: ParserResultType,
12
12
  pub columns: Option<Vec<String>>,
13
13
  }
14
14
 
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
17
17
  args: RowEnumeratorArgs,
18
18
  ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
19
19
  let kwargs = RHash::new();
20
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
20
+ kwargs.aset(
21
+ Symbol::new("result_type"),
22
+ Symbol::new(args.result_type.to_string()),
23
+ )?;
21
24
  if let Some(columns) = args.columns {
22
25
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
23
26
  }
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
30
33
  pub struct ColumnEnumeratorArgs {
31
34
  pub rb_self: Value,
32
35
  pub to_read: Value,
33
- pub result_type: String,
36
+ pub result_type: ParserResultType,
34
37
  pub columns: Option<Vec<String>>,
35
38
  pub batch_size: Option<usize>,
36
39
  }
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
40
43
  args: ColumnEnumeratorArgs,
41
44
  ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
42
45
  let kwargs = RHash::new();
43
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
46
+ kwargs.aset(
47
+ Symbol::new("result_type"),
48
+ Symbol::new(args.result_type.to_string()),
49
+ )?;
44
50
  if let Some(columns) = args.columns {
45
51
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
46
52
  }
@@ -7,6 +7,9 @@ mod ruby_reader;
7
7
  mod types;
8
8
  mod utils;
9
9
 
10
+ mod parquet_column_reader;
11
+ mod parquet_row_reader;
12
+
10
13
  use crate::enumerator::*;
11
14
  use crate::reader::*;
12
15
  use crate::ruby_integration::*;
@@ -0,0 +1,238 @@
1
+ use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
2
+ use crate::{
3
+ create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
4
+ ParquetValueVec, ParserResultType, SeekableRubyValue,
5
+ };
6
+ use ahash::RandomState;
7
+ use magnus::rb_sys::AsRawValue;
8
+ use magnus::value::{Opaque, ReprValue};
9
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
10
+ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
11
+ use parquet::arrow::ProjectionMask;
12
+ use parquet::errors::ParquetError;
13
+ use std::collections::HashMap;
14
+ use std::fs::File;
15
+ use std::io;
16
+ use std::mem::ManuallyDrop;
17
+ use std::os::fd::FromRawFd;
18
+ use std::sync::OnceLock;
19
+ use thiserror::Error;
20
+
21
+ #[inline]
22
+ pub fn parse_parquet_columns<'a>(
23
+ rb_self: Value,
24
+ args: &[Value],
25
+ ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
26
+ let ruby = unsafe { Ruby::get_unchecked() };
27
+
28
+ let ParquetColumnsArgs {
29
+ to_read,
30
+ result_type,
31
+ columns,
32
+ batch_size,
33
+ } = parse_parquet_columns_args(&ruby, args)?;
34
+
35
+ if !ruby.block_given() {
36
+ return create_column_enumerator(ColumnEnumeratorArgs {
37
+ rb_self,
38
+ to_read,
39
+ result_type,
40
+ columns,
41
+ batch_size,
42
+ });
43
+ }
44
+
45
+ let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
46
+ let path_string = to_read.to_r_string()?;
47
+ let file_path = unsafe { path_string.as_str()? };
48
+ let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
49
+
50
+ let mut builder =
51
+ ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
52
+ let schema = builder.schema().clone();
53
+ let num_rows = builder.metadata().file_metadata().num_rows();
54
+
55
+ // If columns are specified, project only those columns
56
+ if let Some(cols) = &columns {
57
+ // Get the parquet schema
58
+ let parquet_schema = builder.parquet_schema();
59
+
60
+ // Create a projection mask from column names
61
+ let projection =
62
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
63
+
64
+ builder = builder.with_projection(projection);
65
+ }
66
+
67
+ if let Some(batch_size) = batch_size {
68
+ builder = builder.with_batch_size(batch_size);
69
+ }
70
+
71
+ let reader = builder.build().unwrap();
72
+
73
+ (reader, schema, num_rows)
74
+ } else if to_read.is_kind_of(ruby.class_io()) {
75
+ let raw_value = to_read.as_raw();
76
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
77
+ .map_err(|_| {
78
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
79
+ })?;
80
+
81
+ if fd < 0 {
82
+ return Err(ReaderError::InvalidFileDescriptor.into());
83
+ }
84
+
85
+ let file = unsafe { File::from_raw_fd(fd) };
86
+ let file = ForgottenFileHandle(ManuallyDrop::new(file));
87
+
88
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
89
+ let schema = builder.schema().clone();
90
+ let num_rows = builder.metadata().file_metadata().num_rows();
91
+
92
+ if let Some(batch_size) = batch_size {
93
+ builder = builder.with_batch_size(batch_size);
94
+ }
95
+
96
+ // If columns are specified, project only those columns
97
+ if let Some(cols) = &columns {
98
+ // Get the parquet schema
99
+ let parquet_schema = builder.parquet_schema();
100
+
101
+ // Create a projection mask from column names
102
+ let projection =
103
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
104
+
105
+ builder = builder.with_projection(projection);
106
+ }
107
+
108
+ let reader = builder.build().unwrap();
109
+
110
+ (reader, schema, num_rows)
111
+ } else {
112
+ let readable = SeekableRubyValue(Opaque::from(to_read));
113
+
114
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
115
+ let schema = builder.schema().clone();
116
+ let num_rows = builder.metadata().file_metadata().num_rows();
117
+
118
+ if let Some(batch_size) = batch_size {
119
+ builder = builder.with_batch_size(batch_size);
120
+ }
121
+
122
+ // If columns are specified, project only those columns
123
+ if let Some(cols) = &columns {
124
+ // Get the parquet schema
125
+ let parquet_schema = builder.parquet_schema();
126
+
127
+ // Create a projection mask from column names
128
+ let projection =
129
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
130
+
131
+ builder = builder.with_projection(projection);
132
+ }
133
+
134
+ let reader = builder.build().unwrap();
135
+
136
+ (reader, schema, num_rows)
137
+ };
138
+
139
+ if num_rows == 0 {
140
+ let mut map =
141
+ HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
142
+ for field in schema.fields() {
143
+ map.insert(
144
+ StringCache::intern(field.name().to_string()).unwrap(),
145
+ vec![],
146
+ );
147
+ }
148
+ let column_record = vec![ColumnRecord::Map(map)];
149
+ return Ok(Yield::Iter(Box::new(column_record.into_iter())));
150
+ }
151
+
152
+ let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
153
+ ParserResultType::Hash => {
154
+ let headers = OnceLock::new();
155
+ let headers_clone = headers.clone();
156
+ let iter = batch_reader
157
+ .filter_map(move |batch| {
158
+ batch.ok().map(|batch| {
159
+ let headers = headers_clone.get_or_init(|| {
160
+ let schema = batch.schema();
161
+ let fields = schema.fields();
162
+ let mut header_string = Vec::with_capacity(fields.len());
163
+ for field in fields {
164
+ header_string.push(field.name().to_owned());
165
+ }
166
+ StringCache::intern_many(&header_string).unwrap()
167
+ });
168
+
169
+ let mut map =
170
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
171
+
172
+ batch.columns().iter().enumerate().for_each(|(i, column)| {
173
+ let header = headers[i];
174
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
175
+ map.insert(header, values.into_inner());
176
+ });
177
+
178
+ map
179
+ })
180
+ })
181
+ .map(ColumnRecord::Map);
182
+
183
+ Box::new(HeaderCacheCleanupIter {
184
+ inner: iter,
185
+ headers,
186
+ })
187
+ }
188
+ ParserResultType::Array => Box::new(
189
+ batch_reader
190
+ .filter_map(|batch| {
191
+ batch.ok().map(|batch| {
192
+ batch
193
+ .columns()
194
+ .into_iter()
195
+ .map(|column| {
196
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
197
+ values.into_inner()
198
+ })
199
+ .collect()
200
+ })
201
+ })
202
+ .map(ColumnRecord::Vec),
203
+ ),
204
+ };
205
+
206
+ Ok(Yield::Iter(iter))
207
+ }
208
+
209
+ #[derive(Error, Debug)]
210
+ pub enum ReaderError {
211
+ #[error("Failed to get file descriptor: {0}")]
212
+ FileDescriptor(String),
213
+ #[error("Invalid file descriptor")]
214
+ InvalidFileDescriptor,
215
+ #[error("Failed to open file: {0}")]
216
+ FileOpen(#[from] io::Error),
217
+ #[error("Failed to intern headers: {0}")]
218
+ HeaderIntern(#[from] CacheError),
219
+ #[error("Ruby error: {0}")]
220
+ Ruby(String),
221
+ #[error("Parquet error: {0}")]
222
+ Parquet(#[from] ParquetError),
223
+ }
224
+
225
+ impl From<MagnusError> for ReaderError {
226
+ fn from(err: MagnusError) -> Self {
227
+ Self::Ruby(err.to_string())
228
+ }
229
+ }
230
+
231
+ impl From<ReaderError> for MagnusError {
232
+ fn from(err: ReaderError) -> Self {
233
+ MagnusError::new(
234
+ Ruby::get().unwrap().exception_runtime_error(),
235
+ err.to_string(),
236
+ )
237
+ }
238
+ }
@@ -0,0 +1,152 @@
1
+ use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
2
+ use crate::{
3
+ create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
4
+ ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
5
+ };
6
+ use ahash::RandomState;
7
+ use magnus::rb_sys::AsRawValue;
8
+ use magnus::value::{Opaque, ReprValue};
9
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
10
+ use parquet::file::reader::{FileReader, SerializedFileReader};
11
+ use parquet::record::reader::RowIter as ParquetRowIter;
12
+ use parquet::schema::types::{Type as SchemaType, TypePtr};
13
+ use std::collections::HashMap;
14
+ use std::fs::File;
15
+ use std::mem::ManuallyDrop;
16
+ use std::os::fd::FromRawFd;
17
+ use std::sync::OnceLock;
18
+
19
+ #[inline]
20
+ pub fn parse_parquet_rows<'a>(
21
+ rb_self: Value,
22
+ args: &[Value],
23
+ ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
24
+ let ruby = unsafe { Ruby::get_unchecked() };
25
+
26
+ let ParquetRowsArgs {
27
+ to_read,
28
+ result_type,
29
+ columns,
30
+ } = parse_parquet_rows_args(&ruby, args)?;
31
+
32
+ if !ruby.block_given() {
33
+ return create_row_enumerator(RowEnumeratorArgs {
34
+ rb_self,
35
+ to_read,
36
+ result_type,
37
+ columns,
38
+ });
39
+ }
40
+
41
+ let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
42
+ let path_string = to_read.to_r_string()?;
43
+ let file_path = unsafe { path_string.as_str()? };
44
+ let file = File::open(file_path).unwrap();
45
+ let reader = SerializedFileReader::new(file).unwrap();
46
+ let schema = reader.metadata().file_metadata().schema().clone();
47
+
48
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
49
+ } else if to_read.is_kind_of(ruby.class_io()) {
50
+ let raw_value = to_read.as_raw();
51
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
52
+ .map_err(|_| {
53
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
54
+ })?;
55
+
56
+ if fd < 0 {
57
+ return Err(ReaderError::InvalidFileDescriptor.into());
58
+ }
59
+
60
+ let file = unsafe { File::from_raw_fd(fd) };
61
+ let file = ForgottenFileHandle(ManuallyDrop::new(file));
62
+ let reader = SerializedFileReader::new(file).unwrap();
63
+ let schema = reader.metadata().file_metadata().schema().clone();
64
+
65
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
66
+ } else {
67
+ let readable = SeekableRubyValue(Opaque::from(to_read));
68
+ let reader = SerializedFileReader::new(readable).unwrap();
69
+ let schema = reader.metadata().file_metadata().schema().clone();
70
+
71
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
72
+ };
73
+
74
+ if let Some(cols) = columns {
75
+ let projection = create_projection_schema(&schema, &cols);
76
+ iter = iter.project(Some(projection.to_owned())).map_err(|e| {
77
+ MagnusError::new(
78
+ ruby.exception_runtime_error(),
79
+ format!("Failed to create projection: {}", e),
80
+ )
81
+ })?;
82
+ }
83
+
84
+ let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
85
+ ParserResultType::Hash => {
86
+ let headers = OnceLock::new();
87
+ let headers_clone = headers.clone();
88
+ let iter = iter
89
+ .filter_map(move |row| {
90
+ row.ok().map(|row| {
91
+ let headers = headers_clone.get_or_init(|| {
92
+ let column_count = row.get_column_iter().count();
93
+
94
+ let mut header_string = Vec::with_capacity(column_count);
95
+ for (k, _) in row.get_column_iter() {
96
+ header_string.push(k.to_owned());
97
+ }
98
+
99
+ let headers = StringCache::intern_many(&header_string).unwrap();
100
+
101
+ headers
102
+ });
103
+
104
+ let mut map =
105
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
106
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
107
+ map.insert(headers[i], ParquetField(v.clone()));
108
+ });
109
+ map
110
+ })
111
+ })
112
+ .map(RowRecord::Map);
113
+
114
+ Box::new(HeaderCacheCleanupIter {
115
+ inner: iter,
116
+ headers,
117
+ })
118
+ }
119
+ ParserResultType::Array => Box::new(
120
+ iter.filter_map(|row| {
121
+ row.ok().map(|row| {
122
+ let column_count = row.get_column_iter().count();
123
+ let mut vec = Vec::with_capacity(column_count);
124
+ row.get_column_iter()
125
+ .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
126
+ vec
127
+ })
128
+ })
129
+ .map(RowRecord::Vec),
130
+ ),
131
+ };
132
+
133
+ Ok(Yield::Iter(iter))
134
+ }
135
+
136
+ fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
137
+ if let SchemaType::GroupType { fields, .. } = schema {
138
+ let projected_fields: Vec<TypePtr> = fields
139
+ .iter()
140
+ .filter(|field| columns.contains(&field.name().to_string()))
141
+ .cloned()
142
+ .collect();
143
+
144
+ SchemaType::GroupType {
145
+ basic_info: schema.get_basic_info().clone(),
146
+ fields: projected_fields,
147
+ }
148
+ } else {
149
+ // Return original schema if not a group type
150
+ schema.clone()
151
+ }
152
+ }
@@ -1,367 +1,11 @@
1
- // =============================================================================
2
- // Imports and Dependencies
3
- // =============================================================================
4
- use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
- use crate::{
6
- create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
7
- ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
8
- SeekableRubyValue,
9
- };
10
- use ahash::RandomState;
11
- use magnus::rb_sys::AsRawValue;
12
- use magnus::value::{Opaque, ReprValue};
13
- use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
14
- use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
15
- use parquet::arrow::ProjectionMask;
16
- use parquet::errors::ParquetError;
17
- use parquet::file::reader::FileReader;
18
- use parquet::file::reader::SerializedFileReader;
19
- use parquet::record::reader::RowIter as ParquetRowIter;
20
- use parquet::schema::types::{Type as SchemaType, TypePtr};
21
- use std::collections::HashMap;
22
- use std::fs::File;
23
- use std::io::{self};
24
- use std::mem::ManuallyDrop;
25
- use std::os::fd::FromRawFd;
26
- use std::sync::OnceLock;
27
- use thiserror::Error;
28
-
29
- #[inline]
30
- pub fn parse_parquet_rows<'a>(
31
- rb_self: Value,
32
- args: &[Value],
33
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
34
- let original = unsafe { Ruby::get_unchecked() };
35
- let ruby: &'static Ruby = Box::leak(Box::new(original));
36
-
37
- let ParquetRowsArgs {
38
- to_read,
39
- result_type,
40
- columns,
41
- } = parse_parquet_rows_args(&ruby, args)?;
42
-
43
- if !ruby.block_given() {
44
- return create_row_enumerator(RowEnumeratorArgs {
45
- rb_self,
46
- to_read,
47
- result_type,
48
- columns,
49
- });
50
- }
51
-
52
- let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
53
- let path_string = to_read.to_r_string()?;
54
- let file_path = unsafe { path_string.as_str()? };
55
- let file = File::open(file_path).unwrap();
56
- let reader = SerializedFileReader::new(file).unwrap();
57
- let schema = reader.metadata().file_metadata().schema().clone();
58
-
59
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
60
- } else if to_read.is_kind_of(ruby.class_io()) {
61
- let raw_value = to_read.as_raw();
62
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
63
- .map_err(|_| {
64
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
65
- })?;
66
-
67
- if fd < 0 {
68
- return Err(ReaderError::InvalidFileDescriptor.into());
69
- }
70
-
71
- let file = unsafe { File::from_raw_fd(fd) };
72
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
73
- let reader = SerializedFileReader::new(file).unwrap();
74
- let schema = reader.metadata().file_metadata().schema().clone();
75
-
76
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
77
- } else {
78
- let readable = SeekableRubyValue(Opaque::from(to_read));
79
- let reader = SerializedFileReader::new(readable).unwrap();
80
- let schema = reader.metadata().file_metadata().schema().clone();
81
-
82
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
83
- };
84
-
85
- if let Some(cols) = columns {
86
- let projection = create_projection_schema(&schema, &cols);
87
- iter = iter.project(Some(projection.to_owned())).map_err(|e| {
88
- MagnusError::new(
89
- ruby.exception_runtime_error(),
90
- format!("Failed to create projection: {}", e),
91
- )
92
- })?;
93
- }
94
-
95
- let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
96
- "hash" => {
97
- let headers = OnceLock::new();
98
- let headers_clone = headers.clone();
99
- let iter = iter
100
- .filter_map(move |row| {
101
- row.ok().map(|row| {
102
- let headers = headers_clone.get_or_init(|| {
103
- let column_count = row.get_column_iter().count();
104
-
105
- let mut header_string = Vec::with_capacity(column_count);
106
- for (k, _) in row.get_column_iter() {
107
- header_string.push(k.to_owned());
108
- }
109
-
110
- let headers = StringCache::intern_many(&header_string).unwrap();
111
-
112
- headers
113
- });
114
-
115
- let mut map =
116
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
117
- row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
118
- map.insert(headers[i], ParquetField(v.clone()));
119
- });
120
- map
121
- })
122
- })
123
- .map(RowRecord::Map);
124
-
125
- Box::new(HeaderCacheCleanupIter {
126
- inner: iter,
127
- headers,
128
- })
129
- }
130
- "array" => Box::new(
131
- iter.filter_map(|row| {
132
- row.ok().map(|row| {
133
- let column_count = row.get_column_iter().count();
134
- let mut vec = Vec::with_capacity(column_count);
135
- row.get_column_iter()
136
- .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
137
- vec
138
- })
139
- })
140
- .map(RowRecord::Vec),
141
- ),
142
- _ => {
143
- return Err(MagnusError::new(
144
- ruby.exception_runtime_error(),
145
- "Invalid result type",
146
- ))
147
- }
148
- };
149
-
150
- Ok(Yield::Iter(iter))
151
- }
152
-
153
- #[inline]
154
- pub fn parse_parquet_columns<'a>(
155
- rb_self: Value,
156
- args: &[Value],
157
- ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
158
- let original = unsafe { Ruby::get_unchecked() };
159
- let ruby: &'static Ruby = Box::leak(Box::new(original));
160
-
161
- let ParquetColumnsArgs {
162
- to_read,
163
- result_type,
164
- columns,
165
- batch_size,
166
- } = parse_parquet_columns_args(&ruby, args)?;
167
-
168
- if !ruby.block_given() {
169
- return create_column_enumerator(ColumnEnumeratorArgs {
170
- rb_self,
171
- to_read,
172
- result_type,
173
- columns,
174
- batch_size,
175
- });
176
- }
177
-
178
- let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
179
- let path_string = to_read.to_r_string()?;
180
- let file_path = unsafe { path_string.as_str()? };
181
- let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
182
-
183
- let mut builder =
184
- ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
185
- let schema = builder.schema().clone();
186
- let num_rows = builder.metadata().file_metadata().num_rows();
187
-
188
- // If columns are specified, project only those columns
189
- if let Some(cols) = &columns {
190
- // Get the parquet schema
191
- let parquet_schema = builder.parquet_schema();
192
-
193
- // Create a projection mask from column names
194
- let projection =
195
- ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
196
-
197
- builder = builder.with_projection(projection);
198
- }
199
-
200
- if let Some(batch_size) = batch_size {
201
- builder = builder.with_batch_size(batch_size);
202
- }
1
+ use std::io;
203
2
 
204
- let reader = builder.build().unwrap();
205
-
206
- (reader, schema, num_rows)
207
- } else if to_read.is_kind_of(ruby.class_io()) {
208
- let raw_value = to_read.as_raw();
209
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
210
- .map_err(|_| {
211
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
212
- })?;
213
-
214
- if fd < 0 {
215
- return Err(ReaderError::InvalidFileDescriptor.into());
216
- }
217
-
218
- let file = unsafe { File::from_raw_fd(fd) };
219
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
220
-
221
- let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
222
- let schema = builder.schema().clone();
223
- let num_rows = builder.metadata().file_metadata().num_rows();
224
-
225
- if let Some(batch_size) = batch_size {
226
- builder = builder.with_batch_size(batch_size);
227
- }
228
-
229
- // If columns are specified, project only those columns
230
- if let Some(cols) = &columns {
231
- // Get the parquet schema
232
- let parquet_schema = builder.parquet_schema();
233
-
234
- // Create a projection mask from column names
235
- let projection =
236
- ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
237
-
238
- builder = builder.with_projection(projection);
239
- }
240
-
241
- let reader = builder.build().unwrap();
242
-
243
- (reader, schema, num_rows)
244
- } else {
245
- let readable = SeekableRubyValue(Opaque::from(to_read));
246
-
247
- let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
248
- let schema = builder.schema().clone();
249
- let num_rows = builder.metadata().file_metadata().num_rows();
250
-
251
- if let Some(batch_size) = batch_size {
252
- builder = builder.with_batch_size(batch_size);
253
- }
254
-
255
- // If columns are specified, project only those columns
256
- if let Some(cols) = &columns {
257
- // Get the parquet schema
258
- let parquet_schema = builder.parquet_schema();
259
-
260
- // Create a projection mask from column names
261
- let projection =
262
- ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
263
-
264
- builder = builder.with_projection(projection);
265
- }
266
-
267
- let reader = builder.build().unwrap();
268
-
269
- (reader, schema, num_rows)
270
- };
271
-
272
- if num_rows == 0 {
273
- let mut map =
274
- HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
275
- for field in schema.fields() {
276
- map.insert(
277
- StringCache::intern(field.name().to_string()).unwrap(),
278
- vec![],
279
- );
280
- }
281
- let column_record = vec![ColumnRecord::Map(map)];
282
- return Ok(Yield::Iter(Box::new(column_record.into_iter())));
283
- }
284
-
285
- let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
286
- "hash" => {
287
- let headers = OnceLock::new();
288
- let headers_clone = headers.clone();
289
- let iter = batch_reader
290
- .filter_map(move |batch| {
291
- batch.ok().map(|batch| {
292
- let headers = headers_clone.get_or_init(|| {
293
- let schema = batch.schema();
294
- let fields = schema.fields();
295
- let mut header_string = Vec::with_capacity(fields.len());
296
- for field in fields {
297
- header_string.push(field.name().to_owned());
298
- }
299
- StringCache::intern_many(&header_string).unwrap()
300
- });
301
-
302
- let mut map =
303
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
304
-
305
- batch.columns().iter().enumerate().for_each(|(i, column)| {
306
- let header = headers[i];
307
- let values = ParquetValueVec::try_from(column.clone()).unwrap();
308
- map.insert(header, values.into_inner());
309
- });
310
-
311
- map
312
- })
313
- })
314
- .map(ColumnRecord::Map);
315
-
316
- Box::new(HeaderCacheCleanupIter {
317
- inner: iter,
318
- headers,
319
- })
320
- }
321
- "array" => Box::new(
322
- batch_reader
323
- .filter_map(|batch| {
324
- batch.ok().map(|batch| {
325
- batch
326
- .columns()
327
- .into_iter()
328
- .map(|column| {
329
- let values = ParquetValueVec::try_from(column.clone()).unwrap();
330
- values.into_inner()
331
- })
332
- .collect()
333
- })
334
- })
335
- .map(ColumnRecord::Vec),
336
- ),
337
- _ => {
338
- return Err(MagnusError::new(
339
- ruby.exception_runtime_error(),
340
- "Invalid result type",
341
- ))
342
- }
343
- };
344
-
345
- Ok(Yield::Iter(iter))
346
- }
347
-
348
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
349
- if let SchemaType::GroupType { fields, .. } = schema {
350
- let projected_fields: Vec<TypePtr> = fields
351
- .iter()
352
- .filter(|field| columns.contains(&field.name().to_string()))
353
- .cloned()
354
- .collect();
3
+ use magnus::{Error as MagnusError, Ruby};
4
+ use thiserror::Error;
355
5
 
356
- SchemaType::GroupType {
357
- basic_info: schema.get_basic_info().clone(),
358
- fields: projected_fields,
359
- }
360
- } else {
361
- // Return original schema if not a group type
362
- schema.clone()
363
- }
364
- }
6
+ use crate::header_cache::CacheError;
7
+ pub use crate::parquet_column_reader::parse_parquet_columns;
8
+ pub use crate::parquet_row_reader::parse_parquet_rows;
365
9
 
366
10
  #[derive(Error, Debug)]
367
11
  pub enum ReaderError {
@@ -376,7 +20,7 @@ pub enum ReaderError {
376
20
  #[error("Ruby error: {0}")]
377
21
  Ruby(String),
378
22
  #[error("Parquet error: {0}")]
379
- Parquet(#[from] ParquetError),
23
+ Parquet(#[from] parquet::errors::ParquetError),
380
24
  }
381
25
 
382
26
  impl From<MagnusError> for ReaderError {
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
14
14
  offset: usize,
15
15
  }
16
16
 
17
- pub trait SeekableRead: std::io::Read + Seek {}
18
- impl SeekableRead for RubyReader<Value> {}
19
- impl SeekableRead for RubyReader<RString> {}
17
+ pub trait SeekableRead: Read + Seek {}
18
+ impl<T: Read + Seek> SeekableRead for T {}
20
19
 
21
20
  pub fn build_ruby_reader(
22
21
  ruby: &Ruby,
@@ -15,6 +15,47 @@ use parquet::record::Field;
15
15
 
16
16
  use crate::header_cache::StringCacheKey;
17
17
 
18
+ #[derive(Copy, Clone, Debug, PartialEq, Eq)]
19
+ pub enum ParserResultType {
20
+ Hash,
21
+ Array,
22
+ }
23
+
24
+ impl ParserResultType {
25
+ pub fn iter() -> impl Iterator<Item = Self> {
26
+ [Self::Hash, Self::Array].into_iter()
27
+ }
28
+ }
29
+
30
+ impl TryFrom<&str> for ParserResultType {
31
+ type Error = String;
32
+
33
+ fn try_from(value: &str) -> Result<Self, Self::Error> {
34
+ match value {
35
+ "hash" => Ok(ParserResultType::Hash),
36
+ "array" => Ok(ParserResultType::Array),
37
+ _ => Err(format!("Invalid parser result type: {}", value)),
38
+ }
39
+ }
40
+ }
41
+
42
+ impl TryFrom<String> for ParserResultType {
43
+ type Error = String;
44
+
45
+ fn try_from(value: String) -> Result<Self, Self::Error> {
46
+ Self::try_from(value.as_str())
47
+ }
48
+ }
49
+
50
+ impl std::fmt::Display for ParserResultType {
51
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
52
+ match self {
53
+ ParserResultType::Hash => write!(f, "hash"),
54
+ ParserResultType::Array => write!(f, "array"),
55
+ }
56
+ }
57
+ }
58
+
18
59
  #[derive(Debug)]
19
60
  pub enum RowRecord<S: BuildHasher + Default> {
20
61
  Vec(Vec<ParquetField>),
@@ -4,6 +4,8 @@ use magnus::{
4
4
  Error, RString, Ruby, Symbol, Value,
5
5
  };
6
6
 
7
+ use crate::ParserResultType;
8
+
7
9
  fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
8
10
  if value.is_nil() {
9
11
  Ok(None)
@@ -28,7 +30,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
28
30
  #[derive(Debug)]
29
31
  pub struct ParquetRowsArgs {
30
32
  pub to_read: Value,
31
- pub result_type: String,
33
+ pub result_type: ParserResultType,
32
34
  pub columns: Option<Vec<String>>,
33
35
  }
34
36
 
@@ -43,28 +45,31 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
43
45
  &["result_type", "columns"],
44
46
  )?;
45
47
 
46
- let result_type = match kwargs
48
+ let result_type: ParserResultType = match kwargs
47
49
  .optional
48
50
  .0
49
51
  .map(|value| parse_string_or_symbol(ruby, value))
50
52
  {
51
- Some(Ok(Some(parsed))) => match parsed.as_str() {
52
- "hash" | "array" => parsed,
53
- _ => {
54
- return Err(Error::new(
55
- magnus::exception::runtime_error(),
56
- "result_type must be either 'hash' or 'array'",
57
- ))
58
- }
59
- },
60
- Some(Ok(None)) => String::from("hash"),
53
+ Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
54
+ Error::new(
55
+ magnus::exception::runtime_error(),
56
+ format!(
57
+ "Invalid result type: {e}. Must be one of {}",
58
+ ParserResultType::iter()
59
+ .map(|v| v.to_string())
60
+ .collect::<Vec<_>>()
61
+ .join(", ")
62
+ ),
63
+ )
64
+ })?,
65
+ Some(Ok(None)) => ParserResultType::Hash,
61
66
  Some(Err(_)) => {
62
67
  return Err(Error::new(
63
68
  magnus::exception::type_error(),
64
69
  "result_type must be a String or Symbol",
65
70
  ))
66
71
  }
67
- None => String::from("hash"),
72
+ None => ParserResultType::Hash,
68
73
  };
69
74
 
70
75
  Ok(ParquetRowsArgs {
@@ -77,7 +82,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
77
82
  #[derive(Debug)]
78
83
  pub struct ParquetColumnsArgs {
79
84
  pub to_read: Value,
80
- pub result_type: String,
85
+ pub result_type: ParserResultType,
81
86
  pub columns: Option<Vec<String>>,
82
87
  pub batch_size: Option<usize>,
83
88
  }
@@ -96,28 +101,31 @@ pub fn parse_parquet_columns_args(
96
101
  &["result_type", "columns", "batch_size"],
97
102
  )?;
98
103
 
99
- let result_type = match kwargs
104
+ let result_type: ParserResultType = match kwargs
100
105
  .optional
101
106
  .0
102
107
  .map(|value| parse_string_or_symbol(ruby, value))
103
108
  {
104
- Some(Ok(Some(parsed))) => match parsed.as_str() {
105
- "hash" | "array" => parsed,
106
- _ => {
107
- return Err(Error::new(
108
- magnus::exception::runtime_error(),
109
- "result_type must be either 'hash' or 'array'",
110
- ))
111
- }
112
- },
113
- Some(Ok(None)) => String::from("hash"),
109
+ Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
110
+ Error::new(
111
+ magnus::exception::runtime_error(),
112
+ format!(
113
+ "Invalid result type: {e}. Must be one of {}",
114
+ ParserResultType::iter()
115
+ .map(|v| v.to_string())
116
+ .collect::<Vec<_>>()
117
+ .join(", ")
118
+ ),
119
+ )
120
+ })?,
121
+ Some(Ok(None)) => ParserResultType::Hash,
114
122
  Some(Err(_)) => {
115
123
  return Err(Error::new(
116
124
  magnus::exception::type_error(),
117
125
  "result_type must be a String or Symbol",
118
126
  ))
119
127
  }
120
- None => String::from("hash"),
128
+ None => ParserResultType::Hash,
121
129
  };
122
130
 
123
131
  Ok(ParquetColumnsArgs {
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-05 00:00:00.000000000 Z
11
+ date: 2025-01-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -60,6 +60,8 @@ files:
60
60
  - ext/parquet/src/enumerator.rs
61
61
  - ext/parquet/src/header_cache.rs
62
62
  - ext/parquet/src/lib.rs
63
+ - ext/parquet/src/parquet_column_reader.rs
64
+ - ext/parquet/src/parquet_row_reader.rs
63
65
  - ext/parquet/src/reader.rs
64
66
  - ext/parquet/src/ruby_integration.rs
65
67
  - ext/parquet/src/ruby_reader.rs