parquet 0.5.4 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 936feb49be7a1bbbb36236551480ae0522d6b52443e76b4ebb7502abdb9d2903
4
- data.tar.gz: bcc56665ec0cd132e22c262373e7b1294e085be364c93efbd214e434ada7dcb6
3
+ metadata.gz: dc1d1eda7d71aa6336fbf6cc94789517439df3fab1852ec7d2e9d265e0c016c4
4
+ data.tar.gz: 6fff5321a31d3fe19a59a4f47add56222dbeb274bef7a068163b48757d65252d
5
5
  SHA512:
6
- metadata.gz: 7856d7f36820a8384faf564f166d39e0daca1c9d15457b6f6aae8ff56f4176a8b1302bfbc2cc5edcfedfcb0805cbe71029f5712e716a29dc4942a1e6453a3e5e
7
- data.tar.gz: '08d1f4cfe357b22bad4c4fab4ddd4fa93069b13c65559d668fb704e2f7d8884fc8f081270e4dc43a5db60aab7147be36bfe7d26945f93c9ad6e9badbd0ad957e'
6
+ metadata.gz: ddd50f82df2b42cf844e379a7f07c0214e9aef925e7c43ec566b6b9f27be311676b6f887c163aa5d41d4523cd1d506266b15623205453bc8e08467c88e7c2b63
7
+ data.tar.gz: afb235ad09338d8c4cd59588dded3d312890c5d5d879b77040fcbf960be69653981fe5176cc591969a80ba54214d4c6a63cff96c36ceda7b9e00c75ba8e9e913
data/Cargo.lock CHANGED
@@ -63,9 +63,8 @@ dependencies = [
63
63
 
64
64
  [[package]]
65
65
  name = "arrow-array"
66
- version = "54.2.0"
67
- source = "registry+https://github.com/rust-lang/crates.io-index"
68
- checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a"
66
+ version = "55.1.0"
67
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
69
68
  dependencies = [
70
69
  "ahash",
71
70
  "arrow-buffer",
@@ -79,9 +78,8 @@ dependencies = [
79
78
 
80
79
  [[package]]
81
80
  name = "arrow-buffer"
82
- version = "54.2.0"
83
- source = "registry+https://github.com/rust-lang/crates.io-index"
84
- checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a"
81
+ version = "55.1.0"
82
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
85
83
  dependencies = [
86
84
  "bytes",
87
85
  "half",
@@ -90,9 +88,8 @@ dependencies = [
90
88
 
91
89
  [[package]]
92
90
  name = "arrow-cast"
93
- version = "54.2.0"
94
- source = "registry+https://github.com/rust-lang/crates.io-index"
95
- checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee"
91
+ version = "55.1.0"
92
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
96
93
  dependencies = [
97
94
  "arrow-array",
98
95
  "arrow-buffer",
@@ -110,9 +107,8 @@ dependencies = [
110
107
 
111
108
  [[package]]
112
109
  name = "arrow-data"
113
- version = "54.2.0"
114
- source = "registry+https://github.com/rust-lang/crates.io-index"
115
- checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83"
110
+ version = "55.1.0"
111
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
116
112
  dependencies = [
117
113
  "arrow-buffer",
118
114
  "arrow-schema",
@@ -122,9 +118,8 @@ dependencies = [
122
118
 
123
119
  [[package]]
124
120
  name = "arrow-ipc"
125
- version = "54.2.0"
126
- source = "registry+https://github.com/rust-lang/crates.io-index"
127
- checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6"
121
+ version = "55.1.0"
122
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
128
123
  dependencies = [
129
124
  "arrow-array",
130
125
  "arrow-buffer",
@@ -135,15 +130,13 @@ dependencies = [
135
130
 
136
131
  [[package]]
137
132
  name = "arrow-schema"
138
- version = "54.2.0"
139
- source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735"
133
+ version = "55.1.0"
134
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
141
135
 
142
136
  [[package]]
143
137
  name = "arrow-select"
144
- version = "54.2.0"
145
- source = "registry+https://github.com/rust-lang/crates.io-index"
146
- checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539"
138
+ version = "55.1.0"
139
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
147
140
  dependencies = [
148
141
  "ahash",
149
142
  "arrow-array",
@@ -180,7 +173,7 @@ version = "0.69.5"
180
173
  source = "registry+https://github.com/rust-lang/crates.io-index"
181
174
  checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
182
175
  dependencies = [
183
- "bitflags 2.8.0",
176
+ "bitflags",
184
177
  "cexpr",
185
178
  "clang-sys",
186
179
  "itertools 0.12.1",
@@ -194,12 +187,6 @@ dependencies = [
194
187
  "syn",
195
188
  ]
196
189
 
197
- [[package]]
198
- name = "bitflags"
199
- version = "1.3.2"
200
- source = "registry+https://github.com/rust-lang/crates.io-index"
201
- checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
202
-
203
190
  [[package]]
204
191
  name = "bitflags"
205
192
  version = "2.8.0"
@@ -208,9 +195,9 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
208
195
 
209
196
  [[package]]
210
197
  name = "brotli"
211
- version = "7.0.0"
198
+ version = "8.0.1"
212
199
  source = "registry+https://github.com/rust-lang/crates.io-index"
213
- checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd"
200
+ checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d"
214
201
  dependencies = [
215
202
  "alloc-no-stdlib",
216
203
  "alloc-stdlib",
@@ -219,9 +206,9 @@ dependencies = [
219
206
 
220
207
  [[package]]
221
208
  name = "brotli-decompressor"
222
- version = "4.0.2"
209
+ version = "5.0.0"
223
210
  source = "registry+https://github.com/rust-lang/crates.io-index"
224
- checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37"
211
+ checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
225
212
  dependencies = [
226
213
  "alloc-no-stdlib",
227
214
  "alloc-stdlib",
@@ -359,11 +346,11 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
359
346
 
360
347
  [[package]]
361
348
  name = "flatbuffers"
362
- version = "24.12.23"
349
+ version = "25.2.10"
363
350
  source = "registry+https://github.com/rust-lang/crates.io-index"
364
- checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096"
351
+ checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
365
352
  dependencies = [
366
- "bitflags 1.3.2",
353
+ "bitflags",
367
354
  "rustc_version",
368
355
  ]
369
356
 
@@ -374,6 +361,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
374
361
  checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
375
362
  dependencies = [
376
363
  "crc32fast",
364
+ "libz-rs-sys",
377
365
  "miniz_oxide",
378
366
  ]
379
367
 
@@ -652,6 +640,15 @@ dependencies = [
652
640
  "libc",
653
641
  ]
654
642
 
643
+ [[package]]
644
+ name = "libz-rs-sys"
645
+ version = "0.4.2"
646
+ source = "registry+https://github.com/rust-lang/crates.io-index"
647
+ checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d"
648
+ dependencies = [
649
+ "zlib-rs",
650
+ ]
651
+
655
652
  [[package]]
656
653
  name = "linux-raw-sys"
657
654
  version = "0.4.15"
@@ -670,7 +667,7 @@ version = "0.11.3"
670
667
  source = "registry+https://github.com/rust-lang/crates.io-index"
671
668
  checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
672
669
  dependencies = [
673
- "twox-hash",
670
+ "twox-hash 1.6.3",
674
671
  ]
675
672
 
676
673
  [[package]]
@@ -840,7 +837,7 @@ dependencies = [
840
837
  "magnus",
841
838
  "mimalloc",
842
839
  "num",
843
- "parquet 54.2.0",
840
+ "parquet 55.1.0",
844
841
  "rand",
845
842
  "rb-sys",
846
843
  "rb-sys-env 0.2.2",
@@ -851,9 +848,8 @@ dependencies = [
851
848
 
852
849
  [[package]]
853
850
  name = "parquet"
854
- version = "54.2.0"
855
- source = "registry+https://github.com/rust-lang/crates.io-index"
856
- checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb"
851
+ version = "55.1.0"
852
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
857
853
  dependencies = [
858
854
  "ahash",
859
855
  "arrow-array",
@@ -879,9 +875,8 @@ dependencies = [
879
875
  "simdutf8",
880
876
  "snap",
881
877
  "thrift",
882
- "twox-hash",
878
+ "twox-hash 2.1.0",
883
879
  "zstd",
884
- "zstd-sys",
885
880
  ]
886
881
 
887
882
  [[package]]
@@ -1055,7 +1050,7 @@ version = "0.38.44"
1055
1050
  source = "registry+https://github.com/rust-lang/crates.io-index"
1056
1051
  checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
1057
1052
  dependencies = [
1058
- "bitflags 2.8.0",
1053
+ "bitflags",
1059
1054
  "errno",
1060
1055
  "libc",
1061
1056
  "linux-raw-sys",
@@ -1223,6 +1218,12 @@ dependencies = [
1223
1218
  "static_assertions",
1224
1219
  ]
1225
1220
 
1221
+ [[package]]
1222
+ name = "twox-hash"
1223
+ version = "2.1.0"
1224
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1225
+ checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
1226
+
1226
1227
  [[package]]
1227
1228
  name = "unicode-ident"
1228
1229
  version = "1.0.17"
@@ -1402,7 +1403,7 @@ version = "0.33.0"
1402
1403
  source = "registry+https://github.com/rust-lang/crates.io-index"
1403
1404
  checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
1404
1405
  dependencies = [
1405
- "bitflags 2.8.0",
1406
+ "bitflags",
1406
1407
  ]
1407
1408
 
1408
1409
  [[package]]
@@ -1446,6 +1447,12 @@ dependencies = [
1446
1447
  "syn",
1447
1448
  ]
1448
1449
 
1450
+ [[package]]
1451
+ name = "zlib-rs"
1452
+ version = "0.4.2"
1453
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1454
+ checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05"
1455
+
1449
1456
  [[package]]
1450
1457
  name = "zstd"
1451
1458
  version = "0.13.3"
@@ -11,14 +11,14 @@ rb-sys-env = "^0.2"
11
11
 
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
- arrow-array = "54.0.0"
15
- arrow-schema = "54.0.0"
14
+ arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
15
+ arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
16
16
  bytes = "^1.9"
17
17
  either = "1.9"
18
18
  itertools = "^0.14"
19
19
  jiff = "0.2"
20
20
  magnus = { version = "0.7", features = ["rb-sys"] }
21
- parquet = { version = "^54.0", features = ["json"] }
21
+ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records", features = ["json"] }
22
22
  rand = "0.9"
23
23
  rb-sys = "^0.9"
24
24
  simdutf8 = "0.1.5"
@@ -1,6 +1,7 @@
1
1
  mod common;
2
2
  mod parquet_column_reader;
3
3
  mod parquet_row_reader;
4
+ mod unified;
4
5
  use std::{fs::File, rc::Rc};
5
6
 
6
7
  use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
207
208
  let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
208
209
 
209
210
  Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
210
- }
211
+ }
@@ -1,21 +1,9 @@
1
- use crate::header_cache::StringCache;
2
- use crate::logger::RubyLogger;
3
- use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
4
- use crate::{
5
- create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
6
- ParserResultType,
7
- };
8
- use ahash::RandomState;
9
- use either::Either;
10
- use magnus::IntoValue;
1
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
2
+ use crate::utils::*;
3
+ use crate::ParquetGemError;
4
+
11
5
  use magnus::{Error as MagnusError, Ruby, Value};
12
- use std::collections::HashMap;
13
6
  use std::rc::Rc;
14
- use std::sync::OnceLock;
15
-
16
- use super::common::{
17
- create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
18
- };
19
7
 
20
8
  #[inline]
21
9
  pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
41
29
  logger,
42
30
  } = parse_parquet_columns_args(&ruby, args)?;
43
31
 
44
- // Initialize the logger if provided
45
- let ruby_logger = RubyLogger::new(&ruby, logger)?;
46
- if let Some(ref bs) = batch_size {
47
- ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
48
- }
49
-
50
- // Clone values for the closure to avoid move issues
51
- let columns_clone = columns.clone();
52
-
53
- // Handle block or create enumerator
54
- if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
55
- create_column_enumerator(ColumnEnumeratorArgs {
56
- rb_self,
32
+ // Use the unified parsing implementation
33
+ parse_parquet_unified(
34
+ ruby,
35
+ rb_self,
36
+ UnifiedParserArgs {
57
37
  to_read,
58
38
  result_type,
59
- columns: columns_clone,
60
- batch_size,
61
- strict,
62
- logger: logger.as_ref().map(|_| to_read),
63
- })
64
- .map(|yield_enum| yield_enum.into_value_with(&ruby))
65
- })? {
66
- return Ok(enum_value);
67
- }
68
-
69
- let source = open_parquet_source(ruby.clone(), to_read)?;
70
-
71
- // Use the common function to create the batch reader
72
-
73
- let (batch_reader, schema, num_rows) = match source {
74
- Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
75
- Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
76
- };
77
-
78
- match result_type {
79
- ParserResultType::Hash => {
80
- // For hash return type, we need to return a hash with column names pointing at empty arrays
81
- if handle_empty_file(&ruby, &schema, num_rows)? {
82
- return Ok(ruby.qnil().into_value_with(&ruby));
83
- }
84
-
85
- let headers = OnceLock::new();
86
- let headers_clone = headers.clone();
87
- let iter = batch_reader.map(move |batch| {
88
- batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
89
- let local_headers = headers_clone
90
- .get_or_init(|| {
91
- let schema = batch.schema();
92
- let fields = schema.fields();
93
- let mut header_string = Vec::with_capacity(fields.len());
94
- for field in fields {
95
- header_string.push(field.name().to_owned());
96
- }
97
- StringCache::intern_many(&header_string)
98
- })
99
- .as_ref()
100
- .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
101
-
102
- let mut map = HashMap::with_capacity_and_hasher(
103
- local_headers.len(),
104
- RandomState::default(),
105
- );
106
-
107
- batch
108
- .columns()
109
- .iter()
110
- .enumerate()
111
- .try_for_each(|(i, column)| {
112
- let header = local_headers[i];
113
- let values = ParquetValueVec::try_from(ArrayWrapper {
114
- array: column,
115
- strict,
116
- })?;
117
- map.insert(header, values.into_inner());
118
- Ok::<_, ParquetGemError>(())
119
- })?;
120
-
121
- Ok(ColumnRecord::Map::<RandomState>(map))
122
- })
123
- });
124
-
125
- for result in iter {
126
- let record = result?;
127
- let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
128
- }
129
- }
130
- ParserResultType::Array => {
131
- let iter = batch_reader.map(|batch| {
132
- batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
133
- let vec = batch
134
- .columns()
135
- .iter()
136
- .map(|column| {
137
- let values = ParquetValueVec::try_from(ArrayWrapper {
138
- array: column,
139
- strict,
140
- })?;
141
- Ok::<_, ParquetGemError>(values.into_inner())
142
- })
143
- .collect::<Result<Vec<_>, _>>()?;
144
- Ok(ColumnRecord::Vec::<RandomState>(vec))
145
- })
146
- });
147
-
148
- for result in iter {
149
- let record = result?;
150
- let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
151
- }
152
- }
153
- }
154
-
155
- Ok(ruby.qnil().into_value_with(&ruby))
156
- }
39
+ columns,
40
+ parser_type: ParserType::Column { batch_size, strict },
41
+ logger,
42
+ },
43
+ )
44
+ }
@@ -1,22 +1,9 @@
1
- use crate::header_cache::StringCache;
2
- use crate::logger::RubyLogger;
3
- use crate::types::TryIntoValue;
4
- use crate::{
5
- create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
6
- RowEnumeratorArgs, RowRecord,
7
- };
8
- use ahash::RandomState;
9
- use either::Either;
10
- use magnus::IntoValue;
1
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
2
+ use crate::utils::*;
3
+ use crate::ParquetGemError;
4
+
11
5
  use magnus::{Error as MagnusError, Ruby, Value};
12
- use parquet::file::reader::{FileReader, SerializedFileReader};
13
- use parquet::record::reader::RowIter as ParquetRowIter;
14
- use parquet::schema::types::{Type as SchemaType, TypePtr};
15
- use std::collections::HashMap;
16
6
  use std::rc::Rc;
17
- use std::sync::OnceLock;
18
-
19
- use super::common::{handle_block_or_enum, open_parquet_source};
20
7
 
21
8
  #[inline]
22
9
  pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
41
28
  logger,
42
29
  } = parse_parquet_rows_args(&ruby, args)?;
43
30
 
44
- // Initialize the logger if provided
45
- let ruby_logger = RubyLogger::new(&ruby, logger)?;
46
-
47
- // Clone values for the closure to avoid move issues
48
- let columns_clone = columns.clone();
49
-
50
- // Handle block or create enumerator
51
- if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
52
- create_row_enumerator(RowEnumeratorArgs {
53
- rb_self,
31
+ // Use the unified parsing implementation
32
+ parse_parquet_unified(
33
+ ruby,
34
+ rb_self,
35
+ UnifiedParserArgs {
54
36
  to_read,
55
37
  result_type,
56
- columns: columns_clone,
57
- strict,
38
+ columns,
39
+ parser_type: ParserType::Row { strict },
58
40
  logger,
59
- })
60
- .map(|yield_enum| yield_enum.into_value_with(&ruby))
61
- })? {
62
- return Ok(enum_value);
63
- }
64
-
65
- let source = open_parquet_source(ruby.clone(), to_read)?;
66
- let reader: Box<dyn FileReader> = match source {
67
- Either::Left(file) => {
68
- Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
69
- }
70
- Either::Right(readable) => {
71
- Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
72
- }
73
- };
74
-
75
- let schema = reader.metadata().file_metadata().schema().clone();
76
- ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
77
-
78
- let mut iter = ParquetRowIter::from_file_into(reader);
79
- if let Some(cols) = columns {
80
- ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
81
- let projection = create_projection_schema(&schema, &cols);
82
- iter = iter.project(Some(projection.to_owned())).map_err(|e| {
83
- MagnusError::new(
84
- ruby.exception_runtime_error(),
85
- format!("Failed to create projection: {}", e),
86
- )
87
- })?;
88
- }
89
-
90
- match result_type {
91
- ParserResultType::Hash => {
92
- let headers = OnceLock::new();
93
- let headers_clone = headers.clone();
94
- let iter = iter.map(move |row| {
95
- row.map(|row| {
96
- let headers = headers_clone.get_or_init(|| {
97
- let column_count = row.get_column_iter().count();
98
-
99
- let mut header_string = Vec::with_capacity(column_count);
100
- for (k, _) in row.get_column_iter() {
101
- header_string.push(k.to_owned());
102
- }
103
-
104
- StringCache::intern_many(&header_string).expect("Failed to intern headers")
105
- });
106
-
107
- let mut map =
108
- HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
109
- for (i, (_, v)) in row.get_column_iter().enumerate() {
110
- map.insert(headers[i], ParquetField(v.clone(), strict));
111
- }
112
- map
113
- })
114
- .map(RowRecord::Map::<RandomState>)
115
- .map_err(ParquetGemError::from)
116
- });
117
-
118
- for result in iter {
119
- let record = result?;
120
- let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
121
- }
122
- }
123
- ParserResultType::Array => {
124
- let iter = iter.map(|row| {
125
- row.map(|row| {
126
- let column_count = row.get_column_iter().count();
127
- let mut vec = Vec::with_capacity(column_count);
128
- for (_, v) in row.get_column_iter() {
129
- vec.push(ParquetField(v.clone(), strict));
130
- }
131
- vec
132
- })
133
- .map(RowRecord::Vec::<RandomState>)
134
- .map_err(ParquetGemError::from)
135
- });
136
-
137
- for result in iter {
138
- let record = result?;
139
- let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
140
- }
141
- }
142
- }
143
-
144
- Ok(ruby.qnil().into_value_with(&ruby))
145
- }
146
-
147
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
148
- if let SchemaType::GroupType { fields, .. } = schema {
149
- let projected_fields: Vec<TypePtr> = fields
150
- .iter()
151
- .filter(|field| columns.contains(&field.name().to_string()))
152
- .cloned()
153
- .collect();
154
-
155
- SchemaType::GroupType {
156
- basic_info: schema.get_basic_info().clone(),
157
- fields: projected_fields,
158
- }
159
- } else {
160
- // Return original schema if not a group type
161
- schema.clone()
162
- }
163
- }
41
+ },
42
+ )
43
+ }