parquet 0.5.5 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2295ee94fe35758ae8e5137070e2206ec1e104aad6b9a0806aa508ad4799247
4
- data.tar.gz: 340f86257082bdba22d6ced530ecd1d201c7b4e6d9116eebac41541ba2aaa257
3
+ metadata.gz: e8a79e74af0419282904a0041c09509520f64ce1e504e133237f4b87697dce14
4
+ data.tar.gz: 63391ffff73907caccc142f37550e85c12826f302f00ac726f826af391f8d8cd
5
5
  SHA512:
6
- metadata.gz: f333ae2914cdd00468c390e8b3d876aec4e522a546d43ab29db5d777792105a38d2a40c49db0f0afe1e800bf32e54bb4c479441f8f9876937ba59917b444d15a
7
- data.tar.gz: da2832c3514729cc0e99e16f70a10bbfc4e9093dc734de55715305121649ebc371dff93a7bb462b97fde27c79ad65cec12c5fa90a47f70bc64153a7fd2ce1a5c
6
+ metadata.gz: cddb7c6711e7e49ea785f6c0ab5ae3c40181756ad0e3fc23f298c291b725b178fdfbe5a8430fd9be10591b09e1b963255cb50637743054fe2173c9798e1e8bcc
7
+ data.tar.gz: 927a112ff1994800b3ed989f5000ed2a43438cebff886a545d0dd22018731b042b9052ead5b14983faf50fffc593cdd7512dd764bfed4de8ffa7781e6f2fda1a
data/Cargo.lock CHANGED
@@ -63,9 +63,8 @@ dependencies = [
63
63
 
64
64
  [[package]]
65
65
  name = "arrow-array"
66
- version = "54.2.0"
67
- source = "registry+https://github.com/rust-lang/crates.io-index"
68
- checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a"
66
+ version = "55.1.0"
67
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
69
68
  dependencies = [
70
69
  "ahash",
71
70
  "arrow-buffer",
@@ -79,9 +78,8 @@ dependencies = [
79
78
 
80
79
  [[package]]
81
80
  name = "arrow-buffer"
82
- version = "54.2.0"
83
- source = "registry+https://github.com/rust-lang/crates.io-index"
84
- checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a"
81
+ version = "55.1.0"
82
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
85
83
  dependencies = [
86
84
  "bytes",
87
85
  "half",
@@ -90,9 +88,8 @@ dependencies = [
90
88
 
91
89
  [[package]]
92
90
  name = "arrow-cast"
93
- version = "54.2.0"
94
- source = "registry+https://github.com/rust-lang/crates.io-index"
95
- checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee"
91
+ version = "55.1.0"
92
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
96
93
  dependencies = [
97
94
  "arrow-array",
98
95
  "arrow-buffer",
@@ -110,9 +107,8 @@ dependencies = [
110
107
 
111
108
  [[package]]
112
109
  name = "arrow-data"
113
- version = "54.2.0"
114
- source = "registry+https://github.com/rust-lang/crates.io-index"
115
- checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83"
110
+ version = "55.1.0"
111
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
116
112
  dependencies = [
117
113
  "arrow-buffer",
118
114
  "arrow-schema",
@@ -122,9 +118,8 @@ dependencies = [
122
118
 
123
119
  [[package]]
124
120
  name = "arrow-ipc"
125
- version = "54.2.0"
126
- source = "registry+https://github.com/rust-lang/crates.io-index"
127
- checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6"
121
+ version = "55.1.0"
122
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
128
123
  dependencies = [
129
124
  "arrow-array",
130
125
  "arrow-buffer",
@@ -135,15 +130,13 @@ dependencies = [
135
130
 
136
131
  [[package]]
137
132
  name = "arrow-schema"
138
- version = "54.2.0"
139
- source = "registry+https://github.com/rust-lang/crates.io-index"
140
- checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735"
133
+ version = "55.1.0"
134
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
141
135
 
142
136
  [[package]]
143
137
  name = "arrow-select"
144
- version = "54.2.0"
145
- source = "registry+https://github.com/rust-lang/crates.io-index"
146
- checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539"
138
+ version = "55.1.0"
139
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
147
140
  dependencies = [
148
141
  "ahash",
149
142
  "arrow-array",
@@ -180,7 +173,7 @@ version = "0.69.5"
180
173
  source = "registry+https://github.com/rust-lang/crates.io-index"
181
174
  checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
182
175
  dependencies = [
183
- "bitflags 2.8.0",
176
+ "bitflags",
184
177
  "cexpr",
185
178
  "clang-sys",
186
179
  "itertools 0.12.1",
@@ -194,12 +187,6 @@ dependencies = [
194
187
  "syn",
195
188
  ]
196
189
 
197
- [[package]]
198
- name = "bitflags"
199
- version = "1.3.2"
200
- source = "registry+https://github.com/rust-lang/crates.io-index"
201
- checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
202
-
203
190
  [[package]]
204
191
  name = "bitflags"
205
192
  version = "2.8.0"
@@ -208,9 +195,9 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
208
195
 
209
196
  [[package]]
210
197
  name = "brotli"
211
- version = "7.0.0"
198
+ version = "8.0.1"
212
199
  source = "registry+https://github.com/rust-lang/crates.io-index"
213
- checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd"
200
+ checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d"
214
201
  dependencies = [
215
202
  "alloc-no-stdlib",
216
203
  "alloc-stdlib",
@@ -219,9 +206,9 @@ dependencies = [
219
206
 
220
207
  [[package]]
221
208
  name = "brotli-decompressor"
222
- version = "4.0.2"
209
+ version = "5.0.0"
223
210
  source = "registry+https://github.com/rust-lang/crates.io-index"
224
- checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37"
211
+ checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
225
212
  dependencies = [
226
213
  "alloc-no-stdlib",
227
214
  "alloc-stdlib",
@@ -359,11 +346,11 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
359
346
 
360
347
  [[package]]
361
348
  name = "flatbuffers"
362
- version = "24.12.23"
349
+ version = "25.2.10"
363
350
  source = "registry+https://github.com/rust-lang/crates.io-index"
364
- checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096"
351
+ checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
365
352
  dependencies = [
366
- "bitflags 1.3.2",
353
+ "bitflags",
367
354
  "rustc_version",
368
355
  ]
369
356
 
@@ -374,6 +361,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
374
361
  checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
375
362
  dependencies = [
376
363
  "crc32fast",
364
+ "libz-rs-sys",
377
365
  "miniz_oxide",
378
366
  ]
379
367
 
@@ -652,6 +640,15 @@ dependencies = [
652
640
  "libc",
653
641
  ]
654
642
 
643
+ [[package]]
644
+ name = "libz-rs-sys"
645
+ version = "0.4.2"
646
+ source = "registry+https://github.com/rust-lang/crates.io-index"
647
+ checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d"
648
+ dependencies = [
649
+ "zlib-rs",
650
+ ]
651
+
655
652
  [[package]]
656
653
  name = "linux-raw-sys"
657
654
  version = "0.4.15"
@@ -670,7 +667,7 @@ version = "0.11.3"
670
667
  source = "registry+https://github.com/rust-lang/crates.io-index"
671
668
  checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
672
669
  dependencies = [
673
- "twox-hash",
670
+ "twox-hash 1.6.3",
674
671
  ]
675
672
 
676
673
  [[package]]
@@ -840,20 +837,20 @@ dependencies = [
840
837
  "magnus",
841
838
  "mimalloc",
842
839
  "num",
843
- "parquet 54.2.0",
840
+ "parquet 55.1.0",
844
841
  "rand",
845
842
  "rb-sys",
846
843
  "rb-sys-env 0.2.2",
847
844
  "simdutf8",
848
845
  "tempfile",
849
846
  "thiserror",
847
+ "uuid",
850
848
  ]
851
849
 
852
850
  [[package]]
853
851
  name = "parquet"
854
- version = "54.2.0"
855
- source = "registry+https://github.com/rust-lang/crates.io-index"
856
- checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb"
852
+ version = "55.1.0"
853
+ source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
857
854
  dependencies = [
858
855
  "ahash",
859
856
  "arrow-array",
@@ -879,9 +876,8 @@ dependencies = [
879
876
  "simdutf8",
880
877
  "snap",
881
878
  "thrift",
882
- "twox-hash",
879
+ "twox-hash 2.1.0",
883
880
  "zstd",
884
- "zstd-sys",
885
881
  ]
886
882
 
887
883
  [[package]]
@@ -1055,7 +1051,7 @@ version = "0.38.44"
1055
1051
  source = "registry+https://github.com/rust-lang/crates.io-index"
1056
1052
  checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
1057
1053
  dependencies = [
1058
- "bitflags 2.8.0",
1054
+ "bitflags",
1059
1055
  "errno",
1060
1056
  "libc",
1061
1057
  "linux-raw-sys",
@@ -1223,12 +1219,24 @@ dependencies = [
1223
1219
  "static_assertions",
1224
1220
  ]
1225
1221
 
1222
+ [[package]]
1223
+ name = "twox-hash"
1224
+ version = "2.1.0"
1225
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1226
+ checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
1227
+
1226
1228
  [[package]]
1227
1229
  name = "unicode-ident"
1228
1230
  version = "1.0.17"
1229
1231
  source = "registry+https://github.com/rust-lang/crates.io-index"
1230
1232
  checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe"
1231
1233
 
1234
+ [[package]]
1235
+ name = "uuid"
1236
+ version = "1.16.0"
1237
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1238
+ checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
1239
+
1232
1240
  [[package]]
1233
1241
  name = "version_check"
1234
1242
  version = "0.9.5"
@@ -1402,7 +1410,7 @@ version = "0.33.0"
1402
1410
  source = "registry+https://github.com/rust-lang/crates.io-index"
1403
1411
  checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
1404
1412
  dependencies = [
1405
- "bitflags 2.8.0",
1413
+ "bitflags",
1406
1414
  ]
1407
1415
 
1408
1416
  [[package]]
@@ -1446,6 +1454,12 @@ dependencies = [
1446
1454
  "syn",
1447
1455
  ]
1448
1456
 
1457
+ [[package]]
1458
+ name = "zlib-rs"
1459
+ version = "0.4.2"
1460
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1461
+ checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05"
1462
+
1449
1463
  [[package]]
1450
1464
  name = "zstd"
1451
1465
  version = "0.13.3"
@@ -11,20 +11,21 @@ rb-sys-env = "^0.2"
11
11
 
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
- arrow-array = "54.0.0"
15
- arrow-schema = "54.0.0"
14
+ arrow-array = { git = "https://github.com/apache/arrow-rs", branch = "main" }
15
+ arrow-schema = { git = "https://github.com/apache/arrow-rs", branch = "main" }
16
16
  bytes = "^1.9"
17
17
  either = "1.9"
18
18
  itertools = "^0.14"
19
19
  jiff = "0.2"
20
20
  magnus = { version = "0.7", features = ["rb-sys"] }
21
- parquet = { version = "^54.0", features = ["json"] }
21
+ parquet = { git = "https://github.com/apache/arrow-rs", branch = "main", features = ["json"] }
22
22
  rand = "0.9"
23
23
  rb-sys = "^0.9"
24
24
  simdutf8 = "0.1.5"
25
25
  tempfile = "^3.15"
26
26
  thiserror = "2.0"
27
27
  num = "0.4.3"
28
+ uuid = "1.16.0"
28
29
 
29
30
  [target.'cfg(target_os = "linux")'.dependencies]
30
31
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -2,8 +2,8 @@ use crate::header_cache::StringCache;
2
2
  use crate::logger::RubyLogger;
3
3
  use crate::types::TryIntoValue;
4
4
  use crate::{
5
- create_column_enumerator, create_row_enumerator, ParquetField, ParquetGemError,
6
- ParserResultType, ColumnEnumeratorArgs, RowEnumeratorArgs, RowRecord, ColumnRecord, ParquetValueVec,
5
+ create_column_enumerator, create_row_enumerator, ColumnEnumeratorArgs, ColumnRecord,
6
+ ParquetField, ParquetGemError, ParquetValueVec, ParserResultType, RowEnumeratorArgs, RowRecord,
7
7
  };
8
8
  use ahash::RandomState;
9
9
  use either::Either;
@@ -13,10 +13,10 @@ use std::collections::HashMap;
13
13
  use std::rc::Rc;
14
14
  use std::sync::OnceLock;
15
15
 
16
- use crate::types::ArrayWrapper;
17
16
  use super::common::{
18
17
  create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
19
18
  };
19
+ use crate::types::ArrayWrapper;
20
20
 
21
21
  /// A unified parser configuration that can be used for both row and column parsing
22
22
  pub enum ParserType {
@@ -53,11 +53,11 @@ pub fn parse_parquet_unified(
53
53
  } = args;
54
54
 
55
55
  // Initialize the logger if provided
56
- let ruby_logger = RubyLogger::new(&ruby, logger.clone())?;
57
-
56
+ let ruby_logger = RubyLogger::new(&ruby, logger)?;
57
+
58
58
  // Clone values for the closure to avoid move issues
59
59
  let columns_clone = columns.clone();
60
-
60
+
61
61
  // Determine if we're handling rows or columns for enumerator creation
62
62
  match &parser_type {
63
63
  ParserType::Row { strict } => {
@@ -75,13 +75,13 @@ pub fn parse_parquet_unified(
75
75
  })? {
76
76
  return Ok(enum_value);
77
77
  }
78
- },
78
+ }
79
79
  ParserType::Column { batch_size, strict } => {
80
80
  // For column-based parsing, log the batch size if present
81
81
  if let Some(ref bs) = batch_size {
82
82
  ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
83
83
  }
84
-
84
+
85
85
  // Handle block or create column enumerator
86
86
  if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
87
87
  create_column_enumerator(ColumnEnumeratorArgs {
@@ -102,19 +102,34 @@ pub fn parse_parquet_unified(
102
102
 
103
103
  // Open the Parquet source
104
104
  let source = open_parquet_source(ruby.clone(), to_read)?;
105
-
105
+
106
106
  // Based on the parser type, handle the data differently
107
107
  match parser_type {
108
108
  ParserType::Row { strict } => {
109
109
  // Handle row-based parsing
110
- process_row_data(ruby.clone(), source, &columns, result_type, strict, &ruby_logger)?;
111
- },
110
+ process_row_data(
111
+ ruby.clone(),
112
+ source,
113
+ &columns,
114
+ result_type,
115
+ strict,
116
+ &ruby_logger,
117
+ )?;
118
+ }
112
119
  ParserType::Column { batch_size, strict } => {
113
120
  // Handle column-based parsing
114
- process_column_data(ruby.clone(), source, &columns, result_type, batch_size, strict, &ruby_logger)?;
121
+ process_column_data(
122
+ ruby.clone(),
123
+ source,
124
+ &columns,
125
+ result_type,
126
+ batch_size,
127
+ strict,
128
+ &ruby_logger,
129
+ )?;
115
130
  }
116
131
  }
117
-
132
+
118
133
  Ok(ruby.qnil().into_value_with(&ruby))
119
134
  }
120
135
 
@@ -129,7 +144,7 @@ fn process_row_data(
129
144
  ) -> Result<(), ParquetGemError> {
130
145
  use parquet::file::reader::{FileReader, SerializedFileReader};
131
146
  use parquet::record::reader::RowIter as ParquetRowIter;
132
-
147
+
133
148
  // Create the row-based reader
134
149
  let reader: Box<dyn FileReader> = match source {
135
150
  Either::Left(file) => {
@@ -174,8 +189,19 @@ fn process_row_data(
174
189
 
175
190
  let mut map =
176
191
  HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
177
- for (i, (_, v)) in row.get_column_iter().enumerate() {
178
- map.insert(headers[i], ParquetField(v.clone(), strict));
192
+ for (i, ((_, v), t)) in
193
+ row.get_column_iter().zip(schema.get_fields()).enumerate()
194
+ {
195
+ let type_info = t.get_basic_info();
196
+ map.insert(
197
+ headers[i],
198
+ ParquetField {
199
+ field: v.clone(),
200
+ converted_type: type_info.converted_type(),
201
+ logical_type: type_info.logical_type().clone(),
202
+ strict,
203
+ },
204
+ );
179
205
  }
180
206
  map
181
207
  })
@@ -193,8 +219,14 @@ fn process_row_data(
193
219
  row.map(|row| {
194
220
  let column_count = row.get_column_iter().count();
195
221
  let mut vec = Vec::with_capacity(column_count);
196
- for (_, v) in row.get_column_iter() {
197
- vec.push(ParquetField(v.clone(), strict));
222
+ for ((_, v), t) in row.get_column_iter().zip(schema.get_fields()) {
223
+ let type_info = t.get_basic_info();
224
+ vec.push(ParquetField {
225
+ field: v.clone(),
226
+ converted_type: type_info.converted_type(),
227
+ logical_type: type_info.logical_type().clone(),
228
+ strict,
229
+ });
198
230
  }
199
231
  vec
200
232
  })
@@ -309,7 +341,10 @@ fn process_column_data(
309
341
  }
310
342
 
311
343
  /// Helper function to create a projection schema
312
- fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[String]) -> parquet::schema::types::Type {
344
+ fn create_projection_schema(
345
+ schema: &parquet::schema::types::Type,
346
+ columns: &[String],
347
+ ) -> parquet::schema::types::Type {
313
348
  if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
314
349
  let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
315
350
  .iter()
@@ -325,4 +360,4 @@ fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[St
325
360
  // Return original schema if not a group type
326
361
  schema.clone()
327
362
  }
328
- }
363
+ }
@@ -61,6 +61,8 @@ pub enum ParquetGemError {
61
61
  Jiff(#[from] jiff::Error),
62
62
  #[error("Failed to cast slice to array: {0}")]
63
63
  InvalidDecimal(#[from] TryFromSliceError),
64
+ #[error("Failed to parse UUID: {0}")]
65
+ UuidError(#[from] uuid::Error),
64
66
  }
65
67
 
66
68
  #[derive(Debug)]
@@ -1,7 +1,10 @@
1
1
  use std::sync::OnceLock;
2
2
 
3
3
  use itertools::Itertools;
4
- use parquet::data_type::AsBytes;
4
+ use parquet::{
5
+ basic::{ConvertedType, LogicalType},
6
+ data_type::AsBytes,
7
+ };
5
8
 
6
9
  use super::*;
7
10
 
@@ -44,7 +47,13 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
44
47
  }
45
48
 
46
49
  #[derive(Debug)]
47
- pub struct ParquetField(pub Field, pub bool);
50
+ pub struct ParquetField {
51
+ pub field: Field,
52
+ #[allow(dead_code)]
53
+ pub converted_type: ConvertedType,
54
+ pub logical_type: Option<LogicalType>,
55
+ pub strict: bool,
56
+ }
48
57
 
49
58
  impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
50
59
  fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
@@ -158,7 +167,7 @@ pub trait TryIntoValue {
158
167
 
159
168
  impl TryIntoValue for ParquetField {
160
169
  fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
161
- match self.0 {
170
+ match self.field {
162
171
  Field::Null => Ok(handle.qnil().as_value()),
163
172
  Field::Bool(b) => Ok(b.into_value_with(handle)),
164
173
  Field::Short(s) => Ok(s.into_value_with(handle)),
@@ -172,7 +181,7 @@ impl TryIntoValue for ParquetField {
172
181
  Field::Float(f) => Ok(f.into_value_with(handle)),
173
182
  Field::Double(d) => Ok(d.into_value_with(handle)),
174
183
  Field::Str(s) => {
175
- if self.1 {
184
+ if self.strict {
176
185
  Ok(simdutf8::basic::from_utf8(s.as_bytes())
177
186
  .map_err(ParquetGemError::Utf8Error)
178
187
  .map(|s| s.into_value_with(handle))?)
@@ -182,7 +191,15 @@ impl TryIntoValue for ParquetField {
182
191
  }
183
192
  }
184
193
  Field::Byte(b) => Ok(b.into_value_with(handle)),
185
- Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
194
+ Field::Bytes(b) => {
195
+ if matches!(self.logical_type, Some(parquet::basic::LogicalType::Uuid)) {
196
+ let bytes = b.as_bytes();
197
+ let uuid = uuid::Uuid::from_slice(bytes)?;
198
+ Ok(uuid.to_string().into_value_with(handle))
199
+ } else {
200
+ Ok(handle.str_from_slice(b.data()).as_value())
201
+ }
202
+ }
186
203
  Field::Date(d) => {
187
204
  let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
188
205
  let formatted = ts.strftime("%Y-%m-%d").to_string();
@@ -206,7 +223,15 @@ impl TryIntoValue for ParquetField {
206
223
  let elements = list.elements();
207
224
  let ary = handle.ary_new_capa(elements.len());
208
225
  elements.iter().try_for_each(|e| {
209
- ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)?;
226
+ ary.push(
227
+ ParquetField {
228
+ field: e.clone(),
229
+ logical_type: e.to_logical_type(),
230
+ converted_type: e.to_converted_type(),
231
+ strict: self.strict,
232
+ }
233
+ .try_into_value_with(handle)?,
234
+ )?;
210
235
  Ok::<_, ParquetGemError>(())
211
236
  })?;
212
237
  Ok(ary.into_value_with(handle))
@@ -220,8 +245,20 @@ impl TryIntoValue for ParquetField {
220
245
 
221
246
  map.entries().iter().try_for_each(|(k, v)| {
222
247
  hash.aset(
223
- ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
224
- ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
248
+ ParquetField {
249
+ field: k.clone(),
250
+ converted_type: k.to_converted_type(),
251
+ logical_type: k.to_logical_type(),
252
+ strict: self.strict,
253
+ }
254
+ .try_into_value_with(handle)?,
255
+ ParquetField {
256
+ field: v.clone(),
257
+ converted_type: v.to_converted_type(),
258
+ logical_type: v.to_logical_type(),
259
+ strict: self.strict,
260
+ }
261
+ .try_into_value_with(handle)?,
225
262
  )?;
226
263
  Ok::<_, ParquetGemError>(())
227
264
  })?;
@@ -238,10 +275,32 @@ impl TryIntoValue for ParquetField {
238
275
  format_decimal_with_i32_scale(unscaled, scale)
239
276
  }
240
277
  Decimal::Bytes { value, scale, .. } => {
241
- // value is a byte array containing the bytes for an i128 value in big endian order
242
- let casted = value.as_bytes()[..16].try_into()?;
243
- let unscaled = i128::from_be_bytes(casted);
244
- format_decimal_with_i32_scale(unscaled, scale)
278
+ match value.len() {
279
+ 4 => {
280
+ // value is a byte array containing the bytes for an i32 value in big endian order
281
+ let casted = value.as_bytes()[..4].try_into()?;
282
+ let unscaled = i32::from_be_bytes(casted);
283
+ format_decimal_with_i32_scale(unscaled, scale)
284
+ }
285
+ 8 => {
286
+ // value is a byte array containing the bytes for an i64 value in big endian order
287
+ let casted = value.as_bytes()[..8].try_into()?;
288
+ let unscaled = i64::from_be_bytes(casted);
289
+ format_decimal_with_i32_scale(unscaled, scale)
290
+ }
291
+ 16 => {
292
+ // value is a byte array containing the bytes for an i128 value in big endian order
293
+ let casted = value.as_bytes()[..16].try_into()?;
294
+ let unscaled = i128::from_be_bytes(casted);
295
+ format_decimal_with_i32_scale(unscaled, scale)
296
+ }
297
+ _ => {
298
+ unimplemented!(
299
+ "Unsupported decimal byte array size: {}",
300
+ value.len()
301
+ );
302
+ }
303
+ }
245
304
  }
246
305
  };
247
306
 
@@ -256,7 +315,13 @@ impl TryIntoValue for ParquetField {
256
315
  row.get_column_iter().try_for_each(|(k, v)| {
257
316
  hash.aset(
258
317
  k.clone().into_value_with(handle),
259
- ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
318
+ ParquetField {
319
+ field: v.clone(),
320
+ converted_type: v.to_converted_type(),
321
+ logical_type: v.to_logical_type(),
322
+ strict: self.strict,
323
+ }
324
+ .try_into_value_with(handle)?,
260
325
  )?;
261
326
  Ok::<_, ParquetGemError>(())
262
327
  })?;
@@ -265,3 +330,111 @@ impl TryIntoValue for ParquetField {
265
330
  }
266
331
  }
267
332
  }
333
+
334
+ trait ToTypeInfo {
335
+ fn to_converted_type(&self) -> ConvertedType;
336
+ fn to_logical_type(&self) -> Option<LogicalType>;
337
+ }
338
+
339
+ impl ToTypeInfo for &parquet::record::Field {
340
+ fn to_converted_type(&self) -> ConvertedType {
341
+ match self {
342
+ Field::Null => ConvertedType::NONE,
343
+ Field::Bool(_) => ConvertedType::INT_8,
344
+ Field::Byte(_) => ConvertedType::INT_8,
345
+ Field::Short(_) => ConvertedType::INT_16,
346
+ Field::Int(_) => ConvertedType::INT_32,
347
+ Field::Long(_) => ConvertedType::INT_64,
348
+ Field::UByte(_) => ConvertedType::UINT_8,
349
+ Field::UShort(_) => ConvertedType::UINT_16,
350
+ Field::UInt(_) => ConvertedType::UINT_32,
351
+ Field::ULong(_) => ConvertedType::UINT_64,
352
+ Field::Float16(_) => ConvertedType::NONE,
353
+ Field::Float(_) => ConvertedType::NONE,
354
+ Field::Double(_) => ConvertedType::NONE,
355
+ Field::Decimal(_) => ConvertedType::DECIMAL,
356
+ Field::Str(_) => ConvertedType::UTF8,
357
+ Field::Bytes(_) => ConvertedType::LIST,
358
+ Field::Date(_) => ConvertedType::DATE,
359
+ Field::TimestampMillis(_) => ConvertedType::TIMESTAMP_MILLIS,
360
+ Field::TimestampMicros(_) => ConvertedType::TIMESTAMP_MICROS,
361
+ Field::Group(_) => ConvertedType::NONE,
362
+ Field::ListInternal(_) => ConvertedType::LIST,
363
+ Field::MapInternal(_) => ConvertedType::MAP,
364
+ }
365
+ }
366
+ fn to_logical_type(&self) -> Option<LogicalType> {
367
+ Some(match self {
368
+ Field::Null => LogicalType::Unknown,
369
+ Field::Bool(_) => LogicalType::Integer {
370
+ bit_width: 1,
371
+ is_signed: false,
372
+ },
373
+ Field::Byte(_) => LogicalType::Integer {
374
+ bit_width: 8,
375
+ is_signed: false,
376
+ },
377
+ Field::Short(_) => LogicalType::Integer {
378
+ bit_width: 16,
379
+ is_signed: true,
380
+ },
381
+ Field::Int(_) => LogicalType::Integer {
382
+ bit_width: 32,
383
+ is_signed: true,
384
+ },
385
+ Field::Long(_) => LogicalType::Integer {
386
+ bit_width: 64,
387
+ is_signed: true,
388
+ },
389
+ Field::UByte(_) => LogicalType::Integer {
390
+ bit_width: 8,
391
+ is_signed: false,
392
+ },
393
+ Field::UShort(_) => LogicalType::Integer {
394
+ bit_width: 16,
395
+ is_signed: false,
396
+ },
397
+ Field::UInt(_) => LogicalType::Integer {
398
+ bit_width: 32,
399
+ is_signed: false,
400
+ },
401
+ Field::ULong(_) => LogicalType::Integer {
402
+ bit_width: 64,
403
+ is_signed: false,
404
+ },
405
+ Field::Float16(_) => LogicalType::Float16,
406
+ Field::Float(_) => LogicalType::Decimal {
407
+ scale: 7,
408
+ precision: 7,
409
+ },
410
+ Field::Double(_) => LogicalType::Decimal {
411
+ scale: 15,
412
+ precision: 15,
413
+ },
414
+ Field::Decimal(decimal) => LogicalType::Decimal {
415
+ scale: decimal.scale(),
416
+ precision: decimal.precision(),
417
+ },
418
+ Field::Str(_) => LogicalType::String,
419
+ Field::Bytes(b) => {
420
+ if b.data().len() == 16 && uuid::Uuid::from_slice(b.as_bytes()).is_ok() {
421
+ LogicalType::Uuid
422
+ } else {
423
+ LogicalType::Unknown
424
+ }
425
+ }
426
+ Field::Date(_) => LogicalType::Date,
427
+ Field::TimestampMillis(_) => LogicalType::Timestamp {
428
+ is_adjusted_to_u_t_c: true,
429
+ unit: parquet::basic::TimeUnit::MILLIS(parquet::format::MilliSeconds {}),
430
+ },
431
+ Field::TimestampMicros(_) => LogicalType::Timestamp {
432
+ is_adjusted_to_u_t_c: true,
433
+ unit: parquet::basic::TimeUnit::MICROS(parquet::format::MicroSeconds {}),
434
+ },
435
+ Field::Group(_) => LogicalType::Unknown,
436
+ Field::ListInternal(_) => LogicalType::List,
437
+ Field::MapInternal(_) => LogicalType::Map,
438
+ })
439
+ }
440
+ }
@@ -121,7 +121,8 @@ pub fn parse_legacy_schema(
121
121
  ruby.exception_type_error(),
122
122
  "Schema must be an array of field definitions or nil",
123
123
  )
124
- })?.is_empty())
124
+ })?
125
+ .is_empty())
125
126
  {
126
127
  // If schema is nil or an empty array, we'll handle this in the caller
127
128
  return Ok(Vec::new());
@@ -206,101 +207,39 @@ pub fn parse_legacy_schema(
206
207
 
207
208
  // Handle decimal type with precision and scale
208
209
  let mut type_result = PST::try_convert(type_str)?;
209
-
210
+
210
211
  // If it's a decimal type and we have precision and scale, override the type
211
212
  if let PST::Primitive(PrimitiveType::Decimal128(_, _)) = type_result {
212
- let precision_value = precision.unwrap_or_else(|| {
213
- let val: u8 = 18;
214
- val.into_value_with(ruby)
215
- });
216
- let scale_value = scale.unwrap_or_else(|| {
217
- let val: i8 = 2;
218
- val.into_value_with(ruby)
219
- });
220
-
221
- let precision_u8 = u8::try_convert(precision_value).map_err(|_| {
222
- MagnusError::new(
223
- ruby.exception_type_error(),
224
- "Invalid precision value for decimal type, expected a positive integer".to_string(),
225
- )
226
- })?;
227
-
228
- // Validate precision is in a valid range
229
- if precision_u8 < 1 {
230
- return Err(MagnusError::new(
231
- ruby.exception_arg_error(),
232
- format!(
233
- "Precision for decimal type must be at least 1, got {}",
234
- precision_u8
235
- ),
236
- ));
237
- }
238
-
239
- if precision_u8 > 38 {
240
- return Err(MagnusError::new(
241
- ruby.exception_arg_error(),
242
- format!(
243
- "Precision for decimal type cannot exceed 38, got {}",
244
- precision_u8
245
- ),
246
- ));
247
- }
248
-
249
- let scale_i8 = i8::try_convert(scale_value).map_err(|_| {
250
- MagnusError::new(
251
- ruby.exception_type_error(),
252
- "Invalid scale value for decimal type, expected an integer".to_string(),
253
- )
254
- })?;
255
-
256
- // Validate scale is in a valid range relative to precision
257
- if scale_i8 < 0 {
258
- return Err(MagnusError::new(
259
- ruby.exception_arg_error(),
260
- format!(
261
- "Scale for decimal type cannot be negative, got {}",
262
- scale_i8
263
- ),
264
- ));
265
- }
266
-
267
- if scale_i8 as u8 > precision_u8 {
268
- return Err(MagnusError::new(
269
- ruby.exception_arg_error(),
270
- format!(
271
- "Scale ({}) cannot be larger than precision ({}) for decimal type",
272
- scale_i8, precision_u8
273
- ),
274
- ));
275
- }
276
-
277
- type_result = PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
213
+ // Do nothing
278
214
  } else if let Some(type_name) = parse_string_or_symbol(ruby, type_str)? {
279
215
  if type_name == "decimal" {
280
216
  let precision_value = precision.unwrap_or_else(|| {
281
- let val: u8 = 18;
217
+ let val: u8 = 38;
282
218
  val.into_value_with(ruby)
283
219
  });
220
+
284
221
  let scale_value = scale.unwrap_or_else(|| {
285
- let val: i8 = 2;
222
+ let val: i8 = 0;
286
223
  val.into_value_with(ruby)
287
224
  });
288
-
225
+
289
226
  let precision_u8 = u8::try_convert(precision_value).map_err(|_| {
290
227
  MagnusError::new(
291
228
  ruby.exception_type_error(),
292
229
  "Invalid precision value for decimal type, expected a positive integer".to_string(),
293
230
  )
294
231
  })?;
295
-
232
+
296
233
  let scale_i8 = i8::try_convert(scale_value).map_err(|_| {
297
234
  MagnusError::new(
298
235
  ruby.exception_type_error(),
299
- "Invalid scale value for decimal type, expected an integer".to_string(),
236
+ "Invalid scale value for decimal type, expected an integer"
237
+ .to_string(),
300
238
  )
301
239
  })?;
302
-
303
- type_result = PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
240
+
241
+ type_result =
242
+ PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
304
243
  }
305
244
  }
306
245
 
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.5.5"
2
+ VERSION = "0.5.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.5
4
+ version: 0.5.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-04-01 00:00:00.000000000 Z
11
+ date: 2025-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys