parquet 0.5.2 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 896f2833b6db8e4466af8fc9d43eb5c695e25a207a6f8050d22052458edded36
-  data.tar.gz: 38de2831bf7013e0194b2e61a91b26a1283fef65a04309dfbe125c570d64e9ed
+  metadata.gz: 936feb49be7a1bbbb36236551480ae0522d6b52443e76b4ebb7502abdb9d2903
+  data.tar.gz: bcc56665ec0cd132e22c262373e7b1294e085be364c93efbd214e434ada7dcb6
 SHA512:
-  metadata.gz: 52d83bc198f789856eac4bff7ff985a82c3f03f75e5de79efc5b388ce5afc63cb507b4cacc90625ee321619ade1ddc16f66f6c437ca0f60d144bc593bbec8cc5
-  data.tar.gz: c6dd98694fd2a1d29ceebec6b58d63220f3992fe4dc63dae1c28ea27f8a353764f87a16a13c95e3e2111bca811c57fde45da9f008a93d8868112b4be608d46ee
+  metadata.gz: 7856d7f36820a8384faf564f166d39e0daca1c9d15457b6f6aae8ff56f4176a8b1302bfbc2cc5edcfedfcb0805cbe71029f5712e716a29dc4942a1e6453a3e5e
+  data.tar.gz: '08d1f4cfe357b22bad4c4fab4ddd4fa93069b13c65559d668fb704e2f7d8884fc8f081270e4dc43a5db60aab7147be36bfe7d26945f93c9ad6e9badbd0ad957e'
data/Cargo.lock CHANGED
@@ -681,7 +681,7 @@ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
 dependencies = [
  "magnus-macros",
  "rb-sys",
- "rb-sys-env",
+ "rb-sys-env 0.1.2",
  "seq-macro",
 ]
 
@@ -839,9 +839,11 @@ dependencies = [
  "jiff",
  "magnus",
  "mimalloc",
+ "num",
  "parquet 54.2.0",
  "rand",
  "rb-sys",
+ "rb-sys-env 0.2.2",
  "simdutf8",
  "tempfile",
  "thiserror",
@@ -997,6 +999,12 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
 
+[[package]]
+name = "rb-sys-env"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6"
+
 [[package]]
 name = "regex"
 version = "1.11.1"
data/README.md CHANGED
@@ -8,6 +8,78 @@ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apac
 
 This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
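
A minimal sketch of the two styles, assuming a local `data.parquet` and the `columns:` option covered later in this README (names here are illustrative):

```ruby
require "parquet"

# Row-wise: yields one record per row
Parquet.each_row("data.parquet") do |row|
  puts row["id"]
end

# Column-wise: yields batches; reading only a subset of columns
# lets the reader skip the rest of the file
Parquet.each_column("data.parquet", columns: ["id"]) do |batch|
  puts batch["id"].inspect
end
```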
 
+### Metadata
+
+The `metadata` method provides detailed information about a Parquet file's structure and contents:
+
+```ruby
+require "parquet"
+
+# Get metadata from a file path
+metadata = Parquet.metadata("data.parquet")
+
+# Or from an IO object
+File.open("data.parquet", "rb") do |file|
+  metadata = Parquet.metadata(file)
+end
+
+# Example metadata output:
+# {
+#   "num_rows" => 3,
+#   "created_by" => "parquet-rs version 54.2.0",
+#   "key_value_metadata" => [
+#     {
+#       "key" => "ARROW:schema",
+#       "value" => "base64_encoded_schema"
+#     }
+#   ],
+#   "schema" => {
+#     "name" => "arrow_schema",
+#     "fields" => [
+#       {
+#         "name" => "id",
+#         "type" => "primitive",
+#         "physical_type" => "INT64",
+#         "repetition" => "OPTIONAL",
+#         "converted_type" => "NONE"
+#       },
+#       # ... other fields
+#     ]
+#   },
+#   "row_groups" => [
+#     {
+#       "num_columns" => 5,
+#       "num_rows" => 3,
+#       "total_byte_size" => 379,
+#       "columns" => [
+#         {
+#           "column_path" => "id",
+#           "num_values" => 3,
+#           "compression" => "UNCOMPRESSED",
+#           "total_compressed_size" => 91,
+#           "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
+#           "statistics" => {
+#             "min_is_exact" => true,
+#             "max_is_exact" => true
+#           }
+#         },
+#         # ... other columns
+#       ]
+#     }
+#   ]
+# }
+```
+
+The metadata includes:
+- Total number of rows
+- File creation information
+- Key-value metadata (including Arrow schema)
+- Detailed schema information for each column
+- Row group information including:
+  - Number of columns and rows
+  - Total byte size
+  - Column-level details (compression, encodings, statistics), put to use in the sketch below
+
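
A minimal sketch of working with this hash, relying only on the keys shown in the example output above:

```ruby
require "parquet"

meta = Parquet.metadata("data.parquet")

# Rough compression ratio per row group
meta["row_groups"].each_with_index do |rg, i|
  compressed = rg["columns"].sum { |c| c["total_compressed_size"] }
  pct = (100.0 * compressed / rg["total_byte_size"]).round(1)
  puts "row group #{i}: #{rg['num_rows']} rows, #{pct}% of uncompressed size"
end
```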
 ### Row-wise Iteration
 
 The `each_row` method provides sequential access to individual rows:
@@ -236,17 +308,169 @@ schema = Parquet::Schema.define do
     field :description, :string
   end
 
-  # Nested lists
+  # Nested lists (list of lists of strings)
   field :nested_lists, :list, item: :list do
-    field :item, :string # For nested lists, inner item must be named 'item'
+    field :item, :string # REQUIRED: Inner item field MUST be named 'item' for nested lists
   end
 
   # Map of lists
   field :map_of_lists, :map, key: :string, value: :list do
-    field :item, :int32 # For list items in maps, item must be named 'item'
+    field :item, :int32 # REQUIRED: List items in maps MUST be named 'item'
   end
 end
 
+### Nested Lists
+
+When working with nested lists (a list of lists), there are specific requirements:
+
+1. Using the Schema DSL:
+```ruby
+# A list of lists of strings
+field :nested_lists, :list, item: :list do
+  field :item, :string # For nested lists, inner item MUST be named 'item'
+end
+```
+
+2. Using the hash-based schema format:
+```ruby
+# A list of lists of integers
+{ "nested_numbers" => "list<list<int32>>" }
+```
+
+The data for nested lists is structured as an array of arrays:
+```ruby
+# Data for the nested_lists field
+[["a", "b"], ["c", "d", "e"], []] # Last one is an empty inner list
+```
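
Putting the pieces together, a small end-to-end sketch (file name illustrative):

```ruby
require "parquet"

schema = Parquet::Schema.define do
  field :nested_lists, :list, item: :list do
    field :item, :string # inner item must be named 'item'
  end
end

rows = [
  [[["a", "b"], ["c", "d", "e"], []]] # one row with one nested-list value
]

Parquet.write_rows(rows.each, schema: schema, write_to: "nested.parquet")

Parquet.each_row("nested.parquet") do |row|
  p row["nested_lists"] # => [["a", "b"], ["c", "d", "e"], []]
end
```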
+
+### Decimal Data Type
+
+Parquet supports decimal numbers with configurable precision and scale, which is essential for financial applications where exact decimal representation is critical. The library seamlessly converts between Ruby's `BigDecimal` and Parquet's decimal type.
+
+#### Decimal Precision and Scale
+
+When working with decimal fields, you need to understand two key parameters:
+
+- **Precision**: The total number of significant digits (both before and after the decimal point)
+- **Scale**: The number of digits after the decimal point
+
+The rules for defining decimals are:
+
+```ruby
+# No precision/scale specified - uses maximum precision (38) with scale 0
+field :amount1, :decimal # Equivalent to a 38-digit integer
+
+# Only precision specified - scale defaults to 0
+field :amount2, :decimal, precision: 10 # 10 digits, no decimal places
+
+# Only scale specified - uses maximum precision (38)
+field :amount3, :decimal, scale: 2 # 38 digits with 2 decimal places
+
+# Both precision and scale specified
+field :amount4, :decimal, precision: 10, scale: 2 # 10 digits with 2 decimal places
+```
+
+#### Financial Data Example
+
+Here's a practical example for a financial application:
+
+```ruby
+require "parquet"
+require "bigdecimal"
+
+# Schema for financial transactions
+schema = Parquet::Schema.define do
+  field :transaction_id, :string, nullable: false
+  field :timestamp, :timestamp_millis, nullable: false
+  field :amount, :decimal, precision: 12, scale: 2 # Up to 10 digits before the decimal point, 2 after
+  field :balance, :decimal, precision: 16, scale: 2 # Larger precision for running balances
+  field :currency, :string
+  field :exchange_rate, :decimal, precision: 10, scale: 6 # 6 decimal places for forex rates
+  field :fee, :decimal, precision: 8, scale: 2, nullable: true # Optional fee
+  field :category, :string
+end
+
+# Sample financial data
+transactions = [
+  [
+    "T-12345",
+    Time.now,
+    BigDecimal("1256.99"),  # amount (directly using BigDecimal)
+    BigDecimal("10250.25"), # balance
+    "USD",
+    BigDecimal("1.0"),      # exchange_rate
+    BigDecimal("2.50"),     # fee
+    "Groceries"
+  ],
+  [
+    "T-12346",
+    Time.now - 86400,       # yesterday
+    BigDecimal("-89.50"),   # negative amount for withdrawal
+    BigDecimal("10160.75"), # updated balance
+    "USD",
+    BigDecimal("1.0"),      # exchange_rate
+    nil,                    # no fee
+    "Transportation"
+  ],
+  [
+    "T-12347",
+    Time.now - 172800,      # two days ago
+    BigDecimal("250.00"),   # amount
+    BigDecimal("10410.75"), # balance
+    "EUR",                  # different currency
+    BigDecimal("1.05463"),  # exchange_rate
+    BigDecimal("1.75"),     # fee
+    "Entertainment"
+  ]
+]
+
+# Write financial data to a Parquet file
+Parquet.write_rows(transactions.each, schema: schema, write_to: "financial_data.parquet")
+
+# Read back transactions
+Parquet.each_row("financial_data.parquet") do |transaction|
+  # Access decimal fields as BigDecimal objects
+  puts "Transaction: #{transaction['transaction_id']}"
+  puts "  Amount: #{transaction['currency']} #{transaction['amount']}"
+  puts "  Balance: $#{transaction['balance']}"
+  puts "  Fee: #{transaction['fee'] || 'No fee'}"
+
+  # You can perform precise decimal calculations
+  if transaction['currency'] != 'USD'
+    usd_amount = transaction['amount'] * transaction['exchange_rate']
+    puts "  USD Equivalent: $#{usd_amount.round(2)}"
+  end
+end
+```
+
+#### Decimal Type Storage Considerations
+
+Parquet optimizes storage based on the precision:
+- For precision ≤ 9: Uses 4-byte INT32
+- For precision ≤ 18: Uses 8-byte INT64
+- For precision ≤ 38: Uses 16-byte FIXED_LEN_BYTE_ARRAY
+
+Choose an appropriate precision and scale for your data to optimize storage while ensuring adequate range (see the sketch after these examples):
+
+```ruby
+# Banking examples
+field :account_balance, :decimal, precision: 16, scale: 2 # Up to 14 digits before the decimal point
+field :interest_rate, :decimal, precision: 8, scale: 6    # Rate with 6 decimal places (e.g., 0.015625)
+
+# E-commerce examples
+field :product_price, :decimal, precision: 10, scale: 2   # Product price
+field :shipping_weight, :decimal, precision: 6, scale: 3  # Weight in kg with 3 decimal places
+
+# Analytics examples
+field :conversion_rate, :decimal, precision: 5, scale: 4  # Rate like 0.0123
+field :daily_revenue, :decimal, precision: 14, scale: 2   # Daily revenue with 2 decimal places
+```
+
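
As a sanity check, the physical type actually chosen for each precision can be read back with the `metadata` method described earlier (a sketch; file name illustrative):

```ruby
require "parquet"
require "bigdecimal"

schema = Parquet::Schema.define do
  field :small, :decimal, precision: 9, scale: 2   # expect INT32
  field :medium, :decimal, precision: 18, scale: 2 # expect INT64
  field :large, :decimal, precision: 38, scale: 2  # expect FIXED_LEN_BYTE_ARRAY
end

rows = [[BigDecimal("1.23"), BigDecimal("4.56"), BigDecimal("7.89")]]
Parquet.write_rows(rows.each, schema: schema, write_to: "decimals.parquet")

Parquet.metadata("decimals.parquet")["schema"]["fields"].each do |f|
  puts "#{f['name']}: #{f['physical_type']} (precision #{f['precision']}, scale #{f['scale']})"
end
```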
+### Sample Data with Nested Structures
+
+Here's an example showing how to use the schema defined earlier with sample data:
+
+```ruby
 # Sample data with nested structures
 data = [
   [
@@ -271,7 +495,7 @@ data = [
       "feature1" => { "count" => 5, "description" => "Main feature" },
       "feature2" => { "count" => 3, "description" => "Secondary feature" }
     },
-    [["a", "b"], ["c", "d", "e"]], # nested_lists
+    [["a", "b"], ["c", "d", "e"]], # nested_lists (a list of lists of strings)
     { # map_of_lists
       "group1" => [1, 2, 3],
       "group2" => [4, 5, 6]
@@ -6,6 +6,9 @@ edition = "2021"
 [lib]
 crate-type = ["cdylib"]
 
+[build-dependencies]
+rb-sys-env = "^0.2"
+
 [dependencies]
 ahash = "0.8"
 arrow-array = "54.0.0"
@@ -21,6 +24,7 @@ rb-sys = "^0.9"
 simdutf8 = "0.1.5"
 tempfile = "^3.15"
 thiserror = "2.0"
+num = "0.4.3"
 
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -0,0 +1,5 @@
+pub fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let _rb_env = rb_sys_env::activate()?;
+
+    Ok(())
+}
@@ -20,6 +20,7 @@ use writer::write_rows;
 #[magnus::init]
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("Parquet")?;
+    module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
     module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
@@ -5,6 +5,7 @@ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchR
 use parquet::arrow::ProjectionMask;
 use std::collections::HashMap;
 use std::fs::File;
+use std::rc::Rc;
 use std::sync::Arc;
 
 use magnus::value::ReprValue;
@@ -21,7 +22,7 @@ use crate::ColumnRecord;
 /// returning either a File or a ThreadSafeRubyReader that can be used with
 /// parquet readers.
 pub fn open_parquet_source(
-    ruby: Arc<Ruby>,
+    ruby: Rc<Ruby>,
     to_read: Value,
 ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {
@@ -58,8 +59,8 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
     columns: &Option<Vec<String>>,
     batch_size: Option<usize>,
 ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-    let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
-        .map_err(|e| ParquetGemError::Parquet(e))?;
+    let mut builder =
+        ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;
 
     let schema = builder.schema().clone();
     let num_rows = builder.metadata().file_metadata().num_rows();
@@ -78,7 +79,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
         builder = builder.with_batch_size(batch_size);
     }
 
-    let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
+    let reader = builder.build().map_err(ParquetGemError::Parquet)?;
     Ok((reader, schema, num_rows))
 }
 
@@ -98,12 +99,12 @@ pub fn handle_empty_file(
         .map(|field| field.name().to_string())
         .collect();
     let interned_headers =
-        StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
+        StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
     for field in interned_headers.iter() {
         map.insert(*field, vec![]);
     }
     let record = ColumnRecord::Map(map);
-    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+    let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
     return Ok(true);
 }
 Ok(false)
@@ -1,6 +1,210 @@
 mod common;
 mod parquet_column_reader;
 mod parquet_row_reader;
+use std::{fs::File, rc::Rc};
 
+use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
 pub use parquet_column_reader::parse_parquet_columns;
 pub use parquet_row_reader::parse_parquet_rows;
+
+use crate::{
+    ruby_reader::{RubyReader, ThreadSafeRubyReader},
+    types::{ParquetGemError, TryIntoValue},
+};
+
+struct RubyParquetMetaData(ParquetMetaData);
+
+impl TryIntoValue for RubyParquetMetaData {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
+        let metadata = self.0;
+        let file_metadata = metadata.file_metadata();
+        let row_groups = metadata.row_groups();
+
+        // Construct a hash with the metadata
+        let hash = handle.hash_new();
+        hash.aset("num_rows", file_metadata.num_rows())?;
+        hash.aset("created_by", file_metadata.created_by())?;
+        // Convert key_value_metadata to a Ruby array if it exists
+        if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
+            let kv_array = handle.ary_new();
+            for kv in key_value_metadata {
+                let kv_hash = handle.hash_new();
+                kv_hash.aset("key", kv.key.clone())?;
+                kv_hash.aset("value", kv.value.clone())?;
+                kv_array.push(kv_hash)?;
+            }
+            hash.aset("key_value_metadata", kv_array)?;
+        } else {
+            hash.aset("key_value_metadata", None::<Value>)?;
+        }
+
+        // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
+        let schema_hash = handle.hash_new();
+        let schema = file_metadata.schema();
+        schema_hash.aset("name", schema.name())?;
+        // Add schema fields information
+        let fields_array = handle.ary_new();
+        for field in schema.get_fields() {
+            let field_hash = handle.hash_new();
+            field_hash.aset("name", field.name())?;
+
+            // Handle different field types
+            match field.as_ref() {
+                parquet::schema::types::Type::PrimitiveType {
+                    physical_type,
+                    type_length,
+                    scale,
+                    precision,
+                    ..
+                } => {
+                    field_hash.aset("type", "primitive")?;
+                    field_hash.aset("physical_type", format!("{:?}", physical_type))?;
+                    field_hash.aset("type_length", *type_length)?;
+                    field_hash.aset("scale", *scale)?;
+                    field_hash.aset("precision", *precision)?;
+                }
+                parquet::schema::types::Type::GroupType { .. } => {
+                    field_hash.aset("type", "group")?;
+                }
+            }
+
+            // Add basic info
+            let basic_info = field.get_basic_info();
+            field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
+            field_hash.aset(
+                "converted_type",
+                format!("{:?}", basic_info.converted_type()),
+            )?;
+            if let Some(logical_type) = basic_info.logical_type() {
+                field_hash.aset("logical_type", format!("{:?}", logical_type))?;
+            }
+
+            fields_array.push(field_hash)?;
+        }
+        schema_hash.aset("fields", fields_array)?;
+
+        hash.aset("schema", schema_hash)?;
+
+        // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
+        let row_groups_array = handle.ary_new();
+        for row_group in row_groups.iter() {
+            let rg_hash = handle.hash_new();
+            rg_hash.aset("num_columns", row_group.num_columns())?;
+            rg_hash.aset("num_rows", row_group.num_rows())?;
+            rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
+            rg_hash.aset("file_offset", row_group.file_offset())?;
+            rg_hash.aset("ordinal", row_group.ordinal())?;
+            rg_hash.aset("compressed_size", row_group.compressed_size())?;
+
+            // Add column chunks metadata
+            let columns_array = handle.ary_new();
+            for col_idx in 0..row_group.num_columns() {
+                let column = row_group.column(col_idx);
+                let col_hash = handle.hash_new();
+
+                col_hash.aset("column_path", column.column_path().string())?;
+                col_hash.aset("file_path", column.file_path())?;
+                col_hash.aset("file_offset", column.file_offset())?;
+                col_hash.aset("num_values", column.num_values())?;
+                col_hash.aset("compression", format!("{:?}", column.compression()))?;
+                col_hash.aset("total_compressed_size", column.compressed_size())?;
+                col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
+                col_hash.aset("data_page_offset", column.data_page_offset())?;
+
+                if let Some(offset) = column.dictionary_page_offset() {
+                    col_hash.aset("dictionary_page_offset", offset)?;
+                }
+
+                if let Some(offset) = column.bloom_filter_offset() {
+                    col_hash.aset("bloom_filter_offset", offset)?;
+                }
+
+                if let Some(length) = column.bloom_filter_length() {
+                    col_hash.aset("bloom_filter_length", length)?;
+                }
+
+                if let Some(offset) = column.offset_index_offset() {
+                    col_hash.aset("offset_index_offset", offset)?;
+                }
+
+                if let Some(length) = column.offset_index_length() {
+                    col_hash.aset("offset_index_length", length)?;
+                }
+
+                if let Some(offset) = column.column_index_offset() {
+                    col_hash.aset("column_index_offset", offset)?;
+                }
+
+                if let Some(length) = column.column_index_length() {
+                    col_hash.aset("column_index_length", length)?;
+                }
+
+                // Add encodings
+                let encodings_array = handle.ary_new();
+                for encoding in column.encodings() {
+                    encodings_array.push(format!("{:?}", encoding))?;
+                }
+                col_hash.aset("encodings", encodings_array)?;
+
+                // Add statistics if available
+                if let Some(stats) = column.statistics() {
+                    let stats_hash = handle.hash_new();
+                    stats_hash.aset("min_is_exact", stats.min_is_exact())?;
+                    stats_hash.aset("max_is_exact", stats.max_is_exact())?;
+
+                    col_hash.aset("statistics", stats_hash)?;
+                }
+
+                // Add page encoding stats if available
+                if let Some(page_encoding_stats) = column.page_encoding_stats() {
+                    let page_stats_array = handle.ary_new();
+                    for stat in page_encoding_stats {
+                        let stat_hash = handle.hash_new();
+                        stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
+                        stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
+                        stat_hash.aset("count", stat.count)?;
+                        page_stats_array.push(stat_hash)?;
+                    }
+                    col_hash.aset("page_encoding_stats", page_stats_array)?;
+                }
+
+                columns_array.push(col_hash)?;
+            }
+            rg_hash.aset("columns", columns_array)?;
+
+            row_groups_array.push(rg_hash)?;
+        }
+        hash.aset("row_groups", row_groups_array)?;
+
+        Ok(handle.into_value(hash))
+    }
+}
+
+pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    if args.len() != 1 {
+        return Err(MagnusError::new(
+            magnus::exception::arg_error(),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
+        ));
+    }
+
+    let ruby = Rc::new(ruby);
+    let arg = args[0];
+
+    let mut reader = ParquetMetaDataReader::new();
+    if arg.is_kind_of(ruby.class_string()) {
+        let path = arg.to_r_string()?.to_string()?;
+        let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    } else {
+        let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    }
+
+    let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
+
+    Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
+}
@@ -10,26 +10,25 @@ use either::Either;
 use magnus::IntoValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 use std::collections::HashMap;
-use std::sync::{Arc, OnceLock};
+use std::rc::Rc;
+use std::sync::OnceLock;
 
 use super::common::{
     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 
 #[inline]
-pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    Ok(
-        parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
-            let z: MagnusError = e.into();
-            z
-        })?,
-    )
+    parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })
 }
 
 #[inline]
-fn parse_parquet_columns_impl<'a>(
-    ruby: Arc<Ruby>,
+fn parse_parquet_columns_impl(
+    ruby: Rc<Ruby>,
     rb_self: Value,
     args: &[Value],
 ) -> Result<Value, ParquetGemError> {
@@ -76,13 +75,13 @@ fn parse_parquet_columns_impl<'a>(
         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
     };
 
-    // Handle empty file case
-    if handle_empty_file(&ruby, &schema, num_rows)? {
-        return Ok(ruby.qnil().into_value_with(&ruby));
-    }
-
     match result_type {
         ParserResultType::Hash => {
+            // For hash return type, we need to return a hash with column names pointing at empty arrays
+            if handle_empty_file(&ruby, &schema, num_rows)? {
+                return Ok(ruby.qnil().into_value_with(&ruby));
+            }
+
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
            let iter = batch_reader.map(move |batch| {
@@ -112,8 +111,8 @@ fn parse_parquet_columns_impl<'a>(
                 .try_for_each(|(i, column)| {
                     let header = local_headers[i];
                     let values = ParquetValueVec::try_from(ArrayWrapper {
-                        array: &*column,
-                        strict: strict,
+                        array: column,
+                        strict,
                     })?;
                     map.insert(header, values.into_inner());
                     Ok::<_, ParquetGemError>(())
@@ -133,11 +132,11 @@ fn parse_parquet_columns_impl<'a>(
             batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                 let vec = batch
                     .columns()
-                    .into_iter()
+                    .iter()
                     .map(|column| {
                         let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: &*column,
-                            strict: strict,
+                            array: column,
+                            strict,
                         })?;
                         Ok::<_, ParquetGemError>(values.into_inner())
                     })