parquet 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: e1ae8e2c64920df8527a16d7348fc37c5ae2cf5c783b648bed93e31cab25bd72
-   data.tar.gz: 2d7b45349d33679f96559683e31d7c9dd5718fb78611aad057bba92d7324c2d3
+   metadata.gz: e2295ee94fe35758ae8e5137070e2206ec1e104aad6b9a0806aa508ad4799247
+   data.tar.gz: 340f86257082bdba22d6ced530ecd1d201c7b4e6d9116eebac41541ba2aaa257
  SHA512:
-   metadata.gz: 1f56d8e538bdb095e43472940a8c3a57b6b54d74ab87d9c1519878d759962e6d844f9c992927dc22d22ebefee4bd64a858b2ed89ccc3c694d183bcb9fd154497
-   data.tar.gz: 5f5c8914d81ef297bebb021ba40e70725208e61c2bd1565f7d134341ac3c31489b501766266f7390ffde82a44e5821321b55f827467ac95c760cd08588788e9d
+   metadata.gz: f333ae2914cdd00468c390e8b3d876aec4e522a546d43ab29db5d777792105a38d2a40c49db0f0afe1e800bf32e54bb4c479441f8f9876937ba59917b444d15a
+   data.tar.gz: da2832c3514729cc0e99e16f70a10bbfc4e9093dc734de55715305121649ebc371dff93a7bb462b97fde27c79ad65cec12c5fa90a47f70bc64153a7fd2ce1a5c
data/README.md CHANGED
@@ -8,6 +8,78 @@ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apac

  This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.

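+ For orientation, here is a minimal sketch of the two APIs (the file name and the `id` column are illustrative, not part of the shipped examples):
+
+ ```ruby
+ require "parquet"
+
+ # Row-wise: yields one record per row
+ Parquet.each_row("data.parquet") { |row| puts row["id"] }
+
+ # Column-wise: yields batches of columns; efficient when only some columns are needed
+ Parquet.each_column("data.parquet", columns: ["id"]) do |batch|
+   puts batch["id"].inspect
+ end
+ ```
+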
+ ### Metadata
+
+ The `metadata` method provides detailed information about a Parquet file's structure and contents:
+
+ ```ruby
+ require "parquet"
+
+ # Get metadata from a file path
+ metadata = Parquet.metadata("data.parquet")
+
+ # Or from an IO object
+ File.open("data.parquet", "rb") do |file|
+   metadata = Parquet.metadata(file)
+ end
+
+ # Example metadata output:
+ # {
+ #   "num_rows" => 3,
+ #   "created_by" => "parquet-rs version 54.2.0",
+ #   "key_value_metadata" => [
+ #     {
+ #       "key" => "ARROW:schema",
+ #       "value" => "base64_encoded_schema"
+ #     }
+ #   ],
+ #   "schema" => {
+ #     "name" => "arrow_schema",
+ #     "fields" => [
+ #       {
+ #         "name" => "id",
+ #         "type" => "primitive",
+ #         "physical_type" => "INT64",
+ #         "repetition" => "OPTIONAL",
+ #         "converted_type" => "NONE"
+ #       },
+ #       # ... other fields
+ #     ]
+ #   },
+ #   "row_groups" => [
+ #     {
+ #       "num_columns" => 5,
+ #       "num_rows" => 3,
+ #       "total_byte_size" => 379,
+ #       "columns" => [
+ #         {
+ #           "column_path" => "id",
+ #           "num_values" => 3,
+ #           "compression" => "UNCOMPRESSED",
+ #           "total_compressed_size" => 91,
+ #           "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
+ #           "statistics" => {
+ #             "min_is_exact" => true,
+ #             "max_is_exact" => true
+ #           }
+ #         },
+ #         # ... other columns
+ #       ]
+ #     }
+ #   ]
+ # }
+ ```
+
+ The metadata includes:
+ - Total number of rows
+ - File creation information
+ - Key-value metadata (including Arrow schema)
+ - Detailed schema information for each column
+ - Row group information including:
+   - Number of columns and rows
+   - Total byte size
+   - Column-level details (compression, encodings, statistics)
+
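+ As a small usage sketch (assuming only the hash layout shown above), the row-group entries can be walked like any nested Ruby hash:
+
+ ```ruby
+ require "parquet"
+
+ metadata = Parquet.metadata("data.parquet")
+
+ # Sum the compressed size of every column chunk across all row groups
+ total_compressed = metadata["row_groups"].sum do |rg|
+   rg["columns"].sum { |col| col["total_compressed_size"] }
+ end
+
+ puts "#{metadata['num_rows']} rows, #{total_compressed} bytes compressed"
+ ```
+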
  ### Row-wise Iteration
  The `each_row` method provides sequential access to individual rows:
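+ For instance, a minimal sketch (illustrative file name) showing the block form and the blockless Enumerator form, both of which are supported:
+
+ ```ruby
+ require "parquet"
+
+ # Block form: yields one Hash per row by default
+ Parquet.each_row("data.parquet") { |row| puts row.inspect }
+
+ # Without a block, an Enumerator is returned
+ rows = Parquet.each_row("data.parquet")
+ rows.take(2).each { |row| puts row.inspect }
+ ```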
@@ -236,17 +308,169 @@ schema = Parquet::Schema.define do
      field :description, :string
    end

-   # Nested lists
+   # Nested lists (list of lists of strings)
    field :nested_lists, :list, item: :list do
-     field :item, :string # For nested lists, inner item must be named 'item'
+     field :item, :string # REQUIRED: Inner item field MUST be named 'item' for nested lists
    end

    # Map of lists
    field :map_of_lists, :map, key: :string, value: :list do
-     field :item, :int32 # For list items in maps, item must be named 'item'
+     field :item, :int32 # REQUIRED: List items in maps MUST be named 'item'
    end
  end

+ ### Nested Lists
+
+ When working with nested lists (a list of lists), there are specific requirements:
+
+ 1. Using the Schema DSL:
+ ```ruby
+ # A list of lists of strings
+ field :nested_lists, :list, item: :list do
+   field :item, :string # For nested lists, the inner item MUST be named 'item'
+ end
+ ```
+
+ 2. Using the hash-based schema format:
+ ```ruby
+ # A list of lists of integers
+ { "nested_numbers" => "list<list<int32>>" }
+ ```
+
+ The data for a nested list is structured as an array of arrays:
+ ```ruby
+ # Data for the nested_lists field
+ [["a", "b"], ["c", "d", "e"], []] # The last element is an empty inner list
+ ```
+
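+ As a rough end-to-end sketch (file name and values illustrative; assumes `write_rows` accepts the hash-based schema entries shown above):
+
+ ```ruby
+ require "parquet"
+
+ # One row whose single column holds the list of lists shown above
+ rows = [[[["a", "b"], ["c", "d", "e"], []]]]
+
+ Parquet.write_rows(rows.each,
+                    schema: [{ "nested_lists" => "list<list<string>>" }],
+                    write_to: "nested.parquet")
+
+ Parquet.each_row("nested.parquet") { |row| p row["nested_lists"] }
+ # => [["a", "b"], ["c", "d", "e"], []]
+ ```
+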
+ ### Decimal Data Type
+
+ Parquet supports decimal numbers with configurable precision and scale, which is essential for financial applications where exact decimal representation is critical. The library seamlessly converts between Ruby's `BigDecimal` and Parquet's decimal type.
+
+ #### Decimal Precision and Scale
+
+ When working with decimal fields, you need to understand two key parameters:
+
+ - **Precision**: The total number of significant digits (both before and after the decimal point)
+ - **Scale**: The number of digits after the decimal point
+
+ The rules for defining decimals are:
+
+ ```ruby
+ # No precision/scale specified - uses maximum precision (38) with scale 0
+ field :amount1, :decimal # 38 significant digits, no decimal places (integer values)
+
+ # Only precision specified - scale defaults to 0
+ field :amount2, :decimal, precision: 10 # 10 digits, no decimal places
+
+ # Only scale specified - uses maximum precision (38)
+ field :amount3, :decimal, scale: 2 # 38 digits with 2 decimal places
+
+ # Both precision and scale specified
+ field :amount4, :decimal, precision: 10, scale: 2 # 10 digits with 2 decimal places
+ ```
+
+ #### Financial Data Example
+
+ Here's a practical example for a financial application:
+
+ ```ruby
+ require "parquet"
+ require "bigdecimal"
+
+ # Schema for financial transactions
+ schema = Parquet::Schema.define do
+   field :transaction_id, :string, nullable: false
+   field :timestamp, :timestamp_millis, nullable: false
+   field :amount, :decimal, precision: 12, scale: 2 # Up to 10 digits before the decimal point, 2 after
+   field :balance, :decimal, precision: 16, scale: 2 # Larger precision for running balances
+   field :currency, :string
+   field :exchange_rate, :decimal, precision: 10, scale: 6 # 6 decimal places for forex rates
+   field :fee, :decimal, precision: 8, scale: 2, nullable: true # Optional fee
+   field :category, :string
+ end
+
+ # Sample financial data
+ transactions = [
+   [
+     "T-12345",
+     Time.now,
+     BigDecimal("1256.99"), # amount (directly using BigDecimal)
+     BigDecimal("10250.25"), # balance
+     "USD",
+     BigDecimal("1.0"), # exchange_rate
+     BigDecimal("2.50"), # fee
+     "Groceries"
+   ],
+   [
+     "T-12346",
+     Time.now - 86400, # yesterday
+     BigDecimal("-89.50"), # negative amount for withdrawal
+     BigDecimal("10160.75"), # updated balance
+     "USD",
+     BigDecimal("1.0"), # exchange_rate
+     nil, # no fee
+     "Transportation"
+   ],
+   [
+     "T-12347",
+     Time.now - 172800, # two days ago
+     BigDecimal("250.00"), # amount
+     BigDecimal("10410.75"), # balance
+     "EUR", # different currency
+     BigDecimal("1.05463"), # exchange_rate
+     BigDecimal("1.75"), # fee
+     "Entertainment"
+   ]
+ ]
+
+ # Write the financial data to a Parquet file
+ Parquet.write_rows(transactions.each, schema: schema, write_to: "financial_data.parquet")
+
+ # Read the transactions back
+ Parquet.each_row("financial_data.parquet") do |transaction|
+   # Access decimal fields as BigDecimal objects
+   puts "Transaction: #{transaction['transaction_id']}"
+   puts "  Amount: #{transaction['currency']} #{transaction['amount']}"
+   puts "  Balance: $#{transaction['balance']}"
+   puts "  Fee: #{transaction['fee'] || 'No fee'}"
+
+   # You can perform precise decimal calculations
+   if transaction['currency'] != 'USD'
+     usd_amount = transaction['amount'] * transaction['exchange_rate']
+     puts "  USD Equivalent: $#{usd_amount.round(2)}"
+   end
+ end
+ ```
+
+ #### Decimal Type Storage Considerations
+
+ Parquet optimizes storage based on the precision:
+ - For precision ≤ 9: uses a 4-byte INT32
+ - For precision ≤ 18: uses an 8-byte INT64
+ - For precision ≤ 38: uses a 16-byte FIXED_LEN_BYTE_ARRAY
+
+ Choose an appropriate precision and scale for your data to optimize storage while ensuring adequate range:
+
+ ```ruby
+ # Banking examples
+ field :account_balance, :decimal, precision: 16, scale: 2 # Up to 14 digits before decimal point
+ field :interest_rate, :decimal, precision: 8, scale: 6 # Rate with 6 decimal places (e.g., 0.015625)
+
+ # E-commerce examples
+ field :product_price, :decimal, precision: 10, scale: 2 # Product price
+ field :shipping_weight, :decimal, precision: 6, scale: 3 # Weight in kg with 3 decimal places
+
+ # Analytics examples
+ field :conversion_rate, :decimal, precision: 5, scale: 4 # Rate like 0.0123
+ field :daily_revenue, :decimal, precision: 14, scale: 2 # Daily revenue with 2 decimal places
+ ```
+
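+ Because `Parquet.metadata` exposes each column's physical type, the mapping can be checked directly. A minimal sketch (file and field names illustrative; assumes the precision thresholds listed above):
+
+ ```ruby
+ require "parquet"
+ require "bigdecimal"
+
+ schema = Parquet::Schema.define do
+   field :small, :decimal, precision: 9, scale: 2  # expected to fit in INT32
+   field :large, :decimal, precision: 20, scale: 2 # expected to need the 16-byte form
+ end
+
+ Parquet.write_rows([[BigDecimal("1.23"), BigDecimal("4.56")]].each,
+                    schema: schema, write_to: "decimals.parquet")
+
+ Parquet.metadata("decimals.parquet")["schema"]["fields"].each do |f|
+   puts "#{f['name']}: #{f['physical_type']}"
+ end
+ ```
+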
+ ### Sample Data with Nested Structures
+
+ Here's an example showing how to use the schema defined earlier with sample data:
+
+ ```ruby
  # Sample data with nested structures
  data = [
    [
@@ -271,7 +495,7 @@ data = [
      "feature1" => { "count" => 5, "description" => "Main feature" },
      "feature2" => { "count" => 3, "description" => "Secondary feature" }
    },
-   [["a", "b"], ["c", "d", "e"]], # nested_lists
+   [["a", "b"], ["c", "d", "e"]], # nested_lists (a list of lists of strings)
    { # map_of_lists
      "group1" => [1, 2, 3],
      "group2" => [4, 5, 6]
ext/parquet/src/reader/mod.rs CHANGED
@@ -1,6 +1,7 @@
  mod common;
  mod parquet_column_reader;
  mod parquet_row_reader;
+ mod unified;
  use std::{fs::File, rc::Rc};

  use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
      let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;

      Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
- }
+ }
ext/parquet/src/reader/parquet_column_reader.rs CHANGED
@@ -1,21 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
- use crate::{
-     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
-     ParserResultType,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{
-     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
- };

  #[inline]
  pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
          logger,
      } = parse_parquet_columns_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-     if let Some(ref bs) = batch_size {
-         ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
-     }
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_column_enumerator(ColumnEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             batch_size,
-             strict,
-             logger: logger.as_ref().map(|_| to_read),
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-
-     // Use the common function to create the batch reader
-
-     let (batch_reader, schema, num_rows) = match source {
-         Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
-         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
-     };
-
-     match result_type {
-         ParserResultType::Hash => {
-             // For hash return type, we need to return a hash with column names pointing at empty arrays
-             if handle_empty_file(&ruby, &schema, num_rows)? {
-                 return Ok(ruby.qnil().into_value_with(&ruby));
-             }
-
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = batch_reader.map(move |batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let local_headers = headers_clone
-                         .get_or_init(|| {
-                             let schema = batch.schema();
-                             let fields = schema.fields();
-                             let mut header_string = Vec::with_capacity(fields.len());
-                             for field in fields {
-                                 header_string.push(field.name().to_owned());
-                             }
-                             StringCache::intern_many(&header_string)
-                         })
-                         .as_ref()
-                         .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                     let mut map = HashMap::with_capacity_and_hasher(
-                         local_headers.len(),
-                         RandomState::default(),
-                     );
-
-                     batch
-                         .columns()
-                         .iter()
-                         .enumerate()
-                         .try_for_each(|(i, column)| {
-                             let header = local_headers[i];
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             map.insert(header, values.into_inner());
-                             Ok::<_, ParquetGemError>(())
-                         })?;
-
-                     Ok(ColumnRecord::Map::<RandomState>(map))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = batch_reader.map(|batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let vec = batch
-                         .columns()
-                         .iter()
-                         .map(|column| {
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             Ok::<_, ParquetGemError>(values.into_inner())
-                         })
-                         .collect::<Result<Vec<_>, _>>()?;
-                     Ok(ColumnRecord::Vec::<RandomState>(vec))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
+             columns,
+             parser_type: ParserType::Column { batch_size, strict },
+             logger,
+         },
+     )
+ }
ext/parquet/src/reader/parquet_row_reader.rs CHANGED
@@ -1,22 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::TryIntoValue;
- use crate::{
-     create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
-     RowEnumeratorArgs, RowRecord,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use parquet::file::reader::{FileReader, SerializedFileReader};
- use parquet::record::reader::RowIter as ParquetRowIter;
- use parquet::schema::types::{Type as SchemaType, TypePtr};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{handle_block_or_enum, open_parquet_source};

  #[inline]
  pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
          logger,
      } = parse_parquet_rows_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_row_enumerator(RowEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             strict,
+             columns,
+             parser_type: ParserType::Row { strict },
              logger,
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-     let reader: Box<dyn FileReader> = match source {
-         Either::Left(file) => {
-             Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
-         }
-         Either::Right(readable) => {
-             Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
-         }
-     };
-
-     let schema = reader.metadata().file_metadata().schema().clone();
-     ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
-
-     let mut iter = ParquetRowIter::from_file_into(reader);
-     if let Some(cols) = columns {
-         ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
-         let projection = create_projection_schema(&schema, &cols);
-         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-             MagnusError::new(
-                 ruby.exception_runtime_error(),
-                 format!("Failed to create projection: {}", e),
-             )
-         })?;
-     }
-
-     match result_type {
-         ParserResultType::Hash => {
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = iter.map(move |row| {
-                 row.map(|row| {
-                     let headers = headers_clone.get_or_init(|| {
-                         let column_count = row.get_column_iter().count();
-
-                         let mut header_string = Vec::with_capacity(column_count);
-                         for (k, _) in row.get_column_iter() {
-                             header_string.push(k.to_owned());
-                         }
-
-                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
-                     });
-
-                     let mut map =
-                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                     for (i, (_, v)) in row.get_column_iter().enumerate() {
-                         map.insert(headers[i], ParquetField(v.clone(), strict));
-                     }
-                     map
-                 })
-                 .map(RowRecord::Map::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = iter.map(|row| {
-                 row.map(|row| {
-                     let column_count = row.get_column_iter().count();
-                     let mut vec = Vec::with_capacity(column_count);
-                     for (_, v) in row.get_column_iter() {
-                         vec.push(ParquetField(v.clone(), strict));
-                     }
-                     vec
-                 })
-                 .map(RowRecord::Vec::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
-
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-     if let SchemaType::GroupType { fields, .. } = schema {
-         let projected_fields: Vec<TypePtr> = fields
-             .iter()
-             .filter(|field| columns.contains(&field.name().to_string()))
-             .cloned()
-             .collect();
-
-         SchemaType::GroupType {
-             basic_info: schema.get_basic_info().clone(),
-             fields: projected_fields,
-         }
-     } else {
-         // Return original schema if not a group type
-         schema.clone()
-     }
- }
+         },
+     )
+ }