parquet 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +228 -4
- data/ext/parquet/src/reader/mod.rs +2 -1
- data/ext/parquet/src/reader/parquet_column_reader.rs +15 -127
- data/ext/parquet/src/reader/parquet_row_reader.rs +14 -134
- data/ext/parquet/src/reader/unified/mod.rs +328 -0
- data/ext/parquet/src/types/parquet_value.rs +90 -16
- data/ext/parquet/src/types/record_types.rs +28 -4
- data/ext/parquet/src/types/type_conversion.rs +13 -11
- data/ext/parquet/src/types/writer_types.rs +38 -19
- data/lib/parquet/schema.rb +21 -9
- data/lib/parquet/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e2295ee94fe35758ae8e5137070e2206ec1e104aad6b9a0806aa508ad4799247
+  data.tar.gz: 340f86257082bdba22d6ced530ecd1d201c7b4e6d9116eebac41541ba2aaa257
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f333ae2914cdd00468c390e8b3d876aec4e522a546d43ab29db5d777792105a38d2a40c49db0f0afe1e800bf32e54bb4c479441f8f9876937ba59917b444d15a
+  data.tar.gz: da2832c3514729cc0e99e16f70a10bbfc4e9093dc734de55715305121649ebc371dff93a7bb462b97fde27c79ad65cec12c5fa90a47f70bc64153a7fd2ce1a5c
data/README.md
CHANGED
@@ -8,6 +8,78 @@ This project is a Ruby library wrapping the [parquet-rs](https://github.com/apac
 
 This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
 
+### Metadata
+
+The `metadata` method provides detailed information about a Parquet file's structure and contents:
+
+```ruby
+require "parquet"
+
+# Get metadata from a file path
+metadata = Parquet.metadata("data.parquet")
+
+# Or from an IO object
+File.open("data.parquet", "rb") do |file|
+  metadata = Parquet.metadata(file)
+end
+
+# Example metadata output:
+# {
+#   "num_rows" => 3,
+#   "created_by" => "parquet-rs version 54.2.0",
+#   "key_value_metadata" => [
+#     {
+#       "key" => "ARROW:schema",
+#       "value" => "base64_encoded_schema"
+#     }
+#   ],
+#   "schema" => {
+#     "name" => "arrow_schema",
+#     "fields" => [
+#       {
+#         "name" => "id",
+#         "type" => "primitive",
+#         "physical_type" => "INT64",
+#         "repetition" => "OPTIONAL",
+#         "converted_type" => "NONE"
+#       },
+#       # ... other fields
+#     ]
+#   },
+#   "row_groups" => [
+#     {
+#       "num_columns" => 5,
+#       "num_rows" => 3,
+#       "total_byte_size" => 379,
+#       "columns" => [
+#         {
+#           "column_path" => "id",
+#           "num_values" => 3,
+#           "compression" => "UNCOMPRESSED",
+#           "total_compressed_size" => 91,
+#           "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
+#           "statistics" => {
+#             "min_is_exact" => true,
+#             "max_is_exact" => true
+#           }
+#         },
+#         # ... other columns
+#       ]
+#     }
+#   ]
+# }
+```
+
+The metadata includes:
+- Total number of rows
+- File creation information
+- Key-value metadata (including Arrow schema)
+- Detailed schema information for each column
+- Row group information including:
+  - Number of columns and rows
+  - Total byte size
+  - Column-level details (compression, encodings, statistics)
+
 ### Row-wise Iteration
 
 The `each_row` method provides sequential access to individual rows:
@@ -236,17 +308,169 @@ schema = Parquet::Schema.define do
     field :description, :string
   end
 
-  # Nested lists
+  # Nested lists (list of lists of strings)
   field :nested_lists, :list, item: :list do
-    field :item, :string #
+    field :item, :string # REQUIRED: Inner item field MUST be named 'item' for nested lists
   end
 
   # Map of lists
   field :map_of_lists, :map, key: :string, value: :list do
-    field :item, :int32 #
+    field :item, :int32 # REQUIRED: List items in maps MUST be named 'item'
   end
 end
 
+### Nested Lists
+
+When working with nested lists (a list of lists), there are specific requirements:
+
+1. Using the Schema DSL:
+```ruby
+# A list of lists of strings
+field :nested_lists, :list, item: :list do
+  field :item, :string # For nested lists, inner item MUST be named 'item'
+end
+```
+
+2. Using the hash-based schema format:
+```ruby
+# A list of lists of integers
+{ "nested_numbers" => "list<list<int32>>" }
+```
+
+The data for nested lists is structured as an array of arrays:
+```ruby
+# Data for the nested_lists field
+[["a", "b"], ["c", "d", "e"], []] # Last one is an empty inner list
+```
+
+### Decimal Data Type
+
+Parquet supports decimal numbers with configurable precision and scale, which is essential for financial applications where exact decimal representation is critical. The library seamlessly converts between Ruby's `BigDecimal` and Parquet's decimal type.
+
+#### Decimal Precision and Scale
+
+When working with decimal fields, you need to understand two key parameters:
+
+- **Precision**: The total number of significant digits (both before and after the decimal point)
+- **Scale**: The number of digits after the decimal point
+
+The rules for defining decimals are:
+
+```ruby
+# No precision/scale specified - uses maximum precision (38) with scale 0
+field :amount1, :decimal # Equivalent to INTEGER with 38 digits
+
+# Only precision specified - scale defaults to 0
+field :amount2, :decimal, precision: 10 # 10 digits, no decimal places
+
+# Only scale specified - uses maximum precision (38)
+field :amount3, :decimal, scale: 2 # 38 digits with 2 decimal places
+
+# Both precision and scale specified
+field :amount4, :decimal, precision: 10, scale: 2 # 10 digits with 2 decimal places
+```
+
+#### Financial Data Example
+
+Here's a practical example for a financial application:
+
+```ruby
+require "parquet"
+require "bigdecimal"
+
+# Schema for financial transactions
+schema = Parquet::Schema.define do
+  field :transaction_id, :string, nullable: false
+  field :timestamp, :timestamp_millis, nullable: false
+  field :amount, :decimal, precision: 12, scale: 2 # Supports up to 10^10 with 2 decimal places
+  field :balance, :decimal, precision: 16, scale: 2 # Larger precision for running balances
+  field :currency, :string
+  field :exchange_rate, :decimal, precision: 10, scale: 6 # 6 decimal places for forex rates
+  field :fee, :decimal, precision: 8, scale: 2, nullable: true # Optional fee
+  field :category, :string
+end
+
+# Sample financial data
+transactions = [
+  [
+    "T-12345",
+    Time.now,
+    BigDecimal("1256.99"), # amount (directly using BigDecimal)
+    BigDecimal("10250.25"), # balance
+    "USD",
+    BigDecimal("1.0"), # exchange_rate
+    BigDecimal("2.50"), # fee
+    "Groceries"
+  ],
+  [
+    "T-12346",
+    Time.now - 86400, # yesterday
+    BigDecimal("-89.50"), # negative amount for withdrawal
+    BigDecimal("10160.75"), # updated balance
+    "USD",
+    BigDecimal("1.0"), # exchange_rate
+    nil, # no fee
+    "Transportation"
+  ],
+  [
+    "T-12347",
+    Time.now - 172800, # two days ago
+    BigDecimal("250.00"), # amount
+    BigDecimal("10410.75"), # balance
+    "EUR", # different currency
+    BigDecimal("1.05463"), # exchange_rate
+    BigDecimal("1.75"), # fee
+    "Entertainment"
+  ]
+]
+
+# Write financial data to Parquet file
+Parquet.write_rows(transactions.each, schema: schema, write_to: "financial_data.parquet")
+
+# Read back transactions
+Parquet.each_row("financial_data.parquet") do |transaction|
+  # Access decimal fields as BigDecimal objects
+  puts "Transaction: #{transaction['transaction_id']}"
+  puts "  Amount: #{transaction['currency']} #{transaction['amount']}"
+  puts "  Balance: $#{transaction['balance']}"
+  puts "  Fee: #{transaction['fee'] || 'No fee'}"
+
+  # You can perform precise decimal calculations
+  if transaction['currency'] != 'USD'
+    usd_amount = transaction['amount'] * transaction['exchange_rate']
+    puts "  USD Equivalent: $#{usd_amount.round(2)}"
+  end
+end
+```
+
+#### Decimal Type Storage Considerations
+
+Parquet optimizes storage based on the precision:
+- For precision ≤ 9: Uses 4-byte INT32
+- For precision ≤ 18: Uses 8-byte INT64
+- For precision ≤ 38: Uses 16-byte BYTE_ARRAY
+
+Choose appropriate precision and scale for your data to optimize storage while ensuring adequate range:
+
+```ruby
+# Banking examples
+field :account_balance, :decimal, precision: 16, scale: 2 # Up to 14 digits before decimal point
+field :interest_rate, :decimal, precision: 8, scale: 6 # Rate with 6 decimal places (e.g., 0.015625)
+
+# E-commerce examples
+field :product_price, :decimal, precision: 10, scale: 2 # Product price
+field :shipping_weight, :decimal, precision: 6, scale: 3 # Weight in kg with 3 decimal places
+
+# Analytics examples
+field :conversion_rate, :decimal, precision: 5, scale: 4 # Rate like 0.0123
+field :daily_revenue, :decimal, precision: 14, scale: 2 # Daily revenue with 2 decimal places
+```
+
+### Sample Data with Nested Structures
+
+Here's an example showing how to use the schema defined earlier with sample data:
+
+```ruby
 # Sample data with nested structures
 data = [
   [
@@ -271,7 +495,7 @@ data = [
     "feature1" => { "count" => 5, "description" => "Main feature" },
     "feature2" => { "count" => 3, "description" => "Secondary feature" }
   },
-  [["a", "b"], ["c", "d", "e"]], # nested_lists
+  [["a", "b"], ["c", "d", "e"]], # nested_lists (a list of lists of strings)
   { # map_of_lists
     "group1" => [1, 2, 3],
     "group2" => [4, 5, 6]
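The storage-considerations list added in the README hunk above maps declared precision to a physical type. Behind that mapping is the Parquet convention of storing a decimal as an unscaled integer (value × 10^scale), so the declared precision bounds how wide that integer can get. The following is a dependency-free illustrative sketch of that idea, not code from this gem:

```rust
// Illustrative only: mirrors the precision-to-physical-type table quoted above.
fn physical_type_for_precision(precision: u32) -> &'static str {
    match precision {
        0..=9 => "INT32",   // unscaled value fits in 4 bytes
        10..=18 => "INT64", // unscaled value fits in 8 bytes
        _ => "BYTE_ARRAY",  // wider decimals, up to the maximum precision of 38
    }
}

fn main() {
    // 1256.99 at precision 12, scale 2 is stored as the unscaled integer 125699.
    let unscaled: i64 = 125699;
    let scale: i32 = 2;
    println!("{}", unscaled as f64 / 10f64.powi(scale)); // 1256.99
    println!("{}", physical_type_for_precision(12));     // INT64
}
```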
data/ext/parquet/src/reader/mod.rs
CHANGED
@@ -1,6 +1,7 @@
 mod common;
 mod parquet_column_reader;
 mod parquet_row_reader;
+mod unified;
 use std::{fs::File, rc::Rc};
 
 use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
 
     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
-}
+}
data/ext/parquet/src/reader/parquet_column_reader.rs
CHANGED
@@ -1,21 +1,9 @@
-use crate::
-use crate::
-use crate::
-
-    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
-    ParserResultType,
-};
-use ahash::RandomState;
-use either::Either;
-use magnus::IntoValue;
+use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+use crate::utils::*;
+use crate::ParquetGemError;
+
 use magnus::{Error as MagnusError, Ruby, Value};
-use std::collections::HashMap;
 use std::rc::Rc;
-use std::sync::OnceLock;
-
-use super::common::{
-    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
-};
 
 #[inline]
 pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
         logger,
     } = parse_parquet_columns_args(&ruby, args)?;
 
-    //
-
-
-
-
-
-    // Clone values for the closure to avoid move issues
-    let columns_clone = columns.clone();
-
-    // Handle block or create enumerator
-    if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-        create_column_enumerator(ColumnEnumeratorArgs {
-            rb_self,
+    // Use the unified parsing implementation
+    parse_parquet_unified(
+        ruby,
+        rb_self,
+        UnifiedParserArgs {
            to_read,
            result_type,
-            columns
-            batch_size,
-
-
-
-
-        })? {
-        return Ok(enum_value);
-    }
-
-    let source = open_parquet_source(ruby.clone(), to_read)?;
-
-    // Use the common function to create the batch reader
-
-    let (batch_reader, schema, num_rows) = match source {
-        Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
-        Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
-    };
-
-    match result_type {
-        ParserResultType::Hash => {
-            // For hash return type, we need to return a hash with column names pointing at empty arrays
-            if handle_empty_file(&ruby, &schema, num_rows)? {
-                return Ok(ruby.qnil().into_value_with(&ruby));
-            }
-
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader.map(move |batch| {
-                batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                    let local_headers = headers_clone
-                        .get_or_init(|| {
-                            let schema = batch.schema();
-                            let fields = schema.fields();
-                            let mut header_string = Vec::with_capacity(fields.len());
-                            for field in fields {
-                                header_string.push(field.name().to_owned());
-                            }
-                            StringCache::intern_many(&header_string)
-                        })
-                        .as_ref()
-                        .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    batch
-                        .columns()
-                        .iter()
-                        .enumerate()
-                        .try_for_each(|(i, column)| {
-                            let header = local_headers[i];
-                            let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: column,
-                                strict,
-                            })?;
-                            map.insert(header, values.into_inner());
-                            Ok::<_, ParquetGemError>(())
-                        })?;
-
-                    Ok(ColumnRecord::Map::<RandomState>(map))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = batch_reader.map(|batch| {
-                batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                    let vec = batch
-                        .columns()
-                        .iter()
-                        .map(|column| {
-                            let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: column,
-                                strict,
-                            })?;
-                            Ok::<_, ParquetGemError>(values.into_inner())
-                        })
-                        .collect::<Result<Vec<_>, _>>()?;
-                    Ok(ColumnRecord::Vec::<RandomState>(vec))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(ruby.qnil().into_value_with(&ruby))
-}
+            columns,
+            parser_type: ParserType::Column { batch_size, strict },
+            logger,
+        },
+    )
+}
data/ext/parquet/src/reader/parquet_row_reader.rs
CHANGED
@@ -1,22 +1,9 @@
-use crate::
-use crate::
-use crate::
-
-    create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
-    RowEnumeratorArgs, RowRecord,
-};
-use ahash::RandomState;
-use either::Either;
-use magnus::IntoValue;
+use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+use crate::utils::*;
+use crate::ParquetGemError;
+
 use magnus::{Error as MagnusError, Ruby, Value};
-use parquet::file::reader::{FileReader, SerializedFileReader};
-use parquet::record::reader::RowIter as ParquetRowIter;
-use parquet::schema::types::{Type as SchemaType, TypePtr};
-use std::collections::HashMap;
 use std::rc::Rc;
-use std::sync::OnceLock;
-
-use super::common::{handle_block_or_enum, open_parquet_source};
 
 #[inline]
 pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
         logger,
     } = parse_parquet_rows_args(&ruby, args)?;
 
-    //
-
-
-
-
-
-    // Handle block or create enumerator
-    if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-        create_row_enumerator(RowEnumeratorArgs {
-            rb_self,
+    // Use the unified parsing implementation
+    parse_parquet_unified(
+        ruby,
+        rb_self,
+        UnifiedParserArgs {
            to_read,
            result_type,
-            columns
-            strict,
+            columns,
+            parser_type: ParserType::Row { strict },
            logger,
-        }
-
-
-        return Ok(enum_value);
-    }
-
-    let source = open_parquet_source(ruby.clone(), to_read)?;
-    let reader: Box<dyn FileReader> = match source {
-        Either::Left(file) => {
-            Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
-        }
-        Either::Right(readable) => {
-            Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
-        }
-    };
-
-    let schema = reader.metadata().file_metadata().schema().clone();
-    ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
-
-    let mut iter = ParquetRowIter::from_file_into(reader);
-    if let Some(cols) = columns {
-        ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
-        let projection = create_projection_schema(&schema, &cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter.map(move |row| {
-                row.map(|row| {
-                    let headers = headers_clone.get_or_init(|| {
-                        let column_count = row.get_column_iter().count();
-
-                        let mut header_string = Vec::with_capacity(column_count);
-                        for (k, _) in row.get_column_iter() {
-                            header_string.push(k.to_owned());
-                        }
-
-                        StringCache::intern_many(&header_string).expect("Failed to intern headers")
-                    });
-
-                    let mut map =
-                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                    for (i, (_, v)) in row.get_column_iter().enumerate() {
-                        map.insert(headers[i], ParquetField(v.clone(), strict));
-                    }
-                    map
-                })
-                .map(RowRecord::Map::<RandomState>)
-                .map_err(ParquetGemError::from)
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = iter.map(|row| {
-                row.map(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    for (_, v) in row.get_column_iter() {
-                        vec.push(ParquetField(v.clone(), strict));
-                    }
-                    vec
-                })
-                .map(RowRecord::Vec::<RandomState>)
-                .map_err(ParquetGemError::from)
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(ruby.qnil().into_value_with(&ruby))
-}
-
-fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-    if let SchemaType::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<TypePtr> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
-
-        SchemaType::GroupType {
-            basic_info: schema.get_basic_info().clone(),
-            fields: projected_fields,
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
+        },
+    )
+}