parquet 0.0.5 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +50 -0
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/lib.rs +5 -3
- data/ext/parquet/src/{reader.rs → reader/mod.rs} +5 -2
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/types/parquet_value.rs +458 -0
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -6
- data/ext/parquet/src/types.rs +0 -763
- /data/ext/parquet/src/{parquet_column_reader.rs → reader/parquet_column_reader.rs} +0 -0
- /data/ext/parquet/src/{parquet_row_reader.rs → reader/parquet_row_reader.rs} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
|
4
|
+
data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
|
7
|
+
data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
|
data/Cargo.lock
CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
|
|
387
387
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
388
388
|
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
389
389
|
|
390
|
+
[[package]]
|
391
|
+
name = "errno"
|
392
|
+
version = "0.3.10"
|
393
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
394
|
+
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
|
395
|
+
dependencies = [
|
396
|
+
"libc",
|
397
|
+
"windows-sys",
|
398
|
+
]
|
399
|
+
|
400
|
+
[[package]]
|
401
|
+
name = "fastrand"
|
402
|
+
version = "2.3.0"
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
404
|
+
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
405
|
+
|
390
406
|
[[package]]
|
391
407
|
name = "flatbuffers"
|
392
408
|
version = "24.12.23"
|
@@ -934,6 +950,12 @@ dependencies = [
|
|
934
950
|
"libc",
|
935
951
|
]
|
936
952
|
|
953
|
+
[[package]]
|
954
|
+
name = "linux-raw-sys"
|
955
|
+
version = "0.4.15"
|
956
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
957
|
+
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
958
|
+
|
937
959
|
[[package]]
|
938
960
|
name = "litemap"
|
939
961
|
version = "0.7.4"
|
@@ -1185,6 +1207,7 @@ dependencies = [
|
|
1185
1207
|
"mimalloc",
|
1186
1208
|
"parquet 54.0.0",
|
1187
1209
|
"rb-sys",
|
1210
|
+
"tempfile",
|
1188
1211
|
"thiserror",
|
1189
1212
|
]
|
1190
1213
|
|
@@ -1377,6 +1400,19 @@ dependencies = [
|
|
1377
1400
|
"semver",
|
1378
1401
|
]
|
1379
1402
|
|
1403
|
+
[[package]]
|
1404
|
+
name = "rustix"
|
1405
|
+
version = "0.38.43"
|
1406
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1407
|
+
checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
|
1408
|
+
dependencies = [
|
1409
|
+
"bitflags 2.6.0",
|
1410
|
+
"errno",
|
1411
|
+
"libc",
|
1412
|
+
"linux-raw-sys",
|
1413
|
+
"windows-sys",
|
1414
|
+
]
|
1415
|
+
|
1380
1416
|
[[package]]
|
1381
1417
|
name = "ryu"
|
1382
1418
|
version = "1.0.18"
|
@@ -1530,6 +1566,20 @@ dependencies = [
|
|
1530
1566
|
"syn",
|
1531
1567
|
]
|
1532
1568
|
|
1569
|
+
[[package]]
|
1570
|
+
name = "tempfile"
|
1571
|
+
version = "3.15.0"
|
1572
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1573
|
+
checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
|
1574
|
+
dependencies = [
|
1575
|
+
"cfg-if",
|
1576
|
+
"fastrand",
|
1577
|
+
"getrandom",
|
1578
|
+
"once_cell",
|
1579
|
+
"rustix",
|
1580
|
+
"windows-sys",
|
1581
|
+
]
|
1582
|
+
|
1533
1583
|
[[package]]
|
1534
1584
|
name = "thiserror"
|
1535
1585
|
version = "2.0.9"
|
data/README.md
CHANGED
@@ -4,8 +4,6 @@
|
|
4
4
|
|
5
5
|
This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
|
6
6
|
|
7
|
-
At the moment, it only supports iterating rows as either a hash or an array.
|
8
|
-
|
9
7
|
## Usage
|
10
8
|
|
11
9
|
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
|
|
83
81
|
- `batch_size`: Number of rows per batch (defaults to implementation-defined value)
|
84
82
|
|
85
83
|
When no block is given, both methods return an Enumerator.
|
84
|
+
|
85
|
+
### Writing Row-wise Data
|
86
|
+
|
87
|
+
The `write_rows` method allows you to write data row by row:
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
require "parquet"
|
91
|
+
|
92
|
+
# Define the schema for your data
|
93
|
+
schema = [
|
94
|
+
{ "id" => "int64" },
|
95
|
+
{ "name" => "string" },
|
96
|
+
{ "score" => "double" }
|
97
|
+
]
|
98
|
+
|
99
|
+
# Create an enumerator that yields arrays of row values
|
100
|
+
rows = [
|
101
|
+
[1, "Alice", 95.5],
|
102
|
+
[2, "Bob", 82.3],
|
103
|
+
[3, "Charlie", 88.7]
|
104
|
+
].each
|
105
|
+
|
106
|
+
# Write to a file
|
107
|
+
Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
|
108
|
+
|
109
|
+
# Write to an IO object
|
110
|
+
File.open("data.parquet", "wb") do |file|
|
111
|
+
Parquet.write_rows(rows, schema: schema, write_to: file)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Optionally specify batch size (default is 1000)
|
115
|
+
Parquet.write_rows(rows,
|
116
|
+
schema: schema,
|
117
|
+
write_to: "data.parquet",
|
118
|
+
batch_size: 500
|
119
|
+
)
|
120
|
+
```
|
121
|
+
|
122
|
+
### Writing Column-wise Data
|
123
|
+
|
124
|
+
The `write_columns` method provides a more efficient way to write data in column-oriented batches:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
require "parquet"
|
128
|
+
|
129
|
+
# Define the schema
|
130
|
+
schema = [
|
131
|
+
{ "id" => "int64" },
|
132
|
+
{ "name" => "string" },
|
133
|
+
{ "score" => "double" }
|
134
|
+
]
|
135
|
+
|
136
|
+
# Create batches of column data
|
137
|
+
batches = [
|
138
|
+
# First batch
|
139
|
+
[
|
140
|
+
[1, 2], # id column
|
141
|
+
["Alice", "Bob"], # name column
|
142
|
+
[95.5, 82.3] # score column
|
143
|
+
],
|
144
|
+
# Second batch
|
145
|
+
[
|
146
|
+
[3], # id column
|
147
|
+
["Charlie"], # name column
|
148
|
+
[88.7] # score column
|
149
|
+
]
|
150
|
+
]
|
151
|
+
|
152
|
+
# Create an enumerator from the batches
|
153
|
+
columns = batches.each
|
154
|
+
|
155
|
+
# Write to a parquet file
|
156
|
+
Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
|
157
|
+
|
158
|
+
# Write to an IO object
|
159
|
+
File.open("data.parquet", "wb") do |file|
|
160
|
+
Parquet.write_columns(columns, schema: schema, write_to: file)
|
161
|
+
end
|
162
|
+
```
|
163
|
+
|
164
|
+
The following data types are supported in the schema:
|
165
|
+
|
166
|
+
- `int8`, `int16`, `int32`, `int64`
|
167
|
+
- `uint8`, `uint16`, `uint32`, `uint64`
|
168
|
+
- `float`, `double`
|
169
|
+
- `string`
|
170
|
+
- `binary`
|
171
|
+
- `boolean`
|
172
|
+
- `date32`
|
173
|
+
- `timestamp_millis`, `timestamp_micros`
|
174
|
+
|
175
|
+
Note: List and Map types are currently not supported.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -17,6 +17,7 @@ magnus = { version = "0.7", features = ["rb-sys"] }
|
|
17
17
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
18
18
|
rb-sys = "^0.9"
|
19
19
|
thiserror = "2.0"
|
20
|
+
tempfile = "^3.15"
|
20
21
|
|
21
22
|
[target.'cfg(target_os = "linux")'.dependencies]
|
22
23
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -6,9 +6,7 @@ mod ruby_integration;
|
|
6
6
|
mod ruby_reader;
|
7
7
|
mod types;
|
8
8
|
mod utils;
|
9
|
-
|
10
|
-
mod parquet_column_reader;
|
11
|
-
mod parquet_row_reader;
|
9
|
+
mod writer;
|
12
10
|
|
13
11
|
use crate::enumerator::*;
|
14
12
|
use crate::reader::*;
|
@@ -16,6 +14,8 @@ use crate::ruby_integration::*;
|
|
16
14
|
use crate::types::*;
|
17
15
|
|
18
16
|
use magnus::{Error, Ruby};
|
17
|
+
use writer::write_columns;
|
18
|
+
use writer::write_rows;
|
19
19
|
|
20
20
|
/// Initializes the Ruby extension and defines methods.
|
21
21
|
#[magnus::init]
|
@@ -23,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
23
23
|
let module = ruby.define_module("Parquet")?;
|
24
24
|
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
25
25
|
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
26
|
+
module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
|
27
|
+
module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
|
26
28
|
Ok(())
|
27
29
|
}
|
@@ -1,11 +1,14 @@
|
|
1
|
+
mod parquet_column_reader;
|
2
|
+
mod parquet_row_reader;
|
3
|
+
|
1
4
|
use std::io;
|
2
5
|
|
3
6
|
use magnus::{Error as MagnusError, Ruby};
|
4
7
|
use thiserror::Error;
|
5
8
|
|
6
9
|
use crate::header_cache::CacheError;
|
7
|
-
pub use
|
8
|
-
pub use
|
10
|
+
pub use parquet_column_reader::parse_parquet_columns;
|
11
|
+
pub use parquet_row_reader::parse_parquet_rows;
|
9
12
|
|
10
13
|
#[derive(Error, Debug)]
|
11
14
|
pub enum ReaderError {
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
2
|
+
pub enum ParserResultType {
|
3
|
+
Hash,
|
4
|
+
Array,
|
5
|
+
}
|
6
|
+
|
7
|
+
impl ParserResultType {
|
8
|
+
pub fn iter() -> impl Iterator<Item = Self> {
|
9
|
+
[Self::Hash, Self::Array].into_iter()
|
10
|
+
}
|
11
|
+
}
|
12
|
+
|
13
|
+
impl TryFrom<&str> for ParserResultType {
|
14
|
+
type Error = String;
|
15
|
+
|
16
|
+
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
17
|
+
match value {
|
18
|
+
"hash" => Ok(ParserResultType::Hash),
|
19
|
+
"array" => Ok(ParserResultType::Array),
|
20
|
+
_ => Err(format!("Invalid parser result type: {}", value)),
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
|
25
|
+
impl TryFrom<String> for ParserResultType {
|
26
|
+
type Error = String;
|
27
|
+
|
28
|
+
fn try_from(value: String) -> Result<Self, Self::Error> {
|
29
|
+
Self::try_from(value.as_str())
|
30
|
+
}
|
31
|
+
}
|
32
|
+
|
33
|
+
impl std::fmt::Display for ParserResultType {
|
34
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
35
|
+
match self {
|
36
|
+
ParserResultType::Hash => write!(f, "hash"),
|
37
|
+
ParserResultType::Array => write!(f, "array"),
|
38
|
+
}
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
#[derive(Debug, Clone)]
|
43
|
+
pub struct ListField {
|
44
|
+
pub item_type: ParquetSchemaType,
|
45
|
+
}
|
46
|
+
|
47
|
+
#[derive(Debug, Clone)]
|
48
|
+
pub struct MapField {
|
49
|
+
pub key_type: ParquetSchemaType,
|
50
|
+
pub value_type: ParquetSchemaType,
|
51
|
+
}
|
52
|
+
|
53
|
+
#[derive(Debug, Clone)]
|
54
|
+
pub enum ParquetSchemaType {
|
55
|
+
Int8,
|
56
|
+
Int16,
|
57
|
+
Int32,
|
58
|
+
Int64,
|
59
|
+
UInt8,
|
60
|
+
UInt16,
|
61
|
+
UInt32,
|
62
|
+
UInt64,
|
63
|
+
Float,
|
64
|
+
Double,
|
65
|
+
String,
|
66
|
+
Binary,
|
67
|
+
Boolean,
|
68
|
+
Date32,
|
69
|
+
TimestampMillis,
|
70
|
+
TimestampMicros,
|
71
|
+
List(Box<ListField>),
|
72
|
+
Map(Box<MapField>),
|
73
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
// Re-export all public items from submodules
|
2
|
+
mod core_types;
|
3
|
+
mod parquet_value;
|
4
|
+
mod record_types;
|
5
|
+
mod timestamp;
|
6
|
+
mod type_conversion;
|
7
|
+
mod writer_types;
|
8
|
+
|
9
|
+
pub use core_types::*;
|
10
|
+
pub use parquet_value::*;
|
11
|
+
pub use record_types::*;
|
12
|
+
pub use timestamp::*;
|
13
|
+
pub use type_conversion::*;
|
14
|
+
pub use writer_types::*;
|
15
|
+
|
16
|
+
// Common imports used across the module
|
17
|
+
use arrow_array::cast::downcast_array;
|
18
|
+
use arrow_array::{
|
19
|
+
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
|
20
|
+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
|
21
|
+
StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
22
|
+
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
23
|
+
};
|
24
|
+
use arrow_schema::{DataType, TimeUnit};
|
25
|
+
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
|
26
|
+
use parquet::data_type::Decimal;
|
27
|
+
use parquet::record::Field;
|
28
|
+
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
29
|
+
|
30
|
+
use crate::header_cache::StringCacheKey;
|