parquet 0.0.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +50 -0
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/lib.rs +5 -3
- data/ext/parquet/src/{reader.rs → reader/mod.rs} +5 -2
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/types/parquet_value.rs +462 -0
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +275 -0
- data/ext/parquet/src/utils.rs +16 -5
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -6
- data/ext/parquet/src/types.rs +0 -763
- /data/ext/parquet/src/{parquet_column_reader.rs → reader/parquet_column_reader.rs} +0 -0
- /data/ext/parquet/src/{parquet_row_reader.rs → reader/parquet_row_reader.rs} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 794d11142b73d13b665ecdb4ffd46df6ab7d97e5f99336e2bc91b79dbb55a514
|
4
|
+
data.tar.gz: eb2843d724e7aad70445a8b992a527e3bee0a79055fdeab7f2ebd2cdfb6247d6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b97550fb18f2ab4db0b5fbb170d12448237665d9372242d4027760f1c697be0d1e7a8bb47d43886f704e0923ddf57544961fe5af29c596b49aac188f714b9e6
|
7
|
+
data.tar.gz: 1ea56a23e39a084d40690d4e7bd108ec2a4cb20b61714bd564e68600d3f3edda3ffd5c3e646d49d4bb85632ad14f2c7d5735e645610e7a863d9e25d6f1d2b90d
|
data/Cargo.lock
CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
|
|
387
387
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
388
388
|
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
389
389
|
|
390
|
+
[[package]]
|
391
|
+
name = "errno"
|
392
|
+
version = "0.3.10"
|
393
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
394
|
+
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
|
395
|
+
dependencies = [
|
396
|
+
"libc",
|
397
|
+
"windows-sys",
|
398
|
+
]
|
399
|
+
|
400
|
+
[[package]]
|
401
|
+
name = "fastrand"
|
402
|
+
version = "2.3.0"
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
404
|
+
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
405
|
+
|
390
406
|
[[package]]
|
391
407
|
name = "flatbuffers"
|
392
408
|
version = "24.12.23"
|
@@ -934,6 +950,12 @@ dependencies = [
|
|
934
950
|
"libc",
|
935
951
|
]
|
936
952
|
|
953
|
+
[[package]]
|
954
|
+
name = "linux-raw-sys"
|
955
|
+
version = "0.4.15"
|
956
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
957
|
+
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
958
|
+
|
937
959
|
[[package]]
|
938
960
|
name = "litemap"
|
939
961
|
version = "0.7.4"
|
@@ -1185,6 +1207,7 @@ dependencies = [
|
|
1185
1207
|
"mimalloc",
|
1186
1208
|
"parquet 54.0.0",
|
1187
1209
|
"rb-sys",
|
1210
|
+
"tempfile",
|
1188
1211
|
"thiserror",
|
1189
1212
|
]
|
1190
1213
|
|
@@ -1377,6 +1400,19 @@ dependencies = [
|
|
1377
1400
|
"semver",
|
1378
1401
|
]
|
1379
1402
|
|
1403
|
+
[[package]]
|
1404
|
+
name = "rustix"
|
1405
|
+
version = "0.38.43"
|
1406
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1407
|
+
checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
|
1408
|
+
dependencies = [
|
1409
|
+
"bitflags 2.6.0",
|
1410
|
+
"errno",
|
1411
|
+
"libc",
|
1412
|
+
"linux-raw-sys",
|
1413
|
+
"windows-sys",
|
1414
|
+
]
|
1415
|
+
|
1380
1416
|
[[package]]
|
1381
1417
|
name = "ryu"
|
1382
1418
|
version = "1.0.18"
|
@@ -1530,6 +1566,20 @@ dependencies = [
|
|
1530
1566
|
"syn",
|
1531
1567
|
]
|
1532
1568
|
|
1569
|
+
[[package]]
|
1570
|
+
name = "tempfile"
|
1571
|
+
version = "3.15.0"
|
1572
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1573
|
+
checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
|
1574
|
+
dependencies = [
|
1575
|
+
"cfg-if",
|
1576
|
+
"fastrand",
|
1577
|
+
"getrandom",
|
1578
|
+
"once_cell",
|
1579
|
+
"rustix",
|
1580
|
+
"windows-sys",
|
1581
|
+
]
|
1582
|
+
|
1533
1583
|
[[package]]
|
1534
1584
|
name = "thiserror"
|
1535
1585
|
version = "2.0.9"
|
data/README.md
CHANGED
@@ -4,8 +4,6 @@
|
|
4
4
|
|
5
5
|
This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
|
6
6
|
|
7
|
-
At the moment, it only supports iterating rows as either a hash or an array.
|
8
|
-
|
9
7
|
## Usage
|
10
8
|
|
11
9
|
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
|
|
83
81
|
- `batch_size`: Number of rows per batch (defaults to implementation-defined value)
|
84
82
|
|
85
83
|
When no block is given, both methods return an Enumerator.
|
84
|
+
|
85
|
+
### Writing Row-wise Data
|
86
|
+
|
87
|
+
The `write_rows` method allows you to write data row by row:
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
require "parquet"
|
91
|
+
|
92
|
+
# Define the schema for your data
|
93
|
+
schema = [
|
94
|
+
{ "id" => "int64" },
|
95
|
+
{ "name" => "string" },
|
96
|
+
{ "score" => "double" }
|
97
|
+
]
|
98
|
+
|
99
|
+
# Create an enumerator that yields arrays of row values
|
100
|
+
rows = [
|
101
|
+
[1, "Alice", 95.5],
|
102
|
+
[2, "Bob", 82.3],
|
103
|
+
[3, "Charlie", 88.7]
|
104
|
+
].each
|
105
|
+
|
106
|
+
# Write to a file
|
107
|
+
Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
|
108
|
+
|
109
|
+
# Write to an IO object
|
110
|
+
File.open("data.parquet", "wb") do |file|
|
111
|
+
Parquet.write_rows(rows, schema: schema, write_to: file)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Optionally specify batch size (default is 1000)
|
115
|
+
Parquet.write_rows(rows,
|
116
|
+
schema: schema,
|
117
|
+
write_to: "data.parquet",
|
118
|
+
batch_size: 500
|
119
|
+
)
|
120
|
+
```
|
121
|
+
|
122
|
+
### Writing Column-wise Data
|
123
|
+
|
124
|
+
The `write_columns` method provides a more efficient way to write data in column-oriented batches:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
require "parquet"
|
128
|
+
|
129
|
+
# Define the schema
|
130
|
+
schema = [
|
131
|
+
{ "id" => "int64" },
|
132
|
+
{ "name" => "string" },
|
133
|
+
{ "score" => "double" }
|
134
|
+
]
|
135
|
+
|
136
|
+
# Create batches of column data
|
137
|
+
batches = [
|
138
|
+
# First batch
|
139
|
+
[
|
140
|
+
[1, 2], # id column
|
141
|
+
["Alice", "Bob"], # name column
|
142
|
+
[95.5, 82.3] # score column
|
143
|
+
],
|
144
|
+
# Second batch
|
145
|
+
[
|
146
|
+
[3], # id column
|
147
|
+
["Charlie"], # name column
|
148
|
+
[88.7] # score column
|
149
|
+
]
|
150
|
+
]
|
151
|
+
|
152
|
+
# Create an enumerator from the batches
|
153
|
+
columns = batches.each
|
154
|
+
|
155
|
+
# Write to a parquet file
|
156
|
+
Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
|
157
|
+
|
158
|
+
# Write to an IO object
|
159
|
+
File.open("data.parquet", "wb") do |file|
|
160
|
+
Parquet.write_columns(columns, schema: schema, write_to: file)
|
161
|
+
end
|
162
|
+
```
|
163
|
+
|
164
|
+
The following data types are supported in the schema:
|
165
|
+
|
166
|
+
- `int8`, `int16`, `int32`, `int64`
|
167
|
+
- `uint8`, `uint16`, `uint32`, `uint64`
|
168
|
+
- `float`, `double`
|
169
|
+
- `string`
|
170
|
+
- `binary`
|
171
|
+
- `boolean`
|
172
|
+
- `date32`
|
173
|
+
- `timestamp_millis`, `timestamp_micros`
|
174
|
+
|
175
|
+
Note: List and Map types are currently not supported.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -17,6 +17,7 @@ magnus = { version = "0.7", features = ["rb-sys"] }
|
|
17
17
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
18
18
|
rb-sys = "^0.9"
|
19
19
|
thiserror = "2.0"
|
20
|
+
tempfile = "^3.15"
|
20
21
|
|
21
22
|
[target.'cfg(target_os = "linux")'.dependencies]
|
22
23
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -6,9 +6,7 @@ mod ruby_integration;
|
|
6
6
|
mod ruby_reader;
|
7
7
|
mod types;
|
8
8
|
mod utils;
|
9
|
-
|
10
|
-
mod parquet_column_reader;
|
11
|
-
mod parquet_row_reader;
|
9
|
+
mod writer;
|
12
10
|
|
13
11
|
use crate::enumerator::*;
|
14
12
|
use crate::reader::*;
|
@@ -16,6 +14,8 @@ use crate::ruby_integration::*;
|
|
16
14
|
use crate::types::*;
|
17
15
|
|
18
16
|
use magnus::{Error, Ruby};
|
17
|
+
use writer::write_columns;
|
18
|
+
use writer::write_rows;
|
19
19
|
|
20
20
|
/// Initializes the Ruby extension and defines methods.
|
21
21
|
#[magnus::init]
|
@@ -23,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
23
23
|
let module = ruby.define_module("Parquet")?;
|
24
24
|
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
25
25
|
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
26
|
+
module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
|
27
|
+
module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
|
26
28
|
Ok(())
|
27
29
|
}
|
@@ -1,11 +1,14 @@
|
|
1
|
+
mod parquet_column_reader;
|
2
|
+
mod parquet_row_reader;
|
3
|
+
|
1
4
|
use std::io;
|
2
5
|
|
3
6
|
use magnus::{Error as MagnusError, Ruby};
|
4
7
|
use thiserror::Error;
|
5
8
|
|
6
9
|
use crate::header_cache::CacheError;
|
7
|
-
pub use
|
8
|
-
pub use
|
10
|
+
pub use parquet_column_reader::parse_parquet_columns;
|
11
|
+
pub use parquet_row_reader::parse_parquet_rows;
|
9
12
|
|
10
13
|
#[derive(Error, Debug)]
|
11
14
|
pub enum ReaderError {
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
2
|
+
pub enum ParserResultType {
|
3
|
+
Hash,
|
4
|
+
Array,
|
5
|
+
}
|
6
|
+
|
7
|
+
impl ParserResultType {
|
8
|
+
pub fn iter() -> impl Iterator<Item = Self> {
|
9
|
+
[Self::Hash, Self::Array].into_iter()
|
10
|
+
}
|
11
|
+
}
|
12
|
+
|
13
|
+
impl TryFrom<&str> for ParserResultType {
|
14
|
+
type Error = String;
|
15
|
+
|
16
|
+
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
17
|
+
match value {
|
18
|
+
"hash" => Ok(ParserResultType::Hash),
|
19
|
+
"array" => Ok(ParserResultType::Array),
|
20
|
+
_ => Err(format!("Invalid parser result type: {}", value)),
|
21
|
+
}
|
22
|
+
}
|
23
|
+
}
|
24
|
+
|
25
|
+
impl TryFrom<String> for ParserResultType {
|
26
|
+
type Error = String;
|
27
|
+
|
28
|
+
fn try_from(value: String) -> Result<Self, Self::Error> {
|
29
|
+
Self::try_from(value.as_str())
|
30
|
+
}
|
31
|
+
}
|
32
|
+
|
33
|
+
impl std::fmt::Display for ParserResultType {
|
34
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
35
|
+
match self {
|
36
|
+
ParserResultType::Hash => write!(f, "hash"),
|
37
|
+
ParserResultType::Array => write!(f, "array"),
|
38
|
+
}
|
39
|
+
}
|
40
|
+
}
|
41
|
+
|
42
|
+
#[derive(Debug, Clone)]
|
43
|
+
pub struct ListField {
|
44
|
+
pub item_type: ParquetSchemaType,
|
45
|
+
}
|
46
|
+
|
47
|
+
#[derive(Debug, Clone)]
|
48
|
+
pub struct MapField {
|
49
|
+
pub key_type: ParquetSchemaType,
|
50
|
+
pub value_type: ParquetSchemaType,
|
51
|
+
}
|
52
|
+
|
53
|
+
#[derive(Debug, Clone)]
|
54
|
+
pub enum ParquetSchemaType {
|
55
|
+
Int8,
|
56
|
+
Int16,
|
57
|
+
Int32,
|
58
|
+
Int64,
|
59
|
+
UInt8,
|
60
|
+
UInt16,
|
61
|
+
UInt32,
|
62
|
+
UInt64,
|
63
|
+
Float,
|
64
|
+
Double,
|
65
|
+
String,
|
66
|
+
Binary,
|
67
|
+
Boolean,
|
68
|
+
Date32,
|
69
|
+
TimestampMillis,
|
70
|
+
TimestampMicros,
|
71
|
+
List(Box<ListField>),
|
72
|
+
Map(Box<MapField>),
|
73
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
// Re-export all public items from submodules
|
2
|
+
mod core_types;
|
3
|
+
mod parquet_value;
|
4
|
+
mod record_types;
|
5
|
+
mod timestamp;
|
6
|
+
mod type_conversion;
|
7
|
+
mod writer_types;
|
8
|
+
|
9
|
+
pub use core_types::*;
|
10
|
+
pub use parquet_value::*;
|
11
|
+
pub use record_types::*;
|
12
|
+
pub use timestamp::*;
|
13
|
+
pub use type_conversion::*;
|
14
|
+
pub use writer_types::*;
|
15
|
+
|
16
|
+
// Common imports used across the module
|
17
|
+
use arrow_array::cast::downcast_array;
|
18
|
+
use arrow_array::{
|
19
|
+
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
|
20
|
+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
|
21
|
+
StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
22
|
+
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
23
|
+
};
|
24
|
+
use arrow_schema::{DataType, TimeUnit};
|
25
|
+
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
|
26
|
+
use parquet::data_type::Decimal;
|
27
|
+
use parquet::record::Field;
|
28
|
+
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
29
|
+
|
30
|
+
use crate::header_cache::StringCacheKey;
|