parquet 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +57 -0
- data/Gemfile +1 -1
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +5 -0
- data/ext/parquet/src/enumerator.rs +32 -6
- data/ext/parquet/src/header_cache.rs +85 -28
- data/ext/parquet/src/lib.rs +2 -1
- data/ext/parquet/src/reader.rs +218 -13
- data/ext/parquet/src/types.rs +647 -15
- data/ext/parquet/src/utils.rs +57 -3
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +22 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
|
4
|
+
data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
|
7
|
+
data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
|
data/Cargo.lock
CHANGED
@@ -743,6 +743,15 @@ dependencies = [
|
|
743
743
|
"either",
|
744
744
|
]
|
745
745
|
|
746
|
+
[[package]]
|
747
|
+
name = "itertools"
|
748
|
+
version = "0.14.0"
|
749
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
751
|
+
dependencies = [
|
752
|
+
"either",
|
753
|
+
]
|
754
|
+
|
746
755
|
[[package]]
|
747
756
|
name = "itoa"
|
748
757
|
version = "1.0.14"
|
@@ -769,6 +778,35 @@ dependencies = [
|
|
769
778
|
"libc",
|
770
779
|
]
|
771
780
|
|
781
|
+
[[package]]
|
782
|
+
name = "jiff"
|
783
|
+
version = "0.1.19"
|
784
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
785
|
+
checksum = "943611a469f78ab9afdac9022e473a80fca16a9deca6c5be3eb566d872231e76"
|
786
|
+
dependencies = [
|
787
|
+
"jiff-tzdb-platform",
|
788
|
+
"log",
|
789
|
+
"portable-atomic",
|
790
|
+
"portable-atomic-util",
|
791
|
+
"serde",
|
792
|
+
"windows-sys",
|
793
|
+
]
|
794
|
+
|
795
|
+
[[package]]
|
796
|
+
name = "jiff-tzdb"
|
797
|
+
version = "0.1.1"
|
798
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
799
|
+
checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653"
|
800
|
+
|
801
|
+
[[package]]
|
802
|
+
name = "jiff-tzdb-platform"
|
803
|
+
version = "0.1.1"
|
804
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
805
|
+
checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329"
|
806
|
+
dependencies = [
|
807
|
+
"jiff-tzdb",
|
808
|
+
]
|
809
|
+
|
772
810
|
[[package]]
|
773
811
|
name = "jobserver"
|
774
812
|
version = "0.1.32"
|
@@ -1159,8 +1197,12 @@ name = "parquet"
|
|
1159
1197
|
version = "0.1.0"
|
1160
1198
|
dependencies = [
|
1161
1199
|
"ahash",
|
1200
|
+
"arrow-array",
|
1201
|
+
"arrow-schema",
|
1162
1202
|
"bytes",
|
1203
|
+
"itertools 0.14.0",
|
1163
1204
|
"jemallocator",
|
1205
|
+
"jiff",
|
1164
1206
|
"kanal",
|
1165
1207
|
"magnus 0.7.1",
|
1166
1208
|
"mimalloc",
|
@@ -1238,6 +1280,21 @@ version = "0.3.31"
|
|
1238
1280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1239
1281
|
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
1240
1282
|
|
1283
|
+
[[package]]
|
1284
|
+
name = "portable-atomic"
|
1285
|
+
version = "1.10.0"
|
1286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1287
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
1288
|
+
|
1289
|
+
[[package]]
|
1290
|
+
name = "portable-atomic-util"
|
1291
|
+
version = "0.2.4"
|
1292
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1293
|
+
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
1294
|
+
dependencies = [
|
1295
|
+
"portable-atomic",
|
1296
|
+
]
|
1297
|
+
|
1241
1298
|
[[package]]
|
1242
1299
|
name = "proc-macro2"
|
1243
1300
|
version = "1.0.92"
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -8,22 +8,78 @@ At the moment, it only supports iterating rows as either a hash or an array.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
+
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
12
|
+
|
13
|
+
### Row-wise Iteration
|
14
|
+
|
15
|
+
The `each_row` method provides sequential access to individual rows:
|
16
|
+
|
11
17
|
```ruby
|
12
18
|
require "parquet"
|
13
19
|
|
14
|
-
#
|
15
|
-
Parquet.each_row("
|
20
|
+
# Basic usage with default hash output
|
21
|
+
Parquet.each_row("data.parquet") do |row|
|
22
|
+
puts row.inspect # {"id"=>1, "name"=>"name_1"}
|
23
|
+
end
|
16
24
|
|
17
|
-
#
|
18
|
-
Parquet.each_row("
|
25
|
+
# Array output for more efficient memory usage
|
26
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
27
|
+
puts row.inspect # [1, "name_1"]
|
28
|
+
end
|
19
29
|
|
20
|
-
#
|
21
|
-
|
22
|
-
|
30
|
+
# Select specific columns to reduce I/O
|
31
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
32
|
+
puts row.inspect
|
23
33
|
end
|
24
34
|
|
25
|
-
#
|
26
|
-
|
27
|
-
Parquet.each_row(
|
35
|
+
# Reading from IO objects
|
36
|
+
File.open("data.parquet", "rb") do |file|
|
37
|
+
Parquet.each_row(file) do |row|
|
38
|
+
puts row.inspect
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
### Column-wise Iteration
|
44
|
+
|
45
|
+
The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
|
28
46
|
|
47
|
+
```ruby
|
48
|
+
require "parquet"
|
49
|
+
|
50
|
+
# Process columns in batches of 1024 rows
|
51
|
+
Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
|
52
|
+
# With result_type: :hash (default)
|
53
|
+
puts batch.inspect
|
54
|
+
# {
|
55
|
+
# "id" => [1, 2, ..., 1024],
|
56
|
+
# "name" => ["name_1", "name_2", ..., "name_1024"]
|
57
|
+
# }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Array output with specific columns
|
61
|
+
Parquet.each_column("data.parquet",
|
62
|
+
columns: ["id", "name"],
|
63
|
+
result_type: :array,
|
64
|
+
batch_size: 1024) do |batch|
|
65
|
+
puts batch.inspect
|
66
|
+
# [
|
67
|
+
# [1, 2, ..., 1024], # id column
|
68
|
+
# ["name_1", "name_2", ...] # name column
|
69
|
+
# ]
|
70
|
+
end
|
29
71
|
```
|
72
|
+
|
73
|
+
### Arguments
|
74
|
+
|
75
|
+
Both methods accept these common arguments:
|
76
|
+
|
77
|
+
- `input`: Path string or IO-like object containing Parquet data
|
78
|
+
- `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
|
79
|
+
- `columns`: Optional array of column names to read (improves performance)
|
80
|
+
|
81
|
+
Additional arguments for `each_column`:
|
82
|
+
|
83
|
+
- `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
|
84
|
+
|
85
|
+
When no block is given, both methods return an Enumerator.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -9,6 +9,8 @@ crate-type = ["cdylib"]
|
|
9
9
|
[dependencies]
|
10
10
|
ahash = "0.8"
|
11
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
+
arrow-schema = "54.0.0"
|
13
|
+
arrow-array = "54.0.0"
|
12
14
|
bytes = "^1.9"
|
13
15
|
kanal = "0.1.0-pre8"
|
14
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
@@ -16,6 +18,9 @@ rb-sys = "^0.9"
|
|
16
18
|
serde = { version = "1.0", features = ["derive"] }
|
17
19
|
serde_magnus = "0.8.1"
|
18
20
|
thiserror = "2.0"
|
21
|
+
itertools = "^0.14"
|
22
|
+
jiff = "0.1.19"
|
23
|
+
|
19
24
|
|
20
25
|
[target.'cfg(target_os = "linux")'.dependencies]
|
21
26
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -3,9 +3,9 @@ use magnus::{
|
|
3
3
|
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
4
|
};
|
5
5
|
|
6
|
-
use crate::
|
6
|
+
use crate::{ColumnRecord, RowRecord};
|
7
7
|
|
8
|
-
pub struct
|
8
|
+
pub struct RowEnumeratorArgs {
|
9
9
|
pub rb_self: Value,
|
10
10
|
pub to_read: Value,
|
11
11
|
pub result_type: String,
|
@@ -13,9 +13,9 @@ pub struct EnumeratorArgs {
|
|
13
13
|
}
|
14
14
|
|
15
15
|
#[inline]
|
16
|
-
pub fn
|
17
|
-
args:
|
18
|
-
) -> Result<Yield<Box<dyn Iterator<Item =
|
16
|
+
pub fn create_row_enumerator(
|
17
|
+
args: RowEnumeratorArgs,
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
19
|
let kwargs = RHash::new();
|
20
20
|
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
21
21
|
if let Some(columns) = args.columns {
|
@@ -23,6 +23,32 @@ pub fn create_enumerator(
|
|
23
23
|
}
|
24
24
|
let enumerator = args
|
25
25
|
.rb_self
|
26
|
-
.enumeratorize("
|
26
|
+
.enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
28
|
+
}
|
29
|
+
|
30
|
+
pub struct ColumnEnumeratorArgs {
|
31
|
+
pub rb_self: Value,
|
32
|
+
pub to_read: Value,
|
33
|
+
pub result_type: String,
|
34
|
+
pub columns: Option<Vec<String>>,
|
35
|
+
pub batch_size: Option<usize>,
|
36
|
+
}
|
37
|
+
|
38
|
+
#[inline]
|
39
|
+
pub fn create_column_enumerator(
|
40
|
+
args: ColumnEnumeratorArgs,
|
41
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
|
+
let kwargs = RHash::new();
|
43
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
44
|
+
if let Some(columns) = args.columns {
|
45
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
|
+
}
|
47
|
+
if let Some(batch_size) = args.batch_size {
|
48
|
+
kwargs.aset(Symbol::new("batch_size"), batch_size)?;
|
49
|
+
}
|
50
|
+
let enumerator = args
|
51
|
+
.rb_self
|
52
|
+
.enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
|
27
53
|
Ok(Yield::Enumerator(enumerator))
|
28
54
|
}
|
@@ -6,8 +6,14 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
9
|
+
sync::{
|
10
|
+
atomic::{AtomicU32, Ordering},
|
11
|
+
LazyLock, Mutex, OnceLock,
|
12
|
+
},
|
10
13
|
};
|
14
|
+
|
15
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
16
|
+
|
11
17
|
use thiserror::Error;
|
12
18
|
|
13
19
|
#[derive(Debug, Error)]
|
@@ -16,64 +22,116 @@ pub enum CacheError {
|
|
16
22
|
LockError(String),
|
17
23
|
}
|
18
24
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
27
|
|
22
28
|
pub struct StringCache;
|
23
29
|
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
|
+
|
33
|
+
impl StringCacheKey {
|
34
|
+
pub fn new(string: &str) -> Self {
|
35
|
+
let rstr = RString::new(string);
|
36
|
+
let fstr = rstr.to_interned_str();
|
37
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl AsRef<str> for StringCacheKey {
|
42
|
+
fn as_ref(&self) -> &'static str {
|
43
|
+
self.1
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl IntoValue for StringCacheKey {
|
48
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
49
|
+
handle.into_value(self.0)
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::fmt::Debug for StringCacheKey {
|
54
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
|
+
self.1.fmt(f)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
impl PartialEq for StringCacheKey {
|
60
|
+
fn eq(&self, other: &Self) -> bool {
|
61
|
+
self.1 == other.1
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
impl std::cmp::Eq for StringCacheKey {}
|
66
|
+
|
67
|
+
impl std::hash::Hash for StringCacheKey {
|
68
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
69
|
+
self.1.hash(state);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
24
73
|
impl StringCache {
|
25
74
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
75
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
76
|
let mut cache = STRING_CACHE
|
28
77
|
.lock()
|
29
78
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
79
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
80
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
+
Ok(*interned_string)
|
34
83
|
} else {
|
84
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
85
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
86
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
87
|
+
Ok(interned)
|
38
88
|
}
|
39
89
|
}
|
40
90
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
91
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
92
|
let mut cache = STRING_CACHE
|
43
93
|
.lock()
|
44
94
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
95
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
96
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
97
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
98
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
+
result.push(*interned_string);
|
51
101
|
} else {
|
102
|
+
let interned = StringCacheKey::new(&string);
|
52
103
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
104
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
105
|
+
result.push(interned);
|
55
106
|
}
|
56
107
|
}
|
57
108
|
Ok(result)
|
58
109
|
}
|
59
110
|
|
60
|
-
pub fn clear(headers: &[
|
111
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
112
|
let mut cache = STRING_CACHE
|
62
113
|
.lock()
|
63
114
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
115
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
116
|
+
let to_remove: Vec<_> = headers
|
117
|
+
.iter()
|
118
|
+
.filter_map(|header| {
|
119
|
+
let key = header.as_ref();
|
120
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
|
+
if prev_count == 1 {
|
123
|
+
Some(key)
|
124
|
+
} else {
|
125
|
+
None
|
74
126
|
}
|
127
|
+
} else {
|
128
|
+
None
|
75
129
|
}
|
76
|
-
}
|
130
|
+
})
|
131
|
+
.collect();
|
132
|
+
|
133
|
+
for key in to_remove {
|
134
|
+
cache.remove(key);
|
77
135
|
}
|
78
136
|
|
79
137
|
Ok(())
|
@@ -82,13 +140,12 @@ impl StringCache {
|
|
82
140
|
|
83
141
|
pub struct HeaderCacheCleanupIter<I> {
|
84
142
|
pub inner: I,
|
85
|
-
pub headers: OnceLock<Vec
|
143
|
+
pub headers: OnceLock<Vec<StringCacheKey>>,
|
86
144
|
}
|
87
145
|
|
88
146
|
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
89
147
|
type Item = I::Item;
|
90
148
|
|
91
|
-
#[inline(always)]
|
92
149
|
fn next(&mut self) -> Option<Self::Item> {
|
93
150
|
self.inner.next()
|
94
151
|
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -18,6 +18,7 @@ use magnus::{Error, Ruby};
|
|
18
18
|
#[magnus::init]
|
19
19
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
20
20
|
let module = ruby.define_module("Parquet")?;
|
21
|
-
module.define_module_function("each_row", magnus::method!(
|
21
|
+
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
|
+
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
22
23
|
Ok(())
|
23
24
|
}
|