parquet 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +57 -0
- data/Gemfile +1 -1
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +5 -0
- data/ext/parquet/src/enumerator.rs +32 -6
- data/ext/parquet/src/header_cache.rs +85 -28
- data/ext/parquet/src/lib.rs +2 -1
- data/ext/parquet/src/reader.rs +218 -13
- data/ext/parquet/src/types.rs +647 -15
- data/ext/parquet/src/utils.rs +57 -3
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +22 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
|
4
|
+
data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
|
7
|
+
data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
|
data/Cargo.lock
CHANGED
@@ -743,6 +743,15 @@ dependencies = [
|
|
743
743
|
"either",
|
744
744
|
]
|
745
745
|
|
746
|
+
[[package]]
|
747
|
+
name = "itertools"
|
748
|
+
version = "0.14.0"
|
749
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
751
|
+
dependencies = [
|
752
|
+
"either",
|
753
|
+
]
|
754
|
+
|
746
755
|
[[package]]
|
747
756
|
name = "itoa"
|
748
757
|
version = "1.0.14"
|
@@ -769,6 +778,35 @@ dependencies = [
|
|
769
778
|
"libc",
|
770
779
|
]
|
771
780
|
|
781
|
+
[[package]]
|
782
|
+
name = "jiff"
|
783
|
+
version = "0.1.19"
|
784
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
785
|
+
checksum = "943611a469f78ab9afdac9022e473a80fca16a9deca6c5be3eb566d872231e76"
|
786
|
+
dependencies = [
|
787
|
+
"jiff-tzdb-platform",
|
788
|
+
"log",
|
789
|
+
"portable-atomic",
|
790
|
+
"portable-atomic-util",
|
791
|
+
"serde",
|
792
|
+
"windows-sys",
|
793
|
+
]
|
794
|
+
|
795
|
+
[[package]]
|
796
|
+
name = "jiff-tzdb"
|
797
|
+
version = "0.1.1"
|
798
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
799
|
+
checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653"
|
800
|
+
|
801
|
+
[[package]]
|
802
|
+
name = "jiff-tzdb-platform"
|
803
|
+
version = "0.1.1"
|
804
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
805
|
+
checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329"
|
806
|
+
dependencies = [
|
807
|
+
"jiff-tzdb",
|
808
|
+
]
|
809
|
+
|
772
810
|
[[package]]
|
773
811
|
name = "jobserver"
|
774
812
|
version = "0.1.32"
|
@@ -1159,8 +1197,12 @@ name = "parquet"
|
|
1159
1197
|
version = "0.1.0"
|
1160
1198
|
dependencies = [
|
1161
1199
|
"ahash",
|
1200
|
+
"arrow-array",
|
1201
|
+
"arrow-schema",
|
1162
1202
|
"bytes",
|
1203
|
+
"itertools 0.14.0",
|
1163
1204
|
"jemallocator",
|
1205
|
+
"jiff",
|
1164
1206
|
"kanal",
|
1165
1207
|
"magnus 0.7.1",
|
1166
1208
|
"mimalloc",
|
@@ -1238,6 +1280,21 @@ version = "0.3.31"
|
|
1238
1280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1239
1281
|
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
1240
1282
|
|
1283
|
+
[[package]]
|
1284
|
+
name = "portable-atomic"
|
1285
|
+
version = "1.10.0"
|
1286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1287
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
1288
|
+
|
1289
|
+
[[package]]
|
1290
|
+
name = "portable-atomic-util"
|
1291
|
+
version = "0.2.4"
|
1292
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1293
|
+
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
1294
|
+
dependencies = [
|
1295
|
+
"portable-atomic",
|
1296
|
+
]
|
1297
|
+
|
1241
1298
|
[[package]]
|
1242
1299
|
name = "proc-macro2"
|
1243
1300
|
version = "1.0.92"
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -8,22 +8,78 @@ At the moment, it only supports iterating rows as either a hash or an array.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
+
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
12
|
+
|
13
|
+
### Row-wise Iteration
|
14
|
+
|
15
|
+
The `each_row` method provides sequential access to individual rows:
|
16
|
+
|
11
17
|
```ruby
|
12
18
|
require "parquet"
|
13
19
|
|
14
|
-
#
|
15
|
-
Parquet.each_row("
|
20
|
+
# Basic usage with default hash output
|
21
|
+
Parquet.each_row("data.parquet") do |row|
|
22
|
+
puts row.inspect # {"id"=>1, "name"=>"name_1"}
|
23
|
+
end
|
16
24
|
|
17
|
-
#
|
18
|
-
Parquet.each_row("
|
25
|
+
# Array output for more efficient memory usage
|
26
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
27
|
+
puts row.inspect # [1, "name_1"]
|
28
|
+
end
|
19
29
|
|
20
|
-
#
|
21
|
-
|
22
|
-
|
30
|
+
# Select specific columns to reduce I/O
|
31
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
32
|
+
puts row.inspect
|
23
33
|
end
|
24
34
|
|
25
|
-
#
|
26
|
-
|
27
|
-
Parquet.each_row(
|
35
|
+
# Reading from IO objects
|
36
|
+
File.open("data.parquet", "rb") do |file|
|
37
|
+
Parquet.each_row(file) do |row|
|
38
|
+
puts row.inspect
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
### Column-wise Iteration
|
44
|
+
|
45
|
+
The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
|
28
46
|
|
47
|
+
```ruby
|
48
|
+
require "parquet"
|
49
|
+
|
50
|
+
# Process columns in batches of 1024 rows
|
51
|
+
Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
|
52
|
+
# With result_type: :hash (default)
|
53
|
+
puts batch.inspect
|
54
|
+
# {
|
55
|
+
# "id" => [1, 2, ..., 1024],
|
56
|
+
# "name" => ["name_1", "name_2", ..., "name_1024"]
|
57
|
+
# }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Array output with specific columns
|
61
|
+
Parquet.each_column("data.parquet",
|
62
|
+
columns: ["id", "name"],
|
63
|
+
result_type: :array,
|
64
|
+
batch_size: 1024) do |batch|
|
65
|
+
puts batch.inspect
|
66
|
+
# [
|
67
|
+
# [1, 2, ..., 1024], # id column
|
68
|
+
# ["name_1", "name_2", ...] # name column
|
69
|
+
# ]
|
70
|
+
end
|
29
71
|
```
|
72
|
+
|
73
|
+
### Arguments
|
74
|
+
|
75
|
+
Both methods accept these common arguments:
|
76
|
+
|
77
|
+
- `input`: Path string or IO-like object containing Parquet data
|
78
|
+
- `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
|
79
|
+
- `columns`: Optional array of column names to read (improves performance)
|
80
|
+
|
81
|
+
Additional arguments for `each_column`:
|
82
|
+
|
83
|
+
- `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
|
84
|
+
|
85
|
+
When no block is given, both methods return an Enumerator.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -9,6 +9,8 @@ crate-type = ["cdylib"]
|
|
9
9
|
[dependencies]
|
10
10
|
ahash = "0.8"
|
11
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
+
arrow-schema = "54.0.0"
|
13
|
+
arrow-array = "54.0.0"
|
12
14
|
bytes = "^1.9"
|
13
15
|
kanal = "0.1.0-pre8"
|
14
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
@@ -16,6 +18,9 @@ rb-sys = "^0.9"
|
|
16
18
|
serde = { version = "1.0", features = ["derive"] }
|
17
19
|
serde_magnus = "0.8.1"
|
18
20
|
thiserror = "2.0"
|
21
|
+
itertools = "^0.14"
|
22
|
+
jiff = "0.1.19"
|
23
|
+
|
19
24
|
|
20
25
|
[target.'cfg(target_os = "linux")'.dependencies]
|
21
26
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -3,9 +3,9 @@ use magnus::{
|
|
3
3
|
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
4
|
};
|
5
5
|
|
6
|
-
use crate::
|
6
|
+
use crate::{ColumnRecord, RowRecord};
|
7
7
|
|
8
|
-
pub struct
|
8
|
+
pub struct RowEnumeratorArgs {
|
9
9
|
pub rb_self: Value,
|
10
10
|
pub to_read: Value,
|
11
11
|
pub result_type: String,
|
@@ -13,9 +13,9 @@ pub struct EnumeratorArgs {
|
|
13
13
|
}
|
14
14
|
|
15
15
|
#[inline]
|
16
|
-
pub fn
|
17
|
-
args:
|
18
|
-
) -> Result<Yield<Box<dyn Iterator<Item =
|
16
|
+
pub fn create_row_enumerator(
|
17
|
+
args: RowEnumeratorArgs,
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
19
|
let kwargs = RHash::new();
|
20
20
|
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
21
21
|
if let Some(columns) = args.columns {
|
@@ -23,6 +23,32 @@ pub fn create_enumerator(
|
|
23
23
|
}
|
24
24
|
let enumerator = args
|
25
25
|
.rb_self
|
26
|
-
.enumeratorize("
|
26
|
+
.enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
28
|
+
}
|
29
|
+
|
30
|
+
pub struct ColumnEnumeratorArgs {
|
31
|
+
pub rb_self: Value,
|
32
|
+
pub to_read: Value,
|
33
|
+
pub result_type: String,
|
34
|
+
pub columns: Option<Vec<String>>,
|
35
|
+
pub batch_size: Option<usize>,
|
36
|
+
}
|
37
|
+
|
38
|
+
#[inline]
|
39
|
+
pub fn create_column_enumerator(
|
40
|
+
args: ColumnEnumeratorArgs,
|
41
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
|
+
let kwargs = RHash::new();
|
43
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
44
|
+
if let Some(columns) = args.columns {
|
45
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
|
+
}
|
47
|
+
if let Some(batch_size) = args.batch_size {
|
48
|
+
kwargs.aset(Symbol::new("batch_size"), batch_size)?;
|
49
|
+
}
|
50
|
+
let enumerator = args
|
51
|
+
.rb_self
|
52
|
+
.enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
|
27
53
|
Ok(Yield::Enumerator(enumerator))
|
28
54
|
}
|
@@ -6,8 +6,14 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
9
|
+
sync::{
|
10
|
+
atomic::{AtomicU32, Ordering},
|
11
|
+
LazyLock, Mutex, OnceLock,
|
12
|
+
},
|
10
13
|
};
|
14
|
+
|
15
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
16
|
+
|
11
17
|
use thiserror::Error;
|
12
18
|
|
13
19
|
#[derive(Debug, Error)]
|
@@ -16,64 +22,116 @@ pub enum CacheError {
|
|
16
22
|
LockError(String),
|
17
23
|
}
|
18
24
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
27
|
|
22
28
|
pub struct StringCache;
|
23
29
|
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
|
+
|
33
|
+
impl StringCacheKey {
|
34
|
+
pub fn new(string: &str) -> Self {
|
35
|
+
let rstr = RString::new(string);
|
36
|
+
let fstr = rstr.to_interned_str();
|
37
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl AsRef<str> for StringCacheKey {
|
42
|
+
fn as_ref(&self) -> &'static str {
|
43
|
+
self.1
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl IntoValue for StringCacheKey {
|
48
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
49
|
+
handle.into_value(self.0)
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::fmt::Debug for StringCacheKey {
|
54
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
|
+
self.1.fmt(f)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
impl PartialEq for StringCacheKey {
|
60
|
+
fn eq(&self, other: &Self) -> bool {
|
61
|
+
self.1 == other.1
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
impl std::cmp::Eq for StringCacheKey {}
|
66
|
+
|
67
|
+
impl std::hash::Hash for StringCacheKey {
|
68
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
69
|
+
self.1.hash(state);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
24
73
|
impl StringCache {
|
25
74
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
75
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
76
|
let mut cache = STRING_CACHE
|
28
77
|
.lock()
|
29
78
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
79
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
80
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
+
Ok(*interned_string)
|
34
83
|
} else {
|
84
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
85
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
86
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
87
|
+
Ok(interned)
|
38
88
|
}
|
39
89
|
}
|
40
90
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
91
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
92
|
let mut cache = STRING_CACHE
|
43
93
|
.lock()
|
44
94
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
95
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
96
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
97
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
98
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
+
result.push(*interned_string);
|
51
101
|
} else {
|
102
|
+
let interned = StringCacheKey::new(&string);
|
52
103
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
104
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
105
|
+
result.push(interned);
|
55
106
|
}
|
56
107
|
}
|
57
108
|
Ok(result)
|
58
109
|
}
|
59
110
|
|
60
|
-
pub fn clear(headers: &[
|
111
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
112
|
let mut cache = STRING_CACHE
|
62
113
|
.lock()
|
63
114
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
115
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
116
|
+
let to_remove: Vec<_> = headers
|
117
|
+
.iter()
|
118
|
+
.filter_map(|header| {
|
119
|
+
let key = header.as_ref();
|
120
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
|
+
if prev_count == 1 {
|
123
|
+
Some(key)
|
124
|
+
} else {
|
125
|
+
None
|
74
126
|
}
|
127
|
+
} else {
|
128
|
+
None
|
75
129
|
}
|
76
|
-
}
|
130
|
+
})
|
131
|
+
.collect();
|
132
|
+
|
133
|
+
for key in to_remove {
|
134
|
+
cache.remove(key);
|
77
135
|
}
|
78
136
|
|
79
137
|
Ok(())
|
@@ -82,13 +140,12 @@ impl StringCache {
|
|
82
140
|
|
83
141
|
pub struct HeaderCacheCleanupIter<I> {
|
84
142
|
pub inner: I,
|
85
|
-
pub headers: OnceLock<Vec
|
143
|
+
pub headers: OnceLock<Vec<StringCacheKey>>,
|
86
144
|
}
|
87
145
|
|
88
146
|
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
89
147
|
type Item = I::Item;
|
90
148
|
|
91
|
-
#[inline(always)]
|
92
149
|
fn next(&mut self) -> Option<Self::Item> {
|
93
150
|
self.inner.next()
|
94
151
|
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -18,6 +18,7 @@ use magnus::{Error, Ruby};
|
|
18
18
|
#[magnus::init]
|
19
19
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
20
20
|
let module = ruby.define_module("Parquet")?;
|
21
|
-
module.define_module_function("each_row", magnus::method!(
|
21
|
+
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
|
+
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
22
23
|
Ok(())
|
23
24
|
}
|