parquet 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d46e0a95ff244189cadf71b3860b03aaf7638629d8c2eeda2800eaae57c0dbd2
|
4
|
+
data.tar.gz: 8bd2a8a29c7b199fcd67f7ababf241ebdcc871a0ff247fbd5320b789a6e6222e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae074d37108a5e12369638a23fcf0962ee416ac901dfcb64daf67b43e76e1566f50f56ccc890581a4cee8b14a0b801ad65dd874dd64c4d0882715e94d35e71b6
|
7
|
+
data.tar.gz: 9e08475b60bf1a5ee5e296aecca6df33a9f13fbe747914eadc45b171b6745feda099a3f8b4a1bca57ce3b3fdb03d4452545170d44d4dc90a4168c7b9194d8c19
|
data/Cargo.lock
CHANGED
@@ -743,12 +743,70 @@ dependencies = [
|
|
743
743
|
"either",
|
744
744
|
]
|
745
745
|
|
746
|
+
[[package]]
|
747
|
+
name = "itertools"
|
748
|
+
version = "0.14.0"
|
749
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
751
|
+
dependencies = [
|
752
|
+
"either",
|
753
|
+
]
|
754
|
+
|
746
755
|
[[package]]
|
747
756
|
name = "itoa"
|
748
757
|
version = "1.0.14"
|
749
758
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
759
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
751
760
|
|
761
|
+
[[package]]
|
762
|
+
name = "jemalloc-sys"
|
763
|
+
version = "0.5.4+5.3.0-patched"
|
764
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
765
|
+
checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
|
766
|
+
dependencies = [
|
767
|
+
"cc",
|
768
|
+
"libc",
|
769
|
+
]
|
770
|
+
|
771
|
+
[[package]]
|
772
|
+
name = "jemallocator"
|
773
|
+
version = "0.5.4"
|
774
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
775
|
+
checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
|
776
|
+
dependencies = [
|
777
|
+
"jemalloc-sys",
|
778
|
+
"libc",
|
779
|
+
]
|
780
|
+
|
781
|
+
[[package]]
|
782
|
+
name = "jiff"
|
783
|
+
version = "0.1.19"
|
784
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
785
|
+
checksum = "943611a469f78ab9afdac9022e473a80fca16a9deca6c5be3eb566d872231e76"
|
786
|
+
dependencies = [
|
787
|
+
"jiff-tzdb-platform",
|
788
|
+
"log",
|
789
|
+
"portable-atomic",
|
790
|
+
"portable-atomic-util",
|
791
|
+
"serde",
|
792
|
+
"windows-sys",
|
793
|
+
]
|
794
|
+
|
795
|
+
[[package]]
|
796
|
+
name = "jiff-tzdb"
|
797
|
+
version = "0.1.1"
|
798
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
799
|
+
checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653"
|
800
|
+
|
801
|
+
[[package]]
|
802
|
+
name = "jiff-tzdb-platform"
|
803
|
+
version = "0.1.1"
|
804
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
805
|
+
checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329"
|
806
|
+
dependencies = [
|
807
|
+
"jiff-tzdb",
|
808
|
+
]
|
809
|
+
|
752
810
|
[[package]]
|
753
811
|
name = "jobserver"
|
754
812
|
version = "0.1.32"
|
@@ -876,6 +934,16 @@ version = "0.2.11"
|
|
876
934
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
877
935
|
checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
|
878
936
|
|
937
|
+
[[package]]
|
938
|
+
name = "libmimalloc-sys"
|
939
|
+
version = "0.1.39"
|
940
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
941
|
+
checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
|
942
|
+
dependencies = [
|
943
|
+
"cc",
|
944
|
+
"libc",
|
945
|
+
]
|
946
|
+
|
879
947
|
[[package]]
|
880
948
|
name = "litemap"
|
881
949
|
version = "0.7.4"
|
@@ -948,6 +1016,15 @@ version = "2.7.4"
|
|
948
1016
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
949
1017
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
950
1018
|
|
1019
|
+
[[package]]
|
1020
|
+
name = "mimalloc"
|
1021
|
+
version = "0.1.43"
|
1022
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1023
|
+
checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
|
1024
|
+
dependencies = [
|
1025
|
+
"libmimalloc-sys",
|
1026
|
+
]
|
1027
|
+
|
951
1028
|
[[package]]
|
952
1029
|
name = "minimal-lexical"
|
953
1030
|
version = "0.2.1"
|
@@ -1119,15 +1196,21 @@ dependencies = [
|
|
1119
1196
|
name = "parquet"
|
1120
1197
|
version = "0.1.0"
|
1121
1198
|
dependencies = [
|
1199
|
+
"ahash",
|
1200
|
+
"arrow-array",
|
1201
|
+
"arrow-schema",
|
1122
1202
|
"bytes",
|
1203
|
+
"itertools 0.14.0",
|
1204
|
+
"jemallocator",
|
1205
|
+
"jiff",
|
1123
1206
|
"kanal",
|
1124
1207
|
"magnus 0.7.1",
|
1208
|
+
"mimalloc",
|
1125
1209
|
"parquet 54.0.0",
|
1126
1210
|
"rb-sys",
|
1127
1211
|
"serde",
|
1128
1212
|
"serde_magnus",
|
1129
1213
|
"thiserror",
|
1130
|
-
"xxhash-rust",
|
1131
1214
|
]
|
1132
1215
|
|
1133
1216
|
[[package]]
|
@@ -1197,6 +1280,21 @@ version = "0.3.31"
|
|
1197
1280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1198
1281
|
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
1199
1282
|
|
1283
|
+
[[package]]
|
1284
|
+
name = "portable-atomic"
|
1285
|
+
version = "1.10.0"
|
1286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1287
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
1288
|
+
|
1289
|
+
[[package]]
|
1290
|
+
name = "portable-atomic-util"
|
1291
|
+
version = "0.2.4"
|
1292
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1293
|
+
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
1294
|
+
dependencies = [
|
1295
|
+
"portable-atomic",
|
1296
|
+
]
|
1297
|
+
|
1200
1298
|
[[package]]
|
1201
1299
|
name = "proc-macro2"
|
1202
1300
|
version = "1.0.92"
|
@@ -1796,12 +1894,6 @@ version = "0.5.5"
|
|
1796
1894
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1797
1895
|
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
1798
1896
|
|
1799
|
-
[[package]]
|
1800
|
-
name = "xxhash-rust"
|
1801
|
-
version = "0.8.14"
|
1802
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1803
|
-
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
1804
|
-
|
1805
1897
|
[[package]]
|
1806
1898
|
name = "yoke"
|
1807
1899
|
version = "0.7.5"
|
data/Gemfile
CHANGED
@@ -6,7 +6,12 @@ gem "rake"
|
|
6
6
|
# Use local version of parquet
|
7
7
|
gemspec
|
8
8
|
|
9
|
-
group :development
|
10
|
-
gem "minitest", "~> 5.0"
|
9
|
+
group :development do
|
11
10
|
gem "benchmark-ips", "~> 2.12"
|
11
|
+
# gem "polars-df"
|
12
|
+
gem "duckdb"
|
13
|
+
end
|
14
|
+
|
15
|
+
group :test do
|
16
|
+
gem "minitest", "~> 5.0"
|
12
17
|
end
|
data/README.md
CHANGED
@@ -8,22 +8,78 @@ At the moment, it only supports iterating rows as either a hash or an array.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
+
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
12
|
+
|
13
|
+
### Row-wise Iteration
|
14
|
+
|
15
|
+
The `each_row` method provides sequential access to individual rows:
|
16
|
+
|
11
17
|
```ruby
|
12
18
|
require "parquet"
|
13
19
|
|
14
|
-
#
|
15
|
-
Parquet.each_row("
|
20
|
+
# Basic usage with default hash output
|
21
|
+
Parquet.each_row("data.parquet") do |row|
|
22
|
+
puts row.inspect # {"id"=>1, "name"=>"name_1"}
|
23
|
+
end
|
16
24
|
|
17
|
-
#
|
18
|
-
Parquet.each_row("
|
25
|
+
# Array output for more efficient memory usage
|
26
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
27
|
+
puts row.inspect # [1, "name_1"]
|
28
|
+
end
|
19
29
|
|
20
|
-
#
|
21
|
-
|
22
|
-
|
30
|
+
# Select specific columns to reduce I/O
|
31
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
32
|
+
puts row.inspect
|
23
33
|
end
|
24
34
|
|
25
|
-
#
|
26
|
-
|
27
|
-
Parquet.each_row(
|
35
|
+
# Reading from IO objects
|
36
|
+
File.open("data.parquet", "rb") do |file|
|
37
|
+
Parquet.each_row(file) do |row|
|
38
|
+
puts row.inspect
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
### Column-wise Iteration
|
44
|
+
|
45
|
+
The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
|
28
46
|
|
47
|
+
```ruby
|
48
|
+
require "parquet"
|
49
|
+
|
50
|
+
# Process columns in batches of 1024 rows
|
51
|
+
Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
|
52
|
+
# With result_type: :hash (default)
|
53
|
+
puts batch.inspect
|
54
|
+
# {
|
55
|
+
# "id" => [1, 2, ..., 1024],
|
56
|
+
# "name" => ["name_1", "name_2", ..., "name_1024"]
|
57
|
+
# }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Array output with specific columns
|
61
|
+
Parquet.each_column("data.parquet",
|
62
|
+
columns: ["id", "name"],
|
63
|
+
result_type: :array,
|
64
|
+
batch_size: 1024) do |batch|
|
65
|
+
puts batch.inspect
|
66
|
+
# [
|
67
|
+
# [1, 2, ..., 1024], # id column
|
68
|
+
# ["name_1", "name_2", ...] # name column
|
69
|
+
# ]
|
70
|
+
end
|
29
71
|
```
|
72
|
+
|
73
|
+
### Arguments
|
74
|
+
|
75
|
+
Both methods accept these common arguments:
|
76
|
+
|
77
|
+
- `input`: Path string or IO-like object containing Parquet data
|
78
|
+
- `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
|
79
|
+
- `columns`: Optional array of column names to read (improves performance)
|
80
|
+
|
81
|
+
Additional arguments for `each_column`:
|
82
|
+
|
83
|
+
- `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
|
84
|
+
|
85
|
+
When no block is given, both methods return an Enumerator.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -7,7 +7,10 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
+
ahash = "0.8"
|
10
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
+
arrow-schema = "54.0.0"
|
13
|
+
arrow-array = "54.0.0"
|
11
14
|
bytes = "^1.9"
|
12
15
|
kanal = "0.1.0-pre8"
|
13
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
@@ -15,4 +18,12 @@ rb-sys = "^0.9"
|
|
15
18
|
serde = { version = "1.0", features = ["derive"] }
|
16
19
|
serde_magnus = "0.8.1"
|
17
20
|
thiserror = "2.0"
|
18
|
-
|
21
|
+
itertools = "^0.14"
|
22
|
+
jiff = "0.1.19"
|
23
|
+
|
24
|
+
|
25
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
26
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
27
|
+
|
28
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
29
|
+
mimalloc = { version = "0.1", default-features = false }
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#[cfg(target_os = "linux")]
|
2
|
+
use jemallocator::Jemalloc;
|
3
|
+
|
4
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
5
|
+
use mimalloc::MiMalloc;
|
6
|
+
|
7
|
+
#[global_allocator]
|
8
|
+
#[cfg(target_os = "linux")]
|
9
|
+
static ALLOC: Jemalloc = Jemalloc;
|
10
|
+
|
11
|
+
#[global_allocator]
|
12
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
13
|
+
static ALLOC: MiMalloc = MiMalloc;
|
@@ -0,0 +1,54 @@
|
|
1
|
+
use ahash::RandomState;
|
2
|
+
use magnus::{
|
3
|
+
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
|
+
};
|
5
|
+
|
6
|
+
use crate::{ColumnRecord, RowRecord};
|
7
|
+
|
8
|
+
pub struct RowEnumeratorArgs {
|
9
|
+
pub rb_self: Value,
|
10
|
+
pub to_read: Value,
|
11
|
+
pub result_type: String,
|
12
|
+
pub columns: Option<Vec<String>>,
|
13
|
+
}
|
14
|
+
|
15
|
+
#[inline]
|
16
|
+
pub fn create_row_enumerator(
|
17
|
+
args: RowEnumeratorArgs,
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
|
+
let kwargs = RHash::new();
|
20
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
21
|
+
if let Some(columns) = args.columns {
|
22
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
23
|
+
}
|
24
|
+
let enumerator = args
|
25
|
+
.rb_self
|
26
|
+
.enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
28
|
+
}
|
29
|
+
|
30
|
+
pub struct ColumnEnumeratorArgs {
|
31
|
+
pub rb_self: Value,
|
32
|
+
pub to_read: Value,
|
33
|
+
pub result_type: String,
|
34
|
+
pub columns: Option<Vec<String>>,
|
35
|
+
pub batch_size: Option<usize>,
|
36
|
+
}
|
37
|
+
|
38
|
+
#[inline]
|
39
|
+
pub fn create_column_enumerator(
|
40
|
+
args: ColumnEnumeratorArgs,
|
41
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
|
+
let kwargs = RHash::new();
|
43
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
44
|
+
if let Some(columns) = args.columns {
|
45
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
|
+
}
|
47
|
+
if let Some(batch_size) = args.batch_size {
|
48
|
+
kwargs.aset(Symbol::new("batch_size"), batch_size)?;
|
49
|
+
}
|
50
|
+
let enumerator = args
|
51
|
+
.rb_self
|
52
|
+
.enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
|
53
|
+
Ok(Yield::Enumerator(enumerator))
|
54
|
+
}
|
@@ -6,8 +6,14 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
9
|
+
sync::{
|
10
|
+
atomic::{AtomicU32, Ordering},
|
11
|
+
LazyLock, Mutex, OnceLock,
|
12
|
+
},
|
10
13
|
};
|
14
|
+
|
15
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
16
|
+
|
11
17
|
use thiserror::Error;
|
12
18
|
|
13
19
|
#[derive(Debug, Error)]
|
@@ -16,66 +22,139 @@ pub enum CacheError {
|
|
16
22
|
LockError(String),
|
17
23
|
}
|
18
24
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
27
|
|
22
28
|
pub struct StringCache;
|
23
29
|
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
|
+
|
33
|
+
impl StringCacheKey {
|
34
|
+
pub fn new(string: &str) -> Self {
|
35
|
+
let rstr = RString::new(string);
|
36
|
+
let fstr = rstr.to_interned_str();
|
37
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl AsRef<str> for StringCacheKey {
|
42
|
+
fn as_ref(&self) -> &'static str {
|
43
|
+
self.1
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl IntoValue for StringCacheKey {
|
48
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
49
|
+
handle.into_value(self.0)
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::fmt::Debug for StringCacheKey {
|
54
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
|
+
self.1.fmt(f)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
impl PartialEq for StringCacheKey {
|
60
|
+
fn eq(&self, other: &Self) -> bool {
|
61
|
+
self.1 == other.1
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
impl std::cmp::Eq for StringCacheKey {}
|
66
|
+
|
67
|
+
impl std::hash::Hash for StringCacheKey {
|
68
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
69
|
+
self.1.hash(state);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
24
73
|
impl StringCache {
|
25
74
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
75
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
76
|
let mut cache = STRING_CACHE
|
28
77
|
.lock()
|
29
78
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
79
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
80
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
+
Ok(*interned_string)
|
34
83
|
} else {
|
84
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
85
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
86
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
87
|
+
Ok(interned)
|
38
88
|
}
|
39
89
|
}
|
40
90
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
91
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
92
|
let mut cache = STRING_CACHE
|
43
93
|
.lock()
|
44
94
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
95
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
96
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
97
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
98
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
+
result.push(*interned_string);
|
51
101
|
} else {
|
102
|
+
let interned = StringCacheKey::new(&string);
|
52
103
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
104
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
105
|
+
result.push(interned);
|
55
106
|
}
|
56
107
|
}
|
57
108
|
Ok(result)
|
58
109
|
}
|
59
110
|
|
60
|
-
pub fn clear(headers: &[
|
111
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
112
|
let mut cache = STRING_CACHE
|
62
113
|
.lock()
|
63
114
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
115
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
116
|
+
let to_remove: Vec<_> = headers
|
117
|
+
.iter()
|
118
|
+
.filter_map(|header| {
|
119
|
+
let key = header.as_ref();
|
120
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
|
+
if prev_count == 1 {
|
123
|
+
Some(key)
|
124
|
+
} else {
|
125
|
+
None
|
74
126
|
}
|
127
|
+
} else {
|
128
|
+
None
|
75
129
|
}
|
76
|
-
}
|
130
|
+
})
|
131
|
+
.collect();
|
132
|
+
|
133
|
+
for key in to_remove {
|
134
|
+
cache.remove(key);
|
77
135
|
}
|
78
136
|
|
79
137
|
Ok(())
|
80
138
|
}
|
81
139
|
}
|
140
|
+
|
141
|
+
pub struct HeaderCacheCleanupIter<I> {
|
142
|
+
pub inner: I,
|
143
|
+
pub headers: OnceLock<Vec<StringCacheKey>>,
|
144
|
+
}
|
145
|
+
|
146
|
+
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
147
|
+
type Item = I::Item;
|
148
|
+
|
149
|
+
fn next(&mut self) -> Option<Self::Item> {
|
150
|
+
self.inner.next()
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
155
|
+
fn drop(&mut self) {
|
156
|
+
if let Some(headers) = self.headers.get() {
|
157
|
+
StringCache::clear(&headers).unwrap();
|
158
|
+
}
|
159
|
+
}
|
160
|
+
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -1,9 +1,16 @@
|
|
1
|
+
mod allocator;
|
2
|
+
mod enumerator;
|
1
3
|
pub mod header_cache;
|
2
4
|
mod reader;
|
5
|
+
mod ruby_integration;
|
3
6
|
mod ruby_reader;
|
7
|
+
mod types;
|
4
8
|
mod utils;
|
5
9
|
|
10
|
+
use crate::enumerator::*;
|
6
11
|
use crate::reader::*;
|
12
|
+
use crate::ruby_integration::*;
|
13
|
+
use crate::types::*;
|
7
14
|
|
8
15
|
use magnus::{Error, Ruby};
|
9
16
|
|
@@ -11,6 +18,7 @@ use magnus::{Error, Ruby};
|
|
11
18
|
#[magnus::init]
|
12
19
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
13
20
|
let module = ruby.define_module("Parquet")?;
|
14
|
-
module.define_module_function("each_row", magnus::method!(
|
21
|
+
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
|
+
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
15
23
|
Ok(())
|
16
24
|
}
|