parquet 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d46e0a95ff244189cadf71b3860b03aaf7638629d8c2eeda2800eaae57c0dbd2
|
4
|
+
data.tar.gz: 8bd2a8a29c7b199fcd67f7ababf241ebdcc871a0ff247fbd5320b789a6e6222e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae074d37108a5e12369638a23fcf0962ee416ac901dfcb64daf67b43e76e1566f50f56ccc890581a4cee8b14a0b801ad65dd874dd64c4d0882715e94d35e71b6
|
7
|
+
data.tar.gz: 9e08475b60bf1a5ee5e296aecca6df33a9f13fbe747914eadc45b171b6745feda099a3f8b4a1bca57ce3b3fdb03d4452545170d44d4dc90a4168c7b9194d8c19
|
data/Cargo.lock
CHANGED
@@ -743,12 +743,70 @@ dependencies = [
|
|
743
743
|
"either",
|
744
744
|
]
|
745
745
|
|
746
|
+
[[package]]
|
747
|
+
name = "itertools"
|
748
|
+
version = "0.14.0"
|
749
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
|
+
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
751
|
+
dependencies = [
|
752
|
+
"either",
|
753
|
+
]
|
754
|
+
|
746
755
|
[[package]]
|
747
756
|
name = "itoa"
|
748
757
|
version = "1.0.14"
|
749
758
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
759
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
751
760
|
|
761
|
+
[[package]]
|
762
|
+
name = "jemalloc-sys"
|
763
|
+
version = "0.5.4+5.3.0-patched"
|
764
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
765
|
+
checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
|
766
|
+
dependencies = [
|
767
|
+
"cc",
|
768
|
+
"libc",
|
769
|
+
]
|
770
|
+
|
771
|
+
[[package]]
|
772
|
+
name = "jemallocator"
|
773
|
+
version = "0.5.4"
|
774
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
775
|
+
checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
|
776
|
+
dependencies = [
|
777
|
+
"jemalloc-sys",
|
778
|
+
"libc",
|
779
|
+
]
|
780
|
+
|
781
|
+
[[package]]
|
782
|
+
name = "jiff"
|
783
|
+
version = "0.1.19"
|
784
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
785
|
+
checksum = "943611a469f78ab9afdac9022e473a80fca16a9deca6c5be3eb566d872231e76"
|
786
|
+
dependencies = [
|
787
|
+
"jiff-tzdb-platform",
|
788
|
+
"log",
|
789
|
+
"portable-atomic",
|
790
|
+
"portable-atomic-util",
|
791
|
+
"serde",
|
792
|
+
"windows-sys",
|
793
|
+
]
|
794
|
+
|
795
|
+
[[package]]
|
796
|
+
name = "jiff-tzdb"
|
797
|
+
version = "0.1.1"
|
798
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
799
|
+
checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653"
|
800
|
+
|
801
|
+
[[package]]
|
802
|
+
name = "jiff-tzdb-platform"
|
803
|
+
version = "0.1.1"
|
804
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
805
|
+
checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329"
|
806
|
+
dependencies = [
|
807
|
+
"jiff-tzdb",
|
808
|
+
]
|
809
|
+
|
752
810
|
[[package]]
|
753
811
|
name = "jobserver"
|
754
812
|
version = "0.1.32"
|
@@ -876,6 +934,16 @@ version = "0.2.11"
|
|
876
934
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
877
935
|
checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
|
878
936
|
|
937
|
+
[[package]]
|
938
|
+
name = "libmimalloc-sys"
|
939
|
+
version = "0.1.39"
|
940
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
941
|
+
checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
|
942
|
+
dependencies = [
|
943
|
+
"cc",
|
944
|
+
"libc",
|
945
|
+
]
|
946
|
+
|
879
947
|
[[package]]
|
880
948
|
name = "litemap"
|
881
949
|
version = "0.7.4"
|
@@ -948,6 +1016,15 @@ version = "2.7.4"
|
|
948
1016
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
949
1017
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
950
1018
|
|
1019
|
+
[[package]]
|
1020
|
+
name = "mimalloc"
|
1021
|
+
version = "0.1.43"
|
1022
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1023
|
+
checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
|
1024
|
+
dependencies = [
|
1025
|
+
"libmimalloc-sys",
|
1026
|
+
]
|
1027
|
+
|
951
1028
|
[[package]]
|
952
1029
|
name = "minimal-lexical"
|
953
1030
|
version = "0.2.1"
|
@@ -1119,15 +1196,21 @@ dependencies = [
|
|
1119
1196
|
name = "parquet"
|
1120
1197
|
version = "0.1.0"
|
1121
1198
|
dependencies = [
|
1199
|
+
"ahash",
|
1200
|
+
"arrow-array",
|
1201
|
+
"arrow-schema",
|
1122
1202
|
"bytes",
|
1203
|
+
"itertools 0.14.0",
|
1204
|
+
"jemallocator",
|
1205
|
+
"jiff",
|
1123
1206
|
"kanal",
|
1124
1207
|
"magnus 0.7.1",
|
1208
|
+
"mimalloc",
|
1125
1209
|
"parquet 54.0.0",
|
1126
1210
|
"rb-sys",
|
1127
1211
|
"serde",
|
1128
1212
|
"serde_magnus",
|
1129
1213
|
"thiserror",
|
1130
|
-
"xxhash-rust",
|
1131
1214
|
]
|
1132
1215
|
|
1133
1216
|
[[package]]
|
@@ -1197,6 +1280,21 @@ version = "0.3.31"
|
|
1197
1280
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1198
1281
|
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
|
1199
1282
|
|
1283
|
+
[[package]]
|
1284
|
+
name = "portable-atomic"
|
1285
|
+
version = "1.10.0"
|
1286
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1287
|
+
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
1288
|
+
|
1289
|
+
[[package]]
|
1290
|
+
name = "portable-atomic-util"
|
1291
|
+
version = "0.2.4"
|
1292
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1293
|
+
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
1294
|
+
dependencies = [
|
1295
|
+
"portable-atomic",
|
1296
|
+
]
|
1297
|
+
|
1200
1298
|
[[package]]
|
1201
1299
|
name = "proc-macro2"
|
1202
1300
|
version = "1.0.92"
|
@@ -1796,12 +1894,6 @@ version = "0.5.5"
|
|
1796
1894
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1797
1895
|
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
1798
1896
|
|
1799
|
-
[[package]]
|
1800
|
-
name = "xxhash-rust"
|
1801
|
-
version = "0.8.14"
|
1802
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1803
|
-
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
1804
|
-
|
1805
1897
|
[[package]]
|
1806
1898
|
name = "yoke"
|
1807
1899
|
version = "0.7.5"
|
data/Gemfile
CHANGED
@@ -6,7 +6,12 @@ gem "rake"
|
|
6
6
|
# Use local version of parquet
|
7
7
|
gemspec
|
8
8
|
|
9
|
-
group :development
|
10
|
-
gem "minitest", "~> 5.0"
|
9
|
+
group :development do
|
11
10
|
gem "benchmark-ips", "~> 2.12"
|
11
|
+
# gem "polars-df"
|
12
|
+
gem "duckdb"
|
13
|
+
end
|
14
|
+
|
15
|
+
group :test do
|
16
|
+
gem "minitest", "~> 5.0"
|
12
17
|
end
|
data/README.md
CHANGED
@@ -8,22 +8,78 @@ At the moment, it only supports iterating rows as either a hash or an array.
|
|
8
8
|
|
9
9
|
## Usage
|
10
10
|
|
11
|
+
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
12
|
+
|
13
|
+
### Row-wise Iteration
|
14
|
+
|
15
|
+
The `each_row` method provides sequential access to individual rows:
|
16
|
+
|
11
17
|
```ruby
|
12
18
|
require "parquet"
|
13
19
|
|
14
|
-
#
|
15
|
-
Parquet.each_row("
|
20
|
+
# Basic usage with default hash output
|
21
|
+
Parquet.each_row("data.parquet") do |row|
|
22
|
+
puts row.inspect # {"id"=>1, "name"=>"name_1"}
|
23
|
+
end
|
16
24
|
|
17
|
-
#
|
18
|
-
Parquet.each_row("
|
25
|
+
# Array output for more efficient memory usage
|
26
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
27
|
+
puts row.inspect # [1, "name_1"]
|
28
|
+
end
|
19
29
|
|
20
|
-
#
|
21
|
-
|
22
|
-
|
30
|
+
# Select specific columns to reduce I/O
|
31
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
32
|
+
puts row.inspect
|
23
33
|
end
|
24
34
|
|
25
|
-
#
|
26
|
-
|
27
|
-
Parquet.each_row(
|
35
|
+
# Reading from IO objects
|
36
|
+
File.open("data.parquet", "rb") do |file|
|
37
|
+
Parquet.each_row(file) do |row|
|
38
|
+
puts row.inspect
|
39
|
+
end
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
### Column-wise Iteration
|
44
|
+
|
45
|
+
The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
|
28
46
|
|
47
|
+
```ruby
|
48
|
+
require "parquet"
|
49
|
+
|
50
|
+
# Process columns in batches of 1024 rows
|
51
|
+
Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
|
52
|
+
# With result_type: :hash (default)
|
53
|
+
puts batch.inspect
|
54
|
+
# {
|
55
|
+
# "id" => [1, 2, ..., 1024],
|
56
|
+
# "name" => ["name_1", "name_2", ..., "name_1024"]
|
57
|
+
# }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Array output with specific columns
|
61
|
+
Parquet.each_column("data.parquet",
|
62
|
+
columns: ["id", "name"],
|
63
|
+
result_type: :array,
|
64
|
+
batch_size: 1024) do |batch|
|
65
|
+
puts batch.inspect
|
66
|
+
# [
|
67
|
+
# [1, 2, ..., 1024], # id column
|
68
|
+
# ["name_1", "name_2", ...] # name column
|
69
|
+
# ]
|
70
|
+
end
|
29
71
|
```
|
72
|
+
|
73
|
+
### Arguments
|
74
|
+
|
75
|
+
Both methods accept these common arguments:
|
76
|
+
|
77
|
+
- `input`: Path string or IO-like object containing Parquet data
|
78
|
+
- `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
|
79
|
+
- `columns`: Optional array of column names to read (improves performance)
|
80
|
+
|
81
|
+
Additional arguments for `each_column`:
|
82
|
+
|
83
|
+
- `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
|
84
|
+
|
85
|
+
When no block is given, both methods return an Enumerator.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -7,7 +7,10 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
+
ahash = "0.8"
|
10
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
+
arrow-schema = "54.0.0"
|
13
|
+
arrow-array = "54.0.0"
|
11
14
|
bytes = "^1.9"
|
12
15
|
kanal = "0.1.0-pre8"
|
13
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
@@ -15,4 +18,12 @@ rb-sys = "^0.9"
|
|
15
18
|
serde = { version = "1.0", features = ["derive"] }
|
16
19
|
serde_magnus = "0.8.1"
|
17
20
|
thiserror = "2.0"
|
18
|
-
|
21
|
+
itertools = "^0.14"
|
22
|
+
jiff = "0.1.19"
|
23
|
+
|
24
|
+
|
25
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
26
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
27
|
+
|
28
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
29
|
+
mimalloc = { version = "0.1", default-features = false }
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#[cfg(target_os = "linux")]
|
2
|
+
use jemallocator::Jemalloc;
|
3
|
+
|
4
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
5
|
+
use mimalloc::MiMalloc;
|
6
|
+
|
7
|
+
#[global_allocator]
|
8
|
+
#[cfg(target_os = "linux")]
|
9
|
+
static ALLOC: Jemalloc = Jemalloc;
|
10
|
+
|
11
|
+
#[global_allocator]
|
12
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
13
|
+
static ALLOC: MiMalloc = MiMalloc;
|
@@ -0,0 +1,54 @@
|
|
1
|
+
use ahash::RandomState;
|
2
|
+
use magnus::{
|
3
|
+
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
|
+
};
|
5
|
+
|
6
|
+
use crate::{ColumnRecord, RowRecord};
|
7
|
+
|
8
|
+
pub struct RowEnumeratorArgs {
|
9
|
+
pub rb_self: Value,
|
10
|
+
pub to_read: Value,
|
11
|
+
pub result_type: String,
|
12
|
+
pub columns: Option<Vec<String>>,
|
13
|
+
}
|
14
|
+
|
15
|
+
#[inline]
|
16
|
+
pub fn create_row_enumerator(
|
17
|
+
args: RowEnumeratorArgs,
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
|
+
let kwargs = RHash::new();
|
20
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
21
|
+
if let Some(columns) = args.columns {
|
22
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
23
|
+
}
|
24
|
+
let enumerator = args
|
25
|
+
.rb_self
|
26
|
+
.enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
28
|
+
}
|
29
|
+
|
30
|
+
pub struct ColumnEnumeratorArgs {
|
31
|
+
pub rb_self: Value,
|
32
|
+
pub to_read: Value,
|
33
|
+
pub result_type: String,
|
34
|
+
pub columns: Option<Vec<String>>,
|
35
|
+
pub batch_size: Option<usize>,
|
36
|
+
}
|
37
|
+
|
38
|
+
#[inline]
|
39
|
+
pub fn create_column_enumerator(
|
40
|
+
args: ColumnEnumeratorArgs,
|
41
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
|
+
let kwargs = RHash::new();
|
43
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
44
|
+
if let Some(columns) = args.columns {
|
45
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
|
+
}
|
47
|
+
if let Some(batch_size) = args.batch_size {
|
48
|
+
kwargs.aset(Symbol::new("batch_size"), batch_size)?;
|
49
|
+
}
|
50
|
+
let enumerator = args
|
51
|
+
.rb_self
|
52
|
+
.enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
|
53
|
+
Ok(Yield::Enumerator(enumerator))
|
54
|
+
}
|
@@ -6,8 +6,14 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
9
|
+
sync::{
|
10
|
+
atomic::{AtomicU32, Ordering},
|
11
|
+
LazyLock, Mutex, OnceLock,
|
12
|
+
},
|
10
13
|
};
|
14
|
+
|
15
|
+
use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
|
16
|
+
|
11
17
|
use thiserror::Error;
|
12
18
|
|
13
19
|
#[derive(Debug, Error)]
|
@@ -16,66 +22,139 @@ pub enum CacheError {
|
|
16
22
|
LockError(String),
|
17
23
|
}
|
18
24
|
|
19
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
|
25
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
|
20
26
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
21
27
|
|
22
28
|
pub struct StringCache;
|
23
29
|
|
30
|
+
#[derive(Copy, Clone)]
|
31
|
+
pub struct StringCacheKey(Opaque<FString>, &'static str);
|
32
|
+
|
33
|
+
impl StringCacheKey {
|
34
|
+
pub fn new(string: &str) -> Self {
|
35
|
+
let rstr = RString::new(string);
|
36
|
+
let fstr = rstr.to_interned_str();
|
37
|
+
Self(Opaque::from(fstr), fstr.as_str().unwrap())
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl AsRef<str> for StringCacheKey {
|
42
|
+
fn as_ref(&self) -> &'static str {
|
43
|
+
self.1
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl IntoValue for StringCacheKey {
|
48
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
49
|
+
handle.into_value(self.0)
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::fmt::Debug for StringCacheKey {
|
54
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
55
|
+
self.1.fmt(f)
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
impl PartialEq for StringCacheKey {
|
60
|
+
fn eq(&self, other: &Self) -> bool {
|
61
|
+
self.1 == other.1
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
impl std::cmp::Eq for StringCacheKey {}
|
66
|
+
|
67
|
+
impl std::hash::Hash for StringCacheKey {
|
68
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
69
|
+
self.1.hash(state);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
24
73
|
impl StringCache {
|
25
74
|
#[allow(dead_code)]
|
26
|
-
pub fn intern(string: String) -> Result
|
75
|
+
pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
|
27
76
|
let mut cache = STRING_CACHE
|
28
77
|
.lock()
|
29
78
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
30
79
|
|
31
|
-
if let Some((
|
32
|
-
|
33
|
-
Ok(
|
80
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
81
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
82
|
+
Ok(*interned_string)
|
34
83
|
} else {
|
84
|
+
let interned = StringCacheKey::new(string.as_str());
|
35
85
|
let leaked = Box::leak(string.into_boxed_str());
|
36
|
-
cache.insert(leaked, AtomicU32::new(1));
|
37
|
-
Ok(
|
86
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
87
|
+
Ok(interned)
|
38
88
|
}
|
39
89
|
}
|
40
90
|
|
41
|
-
pub fn intern_many(strings: &[String]) -> Result<Vec
|
91
|
+
pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
|
42
92
|
let mut cache = STRING_CACHE
|
43
93
|
.lock()
|
44
94
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
45
95
|
|
46
|
-
let mut result = Vec::with_capacity(strings.len());
|
96
|
+
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
47
97
|
for string in strings {
|
48
|
-
if let Some((
|
49
|
-
|
50
|
-
result.push(
|
98
|
+
if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
|
99
|
+
counter.fetch_add(1, Ordering::Relaxed);
|
100
|
+
result.push(*interned_string);
|
51
101
|
} else {
|
102
|
+
let interned = StringCacheKey::new(&string);
|
52
103
|
let leaked = Box::leak(string.clone().into_boxed_str());
|
53
|
-
cache.insert(leaked, AtomicU32::new(1));
|
54
|
-
result.push(
|
104
|
+
cache.insert(leaked, (interned, AtomicU32::new(1)));
|
105
|
+
result.push(interned);
|
55
106
|
}
|
56
107
|
}
|
57
108
|
Ok(result)
|
58
109
|
}
|
59
110
|
|
60
|
-
pub fn clear(headers: &[
|
111
|
+
pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
|
61
112
|
let mut cache = STRING_CACHE
|
62
113
|
.lock()
|
63
114
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
64
115
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
let
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
116
|
+
let to_remove: Vec<_> = headers
|
117
|
+
.iter()
|
118
|
+
.filter_map(|header| {
|
119
|
+
let key = header.as_ref();
|
120
|
+
if let Some((_, (_, counter))) = cache.get_key_value(key) {
|
121
|
+
let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
|
122
|
+
if prev_count == 1 {
|
123
|
+
Some(key)
|
124
|
+
} else {
|
125
|
+
None
|
74
126
|
}
|
127
|
+
} else {
|
128
|
+
None
|
75
129
|
}
|
76
|
-
}
|
130
|
+
})
|
131
|
+
.collect();
|
132
|
+
|
133
|
+
for key in to_remove {
|
134
|
+
cache.remove(key);
|
77
135
|
}
|
78
136
|
|
79
137
|
Ok(())
|
80
138
|
}
|
81
139
|
}
|
140
|
+
|
141
|
+
pub struct HeaderCacheCleanupIter<I> {
|
142
|
+
pub inner: I,
|
143
|
+
pub headers: OnceLock<Vec<StringCacheKey>>,
|
144
|
+
}
|
145
|
+
|
146
|
+
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
147
|
+
type Item = I::Item;
|
148
|
+
|
149
|
+
fn next(&mut self) -> Option<Self::Item> {
|
150
|
+
self.inner.next()
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
155
|
+
fn drop(&mut self) {
|
156
|
+
if let Some(headers) = self.headers.get() {
|
157
|
+
StringCache::clear(&headers).unwrap();
|
158
|
+
}
|
159
|
+
}
|
160
|
+
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -1,9 +1,16 @@
|
|
1
|
+
mod allocator;
|
2
|
+
mod enumerator;
|
1
3
|
pub mod header_cache;
|
2
4
|
mod reader;
|
5
|
+
mod ruby_integration;
|
3
6
|
mod ruby_reader;
|
7
|
+
mod types;
|
4
8
|
mod utils;
|
5
9
|
|
10
|
+
use crate::enumerator::*;
|
6
11
|
use crate::reader::*;
|
12
|
+
use crate::ruby_integration::*;
|
13
|
+
use crate::types::*;
|
7
14
|
|
8
15
|
use magnus::{Error, Ruby};
|
9
16
|
|
@@ -11,6 +18,7 @@ use magnus::{Error, Ruby};
|
|
11
18
|
#[magnus::init]
|
12
19
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
13
20
|
let module = ruby.define_module("Parquet")?;
|
14
|
-
module.define_module_function("each_row", magnus::method!(
|
21
|
+
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
|
+
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
15
23
|
Ok(())
|
16
24
|
}
|