parquet 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 70d9932bf622cd2647423e2519013d3a9f9256217effe9610e9aeaaebbcf1778
4
- data.tar.gz: fae3767ce0d950c91b17f77b740159d863293e1288063ed15d9b9c1f82e87fe1
3
+ metadata.gz: d46e0a95ff244189cadf71b3860b03aaf7638629d8c2eeda2800eaae57c0dbd2
4
+ data.tar.gz: 8bd2a8a29c7b199fcd67f7ababf241ebdcc871a0ff247fbd5320b789a6e6222e
5
5
  SHA512:
6
- metadata.gz: a03e75bcd377ce5a61cd5f17685995c420601ac5917bd3d4a99dc082686423729ee5f0913bb032fe826dd1a8bac9b52c152cfb2037a376751258c17f3b0e63b1
7
- data.tar.gz: ddfbb0ee14a6b7dcce47caf41962afe9610ab175d2b829c2744d62bed67cc746e64d214f64318220f2301a9ce8dcdecf9f9f9e90786df3d18f244716724abef8
6
+ metadata.gz: ae074d37108a5e12369638a23fcf0962ee416ac901dfcb64daf67b43e76e1566f50f56ccc890581a4cee8b14a0b801ad65dd874dd64c4d0882715e94d35e71b6
7
+ data.tar.gz: 9e08475b60bf1a5ee5e296aecca6df33a9f13fbe747914eadc45b171b6745feda099a3f8b4a1bca57ce3b3fdb03d4452545170d44d4dc90a4168c7b9194d8c19
data/Cargo.lock CHANGED
@@ -743,6 +743,15 @@ dependencies = [
743
743
  "either",
744
744
  ]
745
745
 
746
+ [[package]]
747
+ name = "itertools"
748
+ version = "0.14.0"
749
+ source = "registry+https://github.com/rust-lang/crates.io-index"
750
+ checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
751
+ dependencies = [
752
+ "either",
753
+ ]
754
+
746
755
  [[package]]
747
756
  name = "itoa"
748
757
  version = "1.0.14"
@@ -769,6 +778,35 @@ dependencies = [
769
778
  "libc",
770
779
  ]
771
780
 
781
+ [[package]]
782
+ name = "jiff"
783
+ version = "0.1.19"
784
+ source = "registry+https://github.com/rust-lang/crates.io-index"
785
+ checksum = "943611a469f78ab9afdac9022e473a80fca16a9deca6c5be3eb566d872231e76"
786
+ dependencies = [
787
+ "jiff-tzdb-platform",
788
+ "log",
789
+ "portable-atomic",
790
+ "portable-atomic-util",
791
+ "serde",
792
+ "windows-sys",
793
+ ]
794
+
795
+ [[package]]
796
+ name = "jiff-tzdb"
797
+ version = "0.1.1"
798
+ source = "registry+https://github.com/rust-lang/crates.io-index"
799
+ checksum = "91335e575850c5c4c673b9bd467b0e025f164ca59d0564f69d0c2ee0ffad4653"
800
+
801
+ [[package]]
802
+ name = "jiff-tzdb-platform"
803
+ version = "0.1.1"
804
+ source = "registry+https://github.com/rust-lang/crates.io-index"
805
+ checksum = "9835f0060a626fe59f160437bc725491a6af23133ea906500027d1bd2f8f4329"
806
+ dependencies = [
807
+ "jiff-tzdb",
808
+ ]
809
+
772
810
  [[package]]
773
811
  name = "jobserver"
774
812
  version = "0.1.32"
@@ -1159,8 +1197,12 @@ name = "parquet"
1159
1197
  version = "0.1.0"
1160
1198
  dependencies = [
1161
1199
  "ahash",
1200
+ "arrow-array",
1201
+ "arrow-schema",
1162
1202
  "bytes",
1203
+ "itertools 0.14.0",
1163
1204
  "jemallocator",
1205
+ "jiff",
1164
1206
  "kanal",
1165
1207
  "magnus 0.7.1",
1166
1208
  "mimalloc",
@@ -1238,6 +1280,21 @@ version = "0.3.31"
1238
1280
  source = "registry+https://github.com/rust-lang/crates.io-index"
1239
1281
  checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
1240
1282
 
1283
+ [[package]]
1284
+ name = "portable-atomic"
1285
+ version = "1.10.0"
1286
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1287
+ checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
1288
+
1289
+ [[package]]
1290
+ name = "portable-atomic-util"
1291
+ version = "0.2.4"
1292
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1293
+ checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
1294
+ dependencies = [
1295
+ "portable-atomic",
1296
+ ]
1297
+
1241
1298
  [[package]]
1242
1299
  name = "proc-macro2"
1243
1300
  version = "1.0.92"
data/Gemfile CHANGED
@@ -9,7 +9,7 @@ gemspec
9
9
  group :development do
10
10
  gem "benchmark-ips", "~> 2.12"
11
11
  # gem "polars-df"
12
- # gem "duckdb"
12
+ gem "duckdb"
13
13
  end
14
14
 
15
15
  group :test do
data/README.md CHANGED
@@ -8,22 +8,78 @@ At the moment, it only supports iterating rows as either a hash or an array.
8
8
 
9
9
  ## Usage
10
10
 
11
+ This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
12
+
13
+ ### Row-wise Iteration
14
+
15
+ The `each_row` method provides sequential access to individual rows:
16
+
11
17
  ```ruby
12
18
  require "parquet"
13
19
 
14
- # Read each row as a hash
15
- Parquet.each_row("test/data.parquet") { |row| puts row.inspect }
20
+ # Basic usage with default hash output
21
+ Parquet.each_row("data.parquet") do |row|
22
+ puts row.inspect # {"id"=>1, "name"=>"name_1"}
23
+ end
16
24
 
17
- # Read each row as an array
18
- Parquet.each_row("test/data.parquet", result_type: :array) { |row| puts row.inspect }
25
+ # Array output for more efficient memory usage
26
+ Parquet.each_row("data.parquet", result_type: :array) do |row|
27
+ puts row.inspect # [1, "name_1"]
28
+ end
19
29
 
20
- # Read from an IO object (like File or StringIO)
21
- File.open("test/data.parquet", "rb") do |file|
22
- Parquet.each_row(file) { |row| puts row.inspect }
30
+ # Select specific columns to reduce I/O
31
+ Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
32
+ puts row.inspect
23
33
  end
24
34
 
25
- # Or with StringIO
26
- io = StringIO.new(File.binread("test/data.parquet"))
27
- Parquet.each_row(io) { |row| puts row.inspect }
35
+ # Reading from IO objects
36
+ File.open("data.parquet", "rb") do |file|
37
+ Parquet.each_row(file) do |row|
38
+ puts row.inspect
39
+ end
40
+ end
41
+ ```
42
+
43
+ ### Column-wise Iteration
44
+
45
+ The `each_column` method reads data in column-oriented batches, which is typically more efficient for analytical queries:
28
46
 
47
+ ```ruby
48
+ require "parquet"
49
+
50
+ # Process columns in batches of 1024 rows
51
+ Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
52
+ # With result_type: :hash (default)
53
+ puts batch.inspect
54
+ # {
55
+ # "id" => [1, 2, ..., 1024],
56
+ # "name" => ["name_1", "name_2", ..., "name_1024"]
57
+ # }
58
+ end
59
+
60
+ # Array output with specific columns
61
+ Parquet.each_column("data.parquet",
62
+ columns: ["id", "name"],
63
+ result_type: :array,
64
+ batch_size: 1024) do |batch|
65
+ puts batch.inspect
66
+ # [
67
+ # [1, 2, ..., 1024], # id column
68
+ # ["name_1", "name_2", ...] # name column
69
+ # ]
70
+ end
29
71
  ```
72
+
73
+ ### Arguments
74
+
75
+ Both methods accept these common arguments:
76
+
77
+ - `input`: Path string or IO-like object containing Parquet data
78
+ - `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
79
+ - `columns`: Optional array of column names to read (improves performance)
80
+
81
+ Additional arguments for `each_column`:
82
+
83
+ - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
84
+
85
+ When no block is given, both methods return an Enumerator.
@@ -9,6 +9,8 @@ crate-type = ["cdylib"]
9
9
  [dependencies]
10
10
  ahash = "0.8"
11
11
  parquet = { version = "^54.0", features = ["json", "object_store"] }
12
+ arrow-schema = "54.0.0"
13
+ arrow-array = "54.0.0"
12
14
  bytes = "^1.9"
13
15
  kanal = "0.1.0-pre8"
14
16
  magnus = { version = "0.7", features = ["rb-sys"] }
@@ -16,6 +18,9 @@ rb-sys = "^0.9"
16
18
  serde = { version = "1.0", features = ["derive"] }
17
19
  serde_magnus = "0.8.1"
18
20
  thiserror = "2.0"
21
+ itertools = "^0.14"
22
+ jiff = "0.1.19"
23
+
19
24
 
20
25
  [target.'cfg(target_os = "linux")'.dependencies]
21
26
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -3,9 +3,9 @@ use magnus::{
3
3
  block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
4
  };
5
5
 
6
- use crate::Record;
6
+ use crate::{ColumnRecord, RowRecord};
7
7
 
8
- pub struct EnumeratorArgs {
8
+ pub struct RowEnumeratorArgs {
9
9
  pub rb_self: Value,
10
10
  pub to_read: Value,
11
11
  pub result_type: String,
@@ -13,9 +13,9 @@ pub struct EnumeratorArgs {
13
13
  }
14
14
 
15
15
  #[inline]
16
- pub fn create_enumerator(
17
- args: EnumeratorArgs,
18
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
16
+ pub fn create_row_enumerator(
17
+ args: RowEnumeratorArgs,
18
+ ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
19
19
  let kwargs = RHash::new();
20
20
  kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
21
21
  if let Some(columns) = args.columns {
@@ -23,6 +23,32 @@ pub fn create_enumerator(
23
23
  }
24
24
  let enumerator = args
25
25
  .rb_self
26
- .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
26
+ .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
27
+ Ok(Yield::Enumerator(enumerator))
28
+ }
29
+
30
+ pub struct ColumnEnumeratorArgs {
31
+ pub rb_self: Value,
32
+ pub to_read: Value,
33
+ pub result_type: String,
34
+ pub columns: Option<Vec<String>>,
35
+ pub batch_size: Option<usize>,
36
+ }
37
+
38
+ #[inline]
39
+ pub fn create_column_enumerator(
40
+ args: ColumnEnumeratorArgs,
41
+ ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
42
+ let kwargs = RHash::new();
43
+ kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
44
+ if let Some(columns) = args.columns {
45
+ kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
46
+ }
47
+ if let Some(batch_size) = args.batch_size {
48
+ kwargs.aset(Symbol::new("batch_size"), batch_size)?;
49
+ }
50
+ let enumerator = args
51
+ .rb_self
52
+ .enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
27
53
  Ok(Yield::Enumerator(enumerator))
28
54
  }
@@ -6,8 +6,14 @@
6
6
  /// so this optimization could be removed if any issues arise.
7
7
  use std::{
8
8
  collections::HashMap,
9
- sync::{atomic::AtomicU32, LazyLock, Mutex, OnceLock},
9
+ sync::{
10
+ atomic::{AtomicU32, Ordering},
11
+ LazyLock, Mutex, OnceLock,
12
+ },
10
13
  };
14
+
15
+ use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
16
+
11
17
  use thiserror::Error;
12
18
 
13
19
  #[derive(Debug, Error)]
@@ -16,64 +22,116 @@ pub enum CacheError {
16
22
  LockError(String),
17
23
  }
18
24
 
19
- static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
25
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
20
26
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
21
27
 
22
28
  pub struct StringCache;
23
29
 
30
+ #[derive(Copy, Clone)]
31
+ pub struct StringCacheKey(Opaque<FString>, &'static str);
32
+
33
+ impl StringCacheKey {
34
+ pub fn new(string: &str) -> Self {
35
+ let rstr = RString::new(string);
36
+ let fstr = rstr.to_interned_str();
37
+ Self(Opaque::from(fstr), fstr.as_str().unwrap())
38
+ }
39
+ }
40
+
41
+ impl AsRef<str> for StringCacheKey {
42
+ fn as_ref(&self) -> &'static str {
43
+ self.1
44
+ }
45
+ }
46
+
47
+ impl IntoValue for StringCacheKey {
48
+ fn into_value_with(self, handle: &Ruby) -> Value {
49
+ handle.into_value(self.0)
50
+ }
51
+ }
52
+
53
+ impl std::fmt::Debug for StringCacheKey {
54
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55
+ self.1.fmt(f)
56
+ }
57
+ }
58
+
59
+ impl PartialEq for StringCacheKey {
60
+ fn eq(&self, other: &Self) -> bool {
61
+ self.1 == other.1
62
+ }
63
+ }
64
+
65
+ impl std::cmp::Eq for StringCacheKey {}
66
+
67
+ impl std::hash::Hash for StringCacheKey {
68
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
69
+ self.1.hash(state);
70
+ }
71
+ }
72
+
24
73
  impl StringCache {
25
74
  #[allow(dead_code)]
26
- pub fn intern(string: String) -> Result<&'static str, CacheError> {
75
+ pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
27
76
  let mut cache = STRING_CACHE
28
77
  .lock()
29
78
  .map_err(|e| CacheError::LockError(e.to_string()))?;
30
79
 
31
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
32
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
33
- Ok(existing)
80
+ if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
81
+ counter.fetch_add(1, Ordering::Relaxed);
82
+ Ok(*interned_string)
34
83
  } else {
84
+ let interned = StringCacheKey::new(string.as_str());
35
85
  let leaked = Box::leak(string.into_boxed_str());
36
- cache.insert(leaked, AtomicU32::new(1));
37
- Ok(leaked)
86
+ cache.insert(leaked, (interned, AtomicU32::new(1)));
87
+ Ok(interned)
38
88
  }
39
89
  }
40
90
 
41
- pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
91
+ pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
42
92
  let mut cache = STRING_CACHE
43
93
  .lock()
44
94
  .map_err(|e| CacheError::LockError(e.to_string()))?;
45
95
 
46
- let mut result = Vec::with_capacity(strings.len());
96
+ let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
47
97
  for string in strings {
48
- if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
49
- count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
50
- result.push(existing);
98
+ if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
99
+ counter.fetch_add(1, Ordering::Relaxed);
100
+ result.push(*interned_string);
51
101
  } else {
102
+ let interned = StringCacheKey::new(&string);
52
103
  let leaked = Box::leak(string.clone().into_boxed_str());
53
- cache.insert(leaked, AtomicU32::new(1));
54
- result.push(leaked);
104
+ cache.insert(leaked, (interned, AtomicU32::new(1)));
105
+ result.push(interned);
55
106
  }
56
107
  }
57
108
  Ok(result)
58
109
  }
59
110
 
60
- pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
111
+ pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
61
112
  let mut cache = STRING_CACHE
62
113
  .lock()
63
114
  .map_err(|e| CacheError::LockError(e.to_string()))?;
64
115
 
65
- for header in headers {
66
- if let Some(count) = cache.get(header) {
67
- // Returns the previous value of the counter
68
- let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
69
- if was == 1 {
70
- cache.remove(header);
71
- let ptr = *header as *const str as *mut str;
72
- unsafe {
73
- let _ = Box::from_raw(ptr);
116
+ let to_remove: Vec<_> = headers
117
+ .iter()
118
+ .filter_map(|header| {
119
+ let key = header.as_ref();
120
+ if let Some((_, (_, counter))) = cache.get_key_value(key) {
121
+ let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
122
+ if prev_count == 1 {
123
+ Some(key)
124
+ } else {
125
+ None
74
126
  }
127
+ } else {
128
+ None
75
129
  }
76
- }
130
+ })
131
+ .collect();
132
+
133
+ for key in to_remove {
134
+ cache.remove(key);
77
135
  }
78
136
 
79
137
  Ok(())
@@ -82,13 +140,12 @@ impl StringCache {
82
140
 
83
141
  pub struct HeaderCacheCleanupIter<I> {
84
142
  pub inner: I,
85
- pub headers: OnceLock<Vec<&'static str>>,
143
+ pub headers: OnceLock<Vec<StringCacheKey>>,
86
144
  }
87
145
 
88
146
  impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
89
147
  type Item = I::Item;
90
148
 
91
- #[inline(always)]
92
149
  fn next(&mut self) -> Option<Self::Item> {
93
150
  self.inner.next()
94
151
  }
@@ -18,6 +18,7 @@ use magnus::{Error, Ruby};
18
18
  #[magnus::init]
19
19
  fn init(ruby: &Ruby) -> Result<(), Error> {
20
20
  let module = ruby.define_module("Parquet")?;
21
- module.define_module_function("each_row", magnus::method!(parse_parquet, -1))?;
21
+ module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
22
+ module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
22
23
  Ok(())
23
24
  }