parquet 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 66a87903cde02fff1e0a91627311bc1eba1fccc1d61915b7504c9fc61e10ed39
4
- data.tar.gz: 3d93f8aa12b664fd0dcff05b98dc3f12ba6ce745507ff18847936dafe8958be4
3
+ metadata.gz: 70d9932bf622cd2647423e2519013d3a9f9256217effe9610e9aeaaebbcf1778
4
+ data.tar.gz: fae3767ce0d950c91b17f77b740159d863293e1288063ed15d9b9c1f82e87fe1
5
5
  SHA512:
6
- metadata.gz: f042ba14e11c51849be30d22df7a1c63911bcf2ca6778b463b53878b562676f151f9a91a376baf57c3819a89861af2b76e58a80b66ea8c38870b8cf18664105a
7
- data.tar.gz: f708de1716e9640bba8fc3537c76b2cfefbcf9db709ea560db619eb6a50ef2a3965d06b43cb127541ae33067f2facab8bf75df6d8fe6c3aea6cadb765d099a2f
6
+ metadata.gz: a03e75bcd377ce5a61cd5f17685995c420601ac5917bd3d4a99dc082686423729ee5f0913bb032fe826dd1a8bac9b52c152cfb2037a376751258c17f3b0e63b1
7
+ data.tar.gz: ddfbb0ee14a6b7dcce47caf41962afe9610ab175d2b829c2744d62bed67cc746e64d214f64318220f2301a9ce8dcdecf9f9f9e90786df3d18f244716724abef8
data/Cargo.lock CHANGED
@@ -749,6 +749,26 @@ version = "1.0.14"
749
749
  source = "registry+https://github.com/rust-lang/crates.io-index"
750
750
  checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
751
751
 
752
+ [[package]]
753
+ name = "jemalloc-sys"
754
+ version = "0.5.4+5.3.0-patched"
755
+ source = "registry+https://github.com/rust-lang/crates.io-index"
756
+ checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
757
+ dependencies = [
758
+ "cc",
759
+ "libc",
760
+ ]
761
+
762
+ [[package]]
763
+ name = "jemallocator"
764
+ version = "0.5.4"
765
+ source = "registry+https://github.com/rust-lang/crates.io-index"
766
+ checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
767
+ dependencies = [
768
+ "jemalloc-sys",
769
+ "libc",
770
+ ]
771
+
752
772
  [[package]]
753
773
  name = "jobserver"
754
774
  version = "0.1.32"
@@ -876,6 +896,16 @@ version = "0.2.11"
876
896
  source = "registry+https://github.com/rust-lang/crates.io-index"
877
897
  checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
878
898
 
899
+ [[package]]
900
+ name = "libmimalloc-sys"
901
+ version = "0.1.39"
902
+ source = "registry+https://github.com/rust-lang/crates.io-index"
903
+ checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
904
+ dependencies = [
905
+ "cc",
906
+ "libc",
907
+ ]
908
+
879
909
  [[package]]
880
910
  name = "litemap"
881
911
  version = "0.7.4"
@@ -948,6 +978,15 @@ version = "2.7.4"
948
978
  source = "registry+https://github.com/rust-lang/crates.io-index"
949
979
  checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
950
980
 
981
+ [[package]]
982
+ name = "mimalloc"
983
+ version = "0.1.43"
984
+ source = "registry+https://github.com/rust-lang/crates.io-index"
985
+ checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
986
+ dependencies = [
987
+ "libmimalloc-sys",
988
+ ]
989
+
951
990
  [[package]]
952
991
  name = "minimal-lexical"
953
992
  version = "0.2.1"
@@ -1119,15 +1158,17 @@ dependencies = [
1119
1158
  name = "parquet"
1120
1159
  version = "0.1.0"
1121
1160
  dependencies = [
1161
+ "ahash",
1122
1162
  "bytes",
1163
+ "jemallocator",
1123
1164
  "kanal",
1124
1165
  "magnus 0.7.1",
1166
+ "mimalloc",
1125
1167
  "parquet 54.0.0",
1126
1168
  "rb-sys",
1127
1169
  "serde",
1128
1170
  "serde_magnus",
1129
1171
  "thiserror",
1130
- "xxhash-rust",
1131
1172
  ]
1132
1173
 
1133
1174
  [[package]]
@@ -1796,12 +1837,6 @@ version = "0.5.5"
1796
1837
  source = "registry+https://github.com/rust-lang/crates.io-index"
1797
1838
  checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1798
1839
 
1799
- [[package]]
1800
- name = "xxhash-rust"
1801
- version = "0.8.14"
1802
- source = "registry+https://github.com/rust-lang/crates.io-index"
1803
- checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
1804
-
1805
1840
  [[package]]
1806
1841
  name = "yoke"
1807
1842
  version = "0.7.5"
data/Gemfile CHANGED
@@ -6,7 +6,12 @@ gem "rake"
6
6
  # Use local version of parquet
7
7
  gemspec
8
8
 
9
- group :development, :test do
10
- gem "minitest", "~> 5.0"
9
+ group :development do
11
10
  gem "benchmark-ips", "~> 2.12"
11
+ # gem "polars-df"
12
+ # gem "duckdb"
13
+ end
14
+
15
+ group :test do
16
+ gem "minitest", "~> 5.0"
12
17
  end
@@ -7,6 +7,7 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
+ ahash = "0.8"
10
11
  parquet = { version = "^54.0", features = ["json", "object_store"] }
11
12
  bytes = "^1.9"
12
13
  kanal = "0.1.0-pre8"
@@ -15,4 +16,9 @@ rb-sys = "^0.9"
15
16
  serde = { version = "1.0", features = ["derive"] }
16
17
  serde_magnus = "0.8.1"
17
18
  thiserror = "2.0"
18
- xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
19
+
20
+ [target.'cfg(target_os = "linux")'.dependencies]
21
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
22
+
23
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
24
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,13 @@
1
+ #[cfg(target_os = "linux")]
2
+ use jemallocator::Jemalloc;
3
+
4
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
5
+ use mimalloc::MiMalloc;
6
+
7
+ #[global_allocator]
8
+ #[cfg(target_os = "linux")]
9
+ static ALLOC: Jemalloc = Jemalloc;
10
+
11
+ #[global_allocator]
12
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
13
+ static ALLOC: MiMalloc = MiMalloc;
@@ -0,0 +1,28 @@
1
+ use ahash::RandomState;
2
+ use magnus::{
3
+ block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
+ };
5
+
6
+ use crate::Record;
7
+
8
+ pub struct EnumeratorArgs {
9
+ pub rb_self: Value,
10
+ pub to_read: Value,
11
+ pub result_type: String,
12
+ pub columns: Option<Vec<String>>,
13
+ }
14
+
15
+ #[inline]
16
+ pub fn create_enumerator(
17
+ args: EnumeratorArgs,
18
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
19
+ let kwargs = RHash::new();
20
+ kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
21
+ if let Some(columns) = args.columns {
22
+ kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
23
+ }
24
+ let enumerator = args
25
+ .rb_self
26
+ .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
27
+ Ok(Yield::Enumerator(enumerator))
28
+ }
@@ -6,7 +6,7 @@
6
6
  /// so this optimization could be removed if any issues arise.
7
7
  use std::{
8
8
  collections::HashMap,
9
- sync::{atomic::AtomicU32, LazyLock, Mutex},
9
+ sync::{atomic::AtomicU32, LazyLock, Mutex, OnceLock},
10
10
  };
11
11
  use thiserror::Error;
12
12
 
@@ -79,3 +79,25 @@ impl StringCache {
79
79
  Ok(())
80
80
  }
81
81
  }
82
+
83
+ pub struct HeaderCacheCleanupIter<I> {
84
+ pub inner: I,
85
+ pub headers: OnceLock<Vec<&'static str>>,
86
+ }
87
+
88
+ impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
89
+ type Item = I::Item;
90
+
91
+ #[inline(always)]
92
+ fn next(&mut self) -> Option<Self::Item> {
93
+ self.inner.next()
94
+ }
95
+ }
96
+
97
+ impl<I> Drop for HeaderCacheCleanupIter<I> {
98
+ fn drop(&mut self) {
99
+ if let Some(headers) = self.headers.get() {
100
+ StringCache::clear(&headers).unwrap();
101
+ }
102
+ }
103
+ }
@@ -1,9 +1,16 @@
1
+ mod allocator;
2
+ mod enumerator;
1
3
  pub mod header_cache;
2
4
  mod reader;
5
+ mod ruby_integration;
3
6
  mod ruby_reader;
7
+ mod types;
4
8
  mod utils;
5
9
 
10
+ use crate::enumerator::*;
6
11
  use crate::reader::*;
12
+ use crate::ruby_integration::*;
13
+ use crate::types::*;
7
14
 
8
15
  use magnus::{Error, Ruby};
9
16
 
@@ -1,106 +1,39 @@
1
- use crate::header_cache::{CacheError, StringCache};
2
- use crate::ruby_reader::{build_ruby_reader, SeekableRead};
3
- use crate::utils::*;
4
- use bytes::Bytes;
1
+ // =============================================================================
2
+ // Imports and Dependencies
3
+ // =============================================================================
4
+ use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
+ use crate::{
6
+ create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
7
+ SeekableRubyValue,
8
+ };
9
+ use ahash::RandomState;
5
10
  use magnus::rb_sys::AsRawValue;
6
11
  use magnus::value::{Opaque, ReprValue};
7
- use magnus::IntoValue;
8
- use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
9
- use parquet::errors::ParquetError;
10
- use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
11
- use parquet::record::Field;
12
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
13
+ use parquet::file::reader::FileReader;
14
+ use parquet::file::reader::SerializedFileReader;
15
+ use parquet::record::reader::RowIter as ParquetRowIter;
16
+ use parquet::schema::types::{Type as SchemaType, TypePtr};
12
17
  use std::collections::HashMap;
13
18
  use std::fs::File;
14
- use std::io::{self, BufReader, Read, Seek, SeekFrom};
19
+ use std::io::{self};
15
20
  use std::mem::ManuallyDrop;
16
21
  use std::os::fd::FromRawFd;
17
22
  use std::sync::OnceLock;
18
- use std::{borrow::Cow, hash::BuildHasher};
19
23
  use thiserror::Error;
20
- use xxhash_rust::xxh3::Xxh3Builder;
21
-
22
- use parquet::record::reader::RowIter as ParquetRowIter;
23
-
24
- #[derive(Error, Debug)]
25
- pub enum ReaderError {
26
- #[error("Failed to get file descriptor: {0}")]
27
- FileDescriptor(String),
28
- #[error("Invalid file descriptor")]
29
- InvalidFileDescriptor,
30
- #[error("Failed to open file: {0}")]
31
- FileOpen(#[from] io::Error),
32
- #[error("Failed to intern headers: {0}")]
33
- HeaderIntern(#[from] CacheError),
34
- #[error("Ruby error: {0}")]
35
- Ruby(String),
36
- }
37
-
38
- impl From<MagnusError> for ReaderError {
39
- fn from(err: MagnusError) -> Self {
40
- Self::Ruby(err.to_string())
41
- }
42
- }
43
-
44
- impl From<ReaderError> for MagnusError {
45
- fn from(err: ReaderError) -> Self {
46
- MagnusError::new(
47
- Ruby::get().unwrap().exception_runtime_error(),
48
- err.to_string(),
49
- )
50
- }
51
- }
52
-
53
- struct ForgottenFileHandle(ManuallyDrop<File>);
54
-
55
- impl Length for ForgottenFileHandle {
56
- fn len(&self) -> u64 {
57
- self.0.len()
58
- }
59
- }
60
-
61
- impl ChunkReader for ForgottenFileHandle {
62
- type T = BufReader<File>;
63
-
64
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
65
- self.0.get_read(start)
66
- }
67
-
68
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
69
- self.0.get_bytes(start, length)
70
- }
71
- }
72
-
73
- struct HeaderCacheCleanupIter<I> {
74
- inner: I,
75
- headers: OnceLock<Vec<&'static str>>,
76
- }
77
-
78
- impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
79
- type Item = I::Item;
80
-
81
- fn next(&mut self) -> Option<Self::Item> {
82
- self.inner.next()
83
- }
84
- }
85
-
86
- impl<I> Drop for HeaderCacheCleanupIter<I> {
87
- fn drop(&mut self) {
88
- if let Some(headers) = self.headers.get() {
89
- StringCache::clear(&headers).unwrap();
90
- }
91
- }
92
- }
93
24
 
25
+ #[inline]
94
26
  pub fn parse_parquet<'a>(
95
27
  rb_self: Value,
96
28
  args: &[Value],
97
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
29
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
98
30
  let original = unsafe { Ruby::get_unchecked() };
99
31
  let ruby: &'static Ruby = Box::leak(Box::new(original));
100
32
 
101
33
  let ParquetArgs {
102
34
  to_read,
103
35
  result_type,
36
+ columns,
104
37
  } = parse_parquet_args(&ruby, args)?;
105
38
 
106
39
  if !ruby.block_given() {
@@ -108,15 +41,18 @@ pub fn parse_parquet<'a>(
108
41
  rb_self,
109
42
  to_read,
110
43
  result_type,
44
+ columns,
111
45
  });
112
46
  }
113
47
 
114
- let iter = if to_read.is_kind_of(ruby.class_string()) {
48
+ let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
115
49
  let path_string = to_read.to_r_string()?;
116
50
  let file_path = unsafe { path_string.as_str()? };
117
51
  let file = File::open(file_path).unwrap();
118
52
  let reader = SerializedFileReader::new(file).unwrap();
119
- ParquetRowIter::from_file_into(Box::new(reader))
53
+ let schema = reader.metadata().file_metadata().schema().clone();
54
+
55
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
120
56
  } else if to_read.is_kind_of(ruby.class_io()) {
121
57
  let raw_value = to_read.as_raw();
122
58
  let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
@@ -131,14 +67,28 @@ pub fn parse_parquet<'a>(
131
67
  let file = unsafe { File::from_raw_fd(fd) };
132
68
  let file = ForgottenFileHandle(ManuallyDrop::new(file));
133
69
  let reader = SerializedFileReader::new(file).unwrap();
134
- ParquetRowIter::from_file_into(Box::new(reader))
70
+ let schema = reader.metadata().file_metadata().schema().clone();
71
+
72
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
135
73
  } else {
136
74
  let readable = SeekableRubyValue(Opaque::from(to_read));
137
75
  let reader = SerializedFileReader::new(readable).unwrap();
138
- ParquetRowIter::from_file_into(Box::new(reader))
76
+ let schema = reader.metadata().file_metadata().schema().clone();
77
+
78
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
139
79
  };
140
80
 
141
- let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
81
+ if let Some(cols) = columns {
82
+ let projection = create_projection_schema(&schema, &cols);
83
+ iter = iter.project(Some(projection.to_owned())).map_err(|e| {
84
+ MagnusError::new(
85
+ ruby.exception_runtime_error(),
86
+ format!("Failed to create projection: {}", e),
87
+ )
88
+ })?;
89
+ }
90
+
91
+ let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
142
92
  "hash" => {
143
93
  let headers = OnceLock::new();
144
94
  let headers_clone = headers.clone();
@@ -146,21 +96,23 @@ pub fn parse_parquet<'a>(
146
96
  .filter_map(move |row| {
147
97
  row.ok().map(|row| {
148
98
  let headers = headers_clone.get_or_init(|| {
149
- row.get_column_iter()
150
- .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
151
- .collect::<Vec<_>>()
99
+ let column_count = row.get_column_iter().count();
100
+ let mut headers = Vec::with_capacity(column_count);
101
+ row.get_column_iter().for_each(|(k, _)| {
102
+ headers.push(StringCache::intern(k.to_owned()).unwrap())
103
+ });
104
+ headers
152
105
  });
153
106
 
154
- row.get_column_iter()
155
- .enumerate()
156
- .map(|(i, (_, v))| {
157
- let key = headers[i];
158
- (key, ParquetField(v.clone()))
159
- })
160
- .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
107
+ let mut map =
108
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
109
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
110
+ map.insert(headers[i], ParquetField(v.clone()));
111
+ });
112
+ map
161
113
  })
162
114
  })
163
- .map(|row| Record::Map(row));
115
+ .map(Record::Map);
164
116
 
165
117
  Box::new(HeaderCacheCleanupIter {
166
118
  inner: iter,
@@ -170,12 +122,14 @@ pub fn parse_parquet<'a>(
170
122
  "array" => Box::new(
171
123
  iter.filter_map(|row| {
172
124
  row.ok().map(|row| {
125
+ let column_count = row.get_column_iter().count();
126
+ let mut vec = Vec::with_capacity(column_count);
173
127
  row.get_column_iter()
174
- .map(|(_, v)| ParquetField(v.clone()))
175
- .collect::<Vec<ParquetField>>()
128
+ .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
129
+ vec
176
130
  })
177
131
  })
178
- .map(|row| Record::Vec(row)),
132
+ .map(Record::Vec),
179
133
  ),
180
134
  _ => {
181
135
  return Err(MagnusError::new(
@@ -188,150 +142,49 @@ pub fn parse_parquet<'a>(
188
142
  Ok(Yield::Iter(iter))
189
143
  }
190
144
 
191
- struct EnumeratorArgs {
192
- rb_self: Value,
193
- to_read: Value,
194
- result_type: String,
195
- }
196
-
197
- fn create_enumerator(
198
- args: EnumeratorArgs,
199
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
200
- let kwargs = RHash::new();
201
-
202
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
203
-
204
- let enumerator = args
205
- .rb_self
206
- .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
207
- Ok(Yield::Enumerator(enumerator))
208
- }
209
-
210
- #[derive(Debug)]
211
- pub enum Record<S: BuildHasher + Default> {
212
- Vec(Vec<ParquetField>),
213
- Map(HashMap<&'static str, ParquetField, S>),
214
- }
145
+ fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
146
+ if let SchemaType::GroupType { fields, .. } = schema {
147
+ let projected_fields: Vec<TypePtr> = fields
148
+ .iter()
149
+ .filter(|field| columns.contains(&field.name().to_string()))
150
+ .cloned()
151
+ .collect();
215
152
 
216
- impl<S: BuildHasher + Default> IntoValue for Record<S> {
217
- #[inline]
218
- fn into_value_with(self, handle: &Ruby) -> Value {
219
- match self {
220
- Record::Vec(vec) => {
221
- let ary = handle.ary_new_capa(vec.len());
222
- vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
223
- ary.into_value_with(handle)
224
- }
225
- Record::Map(map) => {
226
- // Pre-allocate the hash with the known size
227
- let hash = handle.hash_new_capa(map.len());
228
- map.into_iter()
229
- .try_for_each(|(k, v)| hash.aset(k, v))
230
- .unwrap();
231
- hash.into_value_with(handle)
232
- }
153
+ SchemaType::GroupType {
154
+ basic_info: schema.get_basic_info().clone(),
155
+ fields: projected_fields,
233
156
  }
157
+ } else {
158
+ // Return original schema if not a group type
159
+ schema.clone()
234
160
  }
235
161
  }
236
162
 
237
- #[derive(Debug, Clone)]
238
- pub struct CowValue<'a>(pub Cow<'a, str>);
239
-
240
- impl<'a> IntoValue for CowValue<'a> {
241
- fn into_value_with(self, handle: &Ruby) -> Value {
242
- self.0.into_value_with(handle)
243
- }
244
- }
245
-
246
- #[derive(Debug)]
247
- pub struct ParquetField(Field);
248
-
249
- impl<'a> IntoValue for ParquetField {
250
- fn into_value_with(self, handle: &Ruby) -> Value {
251
- match self.0 {
252
- Field::Byte(b) => b.into_value_with(handle),
253
- Field::Bool(b) => b.into_value_with(handle),
254
- Field::Short(s) => s.into_value_with(handle),
255
- Field::Int(i) => i.into_value_with(handle),
256
- Field::Long(l) => l.into_value_with(handle),
257
- Field::UByte(ub) => ub.into_value_with(handle),
258
- Field::UShort(us) => us.into_value_with(handle),
259
- Field::UInt(ui) => ui.into_value_with(handle),
260
- Field::ULong(ul) => ul.into_value_with(handle),
261
- Field::Float16(f) => f32::from(f).into_value_with(handle),
262
- Field::Float(f) => f.into_value_with(handle),
263
- Field::Double(d) => d.into_value_with(handle),
264
-
265
- Field::Str(s) => s.into_value_with(handle),
266
- Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
267
- Field::Date(d) => d.into_value_with(handle),
268
- Field::TimestampMillis(ts) => ts.into_value_with(handle),
269
- Field::TimestampMicros(ts) => ts.into_value_with(handle),
270
- Field::ListInternal(list) => {
271
- let ary = handle.ary_new_capa(list.elements().len());
272
- list.elements()
273
- .iter()
274
- .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
275
- .unwrap();
276
- ary.into_value_with(handle)
277
- }
278
- Field::MapInternal(map) => {
279
- let hash = handle.hash_new_capa(map.entries().len());
280
- map.entries()
281
- .iter()
282
- .try_for_each(|(k, v)| {
283
- hash.aset(
284
- ParquetField(k.clone()).into_value_with(handle),
285
- ParquetField(v.clone()).into_value_with(handle),
286
- )
287
- })
288
- .unwrap();
289
- hash.into_value_with(handle)
290
- }
291
- // Field::Decimal(d) => d.to_string().into_value_with(handle),
292
- // Field::Group(row) => row.into_value_with(handle),
293
- Field::Null => handle.qnil().as_value(),
294
- _ => panic!("Unsupported field type"),
295
- }
296
- }
163
+ #[derive(Error, Debug)]
164
+ pub enum ReaderError {
165
+ #[error("Failed to get file descriptor: {0}")]
166
+ FileDescriptor(String),
167
+ #[error("Invalid file descriptor")]
168
+ InvalidFileDescriptor,
169
+ #[error("Failed to open file: {0}")]
170
+ FileOpen(#[from] io::Error),
171
+ #[error("Failed to intern headers: {0}")]
172
+ HeaderIntern(#[from] CacheError),
173
+ #[error("Ruby error: {0}")]
174
+ Ruby(String),
297
175
  }
298
176
 
299
- struct SeekableRubyValue(Opaque<Value>);
300
-
301
- impl Length for SeekableRubyValue {
302
- fn len(&self) -> u64 {
303
- let ruby = unsafe { Ruby::get_unchecked() };
304
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
305
- let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
306
- let file_len = reader.seek(SeekFrom::End(0)).unwrap();
307
- reader.seek(SeekFrom::Start(current_pos)).unwrap();
308
- file_len
177
+ impl From<MagnusError> for ReaderError {
178
+ fn from(err: MagnusError) -> Self {
179
+ Self::Ruby(err.to_string())
309
180
  }
310
181
  }
311
182
 
312
- impl ChunkReader for SeekableRubyValue {
313
- type T = BufReader<Box<dyn SeekableRead>>;
314
-
315
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
316
- let ruby = unsafe { Ruby::get_unchecked() };
317
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
318
- reader.seek(SeekFrom::Start(start))?;
319
- Ok(BufReader::new(reader))
320
- }
321
-
322
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
323
- let ruby = unsafe { Ruby::get_unchecked() };
324
- let mut buffer = Vec::with_capacity(length);
325
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
326
- reader.seek(SeekFrom::Start(start))?;
327
- let read = reader.take(length as _).read_to_end(&mut buffer)?;
328
-
329
- if read != length {
330
- return Err(ParquetError::EOF(format!(
331
- "Expected to read {} bytes, read only {}",
332
- length, read
333
- )));
334
- }
335
- Ok(buffer.into())
183
+ impl From<ReaderError> for MagnusError {
184
+ fn from(err: ReaderError) -> Self {
185
+ MagnusError::new(
186
+ Ruby::get().unwrap().exception_runtime_error(),
187
+ err.to_string(),
188
+ )
336
189
  }
337
190
  }
@@ -0,0 +1,77 @@
1
+ use std::{
2
+ fs::File,
3
+ io::{BufReader, SeekFrom},
4
+ mem::ManuallyDrop,
5
+ };
6
+
7
+ use bytes::Bytes;
8
+ use magnus::{value::Opaque, Ruby, Value};
9
+ use parquet::{
10
+ errors::ParquetError,
11
+ file::reader::{ChunkReader, Length},
12
+ };
13
+ use std::io::Read;
14
+
15
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
16
+
17
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
18
+
19
+ pub struct SeekableRubyValue(pub Opaque<Value>);
20
+
21
+ impl Length for SeekableRubyValue {
22
+ fn len(&self) -> u64 {
23
+ let ruby = unsafe { Ruby::get_unchecked() };
24
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
25
+ let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
26
+ let file_len = reader.seek(SeekFrom::End(0)).unwrap();
27
+ reader.seek(SeekFrom::Start(current_pos)).unwrap();
28
+ file_len
29
+ }
30
+ }
31
+
32
+ impl ChunkReader for SeekableRubyValue {
33
+ type T = BufReader<Box<dyn SeekableRead>>;
34
+
35
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
36
+ let ruby = unsafe { Ruby::get_unchecked() };
37
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
38
+ reader.seek(SeekFrom::Start(start))?;
39
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
40
+ }
41
+
42
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
43
+ let ruby = unsafe { Ruby::get_unchecked() };
44
+ let mut buffer = Vec::with_capacity(length);
45
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
46
+ reader.seek(SeekFrom::Start(start))?;
47
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
48
+
49
+ if read != length {
50
+ return Err(ParquetError::EOF(format!(
51
+ "Expected to read {} bytes, read only {}",
52
+ length, read
53
+ )));
54
+ }
55
+ Ok(buffer.into())
56
+ }
57
+ }
58
+
59
+ pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
60
+
61
+ impl Length for ForgottenFileHandle {
62
+ fn len(&self) -> u64 {
63
+ self.0.len()
64
+ }
65
+ }
66
+
67
+ impl ChunkReader for ForgottenFileHandle {
68
+ type T = BufReader<File>;
69
+
70
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
71
+ self.0.get_read(start)
72
+ }
73
+
74
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
75
+ self.0.get_bytes(start, length)
76
+ }
77
+ }
@@ -2,30 +2,24 @@ use magnus::{
2
2
  value::{Opaque, ReprValue},
3
3
  RClass, RString, Ruby, Value,
4
4
  };
5
- use std::io::{self, Read, Seek};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
6
6
  use std::sync::OnceLock;
7
7
 
8
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
9
 
10
- const READ_BUFFER_SIZE: usize = 16 * 1024;
11
-
12
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
13
11
  /// and provide a standard Read implementation for them.
14
12
  pub struct RubyReader<T> {
15
13
  inner: T,
16
- buffer: Option<Vec<u8>>,
17
14
  offset: usize,
18
- // Number of bytes that have been read into the buffer
19
- // Used as an upper bound for offset
20
- buffered_bytes: usize,
21
15
  }
22
16
 
23
17
  pub trait SeekableRead: std::io::Read + Seek {}
24
18
  impl SeekableRead for RubyReader<Value> {}
25
19
  impl SeekableRead for RubyReader<RString> {}
26
20
 
27
- pub fn build_ruby_reader<'a>(
28
- ruby: &'a Ruby,
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
29
23
  input: Value,
30
24
  ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
25
  if RubyReader::is_string_io(ruby, &input) {
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
39
33
 
40
34
  impl Seek for RubyReader<Value> {
41
35
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
- let seek_to = match pos {
43
- io::SeekFrom::Start(offset) => {
44
- // SEEK_SET - absolute position
45
- offset as i64
46
- }
47
- io::SeekFrom::End(offset) => {
48
- // SEEK_END - from end of stream
49
- offset
50
- }
51
- io::SeekFrom::Current(offset) => {
52
- // SEEK_CUR - relative to current
53
- offset
54
- }
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
55
40
  };
56
41
 
57
- let whence = match pos {
58
- io::SeekFrom::Start(_) => 0, // SEEK_SET
59
- io::SeekFrom::End(_) => 2, // SEEK_END
60
- io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
- };
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
62
50
 
63
- // Call Ruby's seek method
64
- let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
65
54
 
66
- // Get current position
67
- let pos: u64 = self.inner.funcall("pos", ()).unwrap();
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
68
59
 
69
- Ok(pos)
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
70
69
  }
71
70
  }
72
71
 
73
72
  impl Seek for RubyReader<RString> {
74
73
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
74
  match pos {
76
- io::SeekFrom::Start(offset) => {
77
- self.offset = offset as usize;
78
- }
79
- io::SeekFrom::End(offset) => {
80
- self.offset = (self.inner.len() - offset as usize) as usize;
81
- }
82
- io::SeekFrom::Current(offset) => {
83
- self.offset = (self.offset as i64 + offset) as usize;
84
- }
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
85
78
  }
86
79
  Ok(self.offset as u64)
87
80
  }
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
106
99
  fn from_io_like(input: Value) -> Self {
107
100
  Self {
108
101
  inner: input,
109
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
102
  offset: 0,
111
- buffered_bytes: 0,
112
103
  }
113
104
  }
114
-
115
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
- if let Some(from_buf) = &self.buffer {
117
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
- if self.offset < self.buffered_bytes {
119
- let remaining = self.buffered_bytes - self.offset;
120
- let copy_size = remaining.min(to_buf.len());
121
- to_buf[..copy_size]
122
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
- self.offset += copy_size;
124
- Some(Ok(copy_size))
125
- } else {
126
- None
127
- }
128
- } else {
129
- None
130
- }
131
- }
132
-
133
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
- let buffer = self.buffer.as_mut().unwrap();
135
- let result = self
136
- .inner
137
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
-
140
- if result.is_nil() {
141
- return Ok(0); // EOF
142
- }
143
-
144
- let bytes = unsafe { result.as_slice() };
145
-
146
- // Update internal buffer
147
- let bytes_len = bytes.len();
148
- if bytes_len == 0 {
149
- return Ok(0);
150
- }
151
-
152
- // Only copy what we actually read
153
- buffer[..bytes_len].copy_from_slice(bytes);
154
- self.buffered_bytes = bytes_len;
155
-
156
- // Copy to output buffer
157
- let copy_size = bytes_len.min(buf.len());
158
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
- self.offset = copy_size;
160
- Ok(copy_size)
161
- }
162
105
  }
163
106
 
164
107
  impl RubyReader<RString> {
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
176
119
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
120
  Ok(Box::new(Self {
178
121
  inner: string_content,
179
- buffer: None,
180
122
  offset: 0,
181
- buffered_bytes: 0,
182
123
  }))
183
124
  }
184
125
 
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
197
138
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
139
  Ok(Box::new(Self {
199
140
  inner: string_content,
200
- buffer: None,
201
141
  offset: 0,
202
- buffered_bytes: 0,
203
142
  }))
204
143
  }
205
144
  }
206
145
 
207
146
  impl Read for RubyReader<Value> {
208
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
- if let Some(result) = self.read_from_buffer(buf) {
210
- result
211
- } else {
212
- // If the buffer is empty, read from Ruby
213
- self.read_from_ruby(buf)
214
- }
147
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
148
+ let bytes = self
149
+ .inner
150
+ .funcall::<_, _, RString>("read", (buf.len(),))
151
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
152
+
153
+ buf.write_all(unsafe { bytes.as_slice() })?;
154
+
155
+ Ok(bytes.len())
215
156
  }
216
157
  }
217
158
 
@@ -0,0 +1,90 @@
1
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
2
+
3
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
4
+ use parquet::record::Field;
5
+
6
/// A collection of `ParquetField` values in one of two shapes: positional
/// (`Vec`) or keyed by a `&'static str` (`Map`).
///
/// `S` is the hasher type used by the `Map` variant's `HashMap`.
+ #[derive(Debug)]
7
+ pub enum Record<S: BuildHasher + Default> {
8
/// Positional values, kept in `Vec` order.
+ Vec(Vec<ParquetField>),
9
/// Values keyed by `&'static str`.
/// NOTE(review): the keys are presumably column names — confirm where records are built.
+ Map(HashMap<&'static str, ParquetField, S>),
10
+ }
11
+
12
// Converts a `Record` into a Ruby value: `Record::Vec` becomes a Ruby array
// and `Record::Map` becomes a Ruby hash. Each contained `ParquetField` is
// handed to `push`/`aset` as-is.
+ impl<S: BuildHasher + Default> IntoValue for Record<S> {
13
+ fn into_value_with(self, handle: &Ruby) -> Value {
14
+ match self {
15
+ Record::Vec(vec) => {
16
// Allocate the Ruby array with capacity for every field up front.
+ let ary = handle.ary_new_capa(vec.len());
17
// `try_for_each` stops at the first failed `push`; `unwrap` turns that
// failure into a panic.
+ vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
18
+ ary.into_value_with(handle)
19
+ }
20
+ Record::Map(map) => {
21
+ let hash = handle.hash_new_capa(map.len());
22
// Entries are inserted in `HashMap` iteration order, which is arbitrary,
// so the key order of the resulting Ruby hash is not fixed by this code.
+ map.into_iter()
23
+ .try_for_each(|(k, v)| hash.aset(k, v))
24
// As above: a failed `aset` panics here.
+ .unwrap();
25
+ hash.into_value_with(handle)
26
+ }
27
+ }
28
+ }
29
+ }
30
+
31
/// Newtype wrapper around a borrowed-or-owned string (`Cow<'a, str>`).
/// The inner value is public and reachable as `.0`.
+ #[derive(Debug, Clone)]
32
+ pub struct CowValue<'a>(pub Cow<'a, str>);
33
+
34
// Converts a `CowValue` into a Ruby value by delegating to the `IntoValue`
// conversion of the wrapped `Cow<'a, str>`.
+ impl<'a> IntoValue for CowValue<'a> {
35
+ fn into_value_with(self, handle: &Ruby) -> Value {
36
+ self.0.into_value_with(handle)
37
+ }
38
+ }
39
+
40
/// Newtype wrapper around `parquet::record::Field`, giving this crate a local
/// type on which to implement `IntoValue`.
+ #[derive(Debug)]
41
+ pub struct ParquetField(pub Field);
42
+
43
// Converts one parquet `Field` into a Ruby value.
//
// Scalar variants are converted through the `IntoValue` impl of their
// payload. List and map variants are converted recursively into a Ruby
// array / hash. `Field::Null` becomes `nil`. Any variant not listed here
// hits the catch-all arm and panics.
+ impl IntoValue for ParquetField {
44
+ fn into_value_with(self, handle: &Ruby) -> Value {
45
+ match self.0 {
46
+ Field::Byte(b) => b.into_value_with(handle),
47
+ Field::Bool(b) => b.into_value_with(handle),
48
+ Field::Short(s) => s.into_value_with(handle),
49
+ Field::Int(i) => i.into_value_with(handle),
50
+ Field::Long(l) => l.into_value_with(handle),
51
+ Field::UByte(ub) => ub.into_value_with(handle),
52
+ Field::UShort(us) => us.into_value_with(handle),
53
+ Field::UInt(ui) => ui.into_value_with(handle),
54
+ Field::ULong(ul) => ul.into_value_with(handle),
55
// Half-precision values are widened to f32 before conversion.
+ Field::Float16(f) => f32::from(f).into_value_with(handle),
56
+ Field::Float(f) => f.into_value_with(handle),
57
+ Field::Double(d) => d.into_value_with(handle),
58
+ Field::Str(s) => s.into_value_with(handle),
59
// Byte arrays are copied into a new Ruby string built from the raw slice.
+ Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
60
// NOTE(review): date/timestamp payloads are converted as-is, so Ruby presumably
// receives raw numbers rather than Date/Time objects — confirm against the
// `Field` definition of the parquet crate version in use.
+ Field::Date(d) => d.into_value_with(handle),
61
+ Field::TimestampMillis(ts) => ts.into_value_with(handle),
62
+ Field::TimestampMicros(ts) => ts.into_value_with(handle),
63
+ Field::ListInternal(list) => {
64
+ let elements = list.elements();
65
+ let ary = handle.ary_new_capa(elements.len());
66
+ elements
67
+ .iter()
68
// Elements are only borrowed from the list, so each one is cloned to build
// the owned `ParquetField` that the recursive conversion consumes.
+ .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
69
// A failed `push` panics here.
+ .unwrap();
70
+ ary.into_value_with(handle)
71
+ }
72
+ Field::MapInternal(map) => {
73
+ let entries = map.entries();
74
+ let hash = handle.hash_new_capa(entries.len());
75
+ entries
76
+ .iter()
77
+ .try_for_each(|(k, v)| {
78
// Both key and value are cloned and converted recursively.
+ hash.aset(
79
+ ParquetField(k.clone()).into_value_with(handle),
80
+ ParquetField(v.clone()).into_value_with(handle),
81
+ )
82
+ })
83
// A failed `aset` panics here.
+ .unwrap();
84
+ hash.into_value_with(handle)
85
+ }
86
+ Field::Null => handle.qnil().as_value(),
87
// NOTE(review): every other `Field` variant aborts the conversion with a panic;
// `into_value_with` returns a plain `Value`, so there is no error path to use instead.
+ _ => panic!("Unsupported field type"),
88
+ }
89
+ }
90
+ }
@@ -29,6 +29,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
29
29
  /// Arguments accepted by the parquet reading entry point, as produced by
  /// `parse_parquet_args`.
  pub struct ParquetArgs {
30
30
  /// The single required positional argument: the Ruby value to read from.
  pub to_read: Value,
31
31
  /// Result format derived from the optional `result_type` keyword.
  pub result_type: String,
32
  /// Value of the optional `columns` keyword (a list of strings);
  /// `None` when the keyword is absent.
+ pub columns: Option<Vec<String>>,
32
33
  }
33
34
 
34
35
  /// Parse common arguments for Parquet parsing
@@ -36,8 +37,11 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
36
37
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
37
38
  let (to_read,) = parsed_args.required;
38
39
 
39
- let kwargs =
40
- get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
40
+ let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
41
+ parsed_args.keywords,
42
+ &[],
43
+ &["result_type", "columns"],
44
+ )?;
41
45
 
42
46
  let result_type = match kwargs
43
47
  .optional
@@ -66,5 +70,6 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
66
70
  Ok(ParquetArgs {
67
71
  to_read,
68
72
  result_type,
73
+ columns: kwargs.optional.1,
69
74
  })
70
75
  }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,17 +1,19 @@
1
1
  # typed: strict
2
-
3
2
  # Sorbet signatures for the Parquet module.
  module Parquet
4
3
  # Options for `each_row`:
5
4
  # - `input`: String specifying the input file, or an IO object to read from
6
5
  # - `result_type`: String or Symbol specifying the output format
7
6
  # ("hash" or "array" or :hash or :array)
7
+ # - `columns`: When present, only the specified columns will be included in the output.
8
+ # This is useful for reducing how much data is read and improving performance.
8
9
  # The optional block receives one row at a time, as a Hash or an Array.
  # NOTE(review): the Enumerator return is presumably used when no block is given — confirm.
  sig do
9
10
  params(
10
11
  input: T.any(String, IO),
11
12
  result_type: T.nilable(T.any(String, Symbol)),
13
+ columns: T.nilable(T::Array[String]),
12
14
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
13
15
  ).returns(T.any(Enumerator, T.untyped))
14
16
  end
15
- def self.each_row(input, result_type: nil, &blk)
17
+ def self.each_row(input, result_type: nil, columns: nil, &blk)
16
18
  end
17
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-02 00:00:00.000000000 Z
11
+ date: 2025-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,10 +56,14 @@ files:
56
56
  - Rakefile
57
57
  - ext/parquet/Cargo.toml
58
58
  - ext/parquet/extconf.rb
59
+ - ext/parquet/src/allocator.rs
60
+ - ext/parquet/src/enumerator.rs
59
61
  - ext/parquet/src/header_cache.rs
60
62
  - ext/parquet/src/lib.rs
61
63
  - ext/parquet/src/reader.rs
64
+ - ext/parquet/src/ruby_integration.rs
62
65
  - ext/parquet/src/ruby_reader.rs
66
+ - ext/parquet/src/types.rs
63
67
  - ext/parquet/src/utils.rs
64
68
  - lib/parquet.rb
65
69
  - lib/parquet.rbi