parquet 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 66a87903cde02fff1e0a91627311bc1eba1fccc1d61915b7504c9fc61e10ed39
4
- data.tar.gz: 3d93f8aa12b664fd0dcff05b98dc3f12ba6ce745507ff18847936dafe8958be4
3
+ metadata.gz: 70d9932bf622cd2647423e2519013d3a9f9256217effe9610e9aeaaebbcf1778
4
+ data.tar.gz: fae3767ce0d950c91b17f77b740159d863293e1288063ed15d9b9c1f82e87fe1
5
5
  SHA512:
6
- metadata.gz: f042ba14e11c51849be30d22df7a1c63911bcf2ca6778b463b53878b562676f151f9a91a376baf57c3819a89861af2b76e58a80b66ea8c38870b8cf18664105a
7
- data.tar.gz: f708de1716e9640bba8fc3537c76b2cfefbcf9db709ea560db619eb6a50ef2a3965d06b43cb127541ae33067f2facab8bf75df6d8fe6c3aea6cadb765d099a2f
6
+ metadata.gz: a03e75bcd377ce5a61cd5f17685995c420601ac5917bd3d4a99dc082686423729ee5f0913bb032fe826dd1a8bac9b52c152cfb2037a376751258c17f3b0e63b1
7
+ data.tar.gz: ddfbb0ee14a6b7dcce47caf41962afe9610ab175d2b829c2744d62bed67cc746e64d214f64318220f2301a9ce8dcdecf9f9f9e90786df3d18f244716724abef8
data/Cargo.lock CHANGED
@@ -749,6 +749,26 @@ version = "1.0.14"
749
749
  source = "registry+https://github.com/rust-lang/crates.io-index"
750
750
  checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
751
751
 
752
+ [[package]]
753
+ name = "jemalloc-sys"
754
+ version = "0.5.4+5.3.0-patched"
755
+ source = "registry+https://github.com/rust-lang/crates.io-index"
756
+ checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
757
+ dependencies = [
758
+ "cc",
759
+ "libc",
760
+ ]
761
+
762
+ [[package]]
763
+ name = "jemallocator"
764
+ version = "0.5.4"
765
+ source = "registry+https://github.com/rust-lang/crates.io-index"
766
+ checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
767
+ dependencies = [
768
+ "jemalloc-sys",
769
+ "libc",
770
+ ]
771
+
752
772
  [[package]]
753
773
  name = "jobserver"
754
774
  version = "0.1.32"
@@ -876,6 +896,16 @@ version = "0.2.11"
876
896
  source = "registry+https://github.com/rust-lang/crates.io-index"
877
897
  checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
878
898
 
899
+ [[package]]
900
+ name = "libmimalloc-sys"
901
+ version = "0.1.39"
902
+ source = "registry+https://github.com/rust-lang/crates.io-index"
903
+ checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
904
+ dependencies = [
905
+ "cc",
906
+ "libc",
907
+ ]
908
+
879
909
  [[package]]
880
910
  name = "litemap"
881
911
  version = "0.7.4"
@@ -948,6 +978,15 @@ version = "2.7.4"
948
978
  source = "registry+https://github.com/rust-lang/crates.io-index"
949
979
  checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
950
980
 
981
+ [[package]]
982
+ name = "mimalloc"
983
+ version = "0.1.43"
984
+ source = "registry+https://github.com/rust-lang/crates.io-index"
985
+ checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
986
+ dependencies = [
987
+ "libmimalloc-sys",
988
+ ]
989
+
951
990
  [[package]]
952
991
  name = "minimal-lexical"
953
992
  version = "0.2.1"
@@ -1119,15 +1158,17 @@ dependencies = [
1119
1158
  name = "parquet"
1120
1159
  version = "0.1.0"
1121
1160
  dependencies = [
1161
+ "ahash",
1122
1162
  "bytes",
1163
+ "jemallocator",
1123
1164
  "kanal",
1124
1165
  "magnus 0.7.1",
1166
+ "mimalloc",
1125
1167
  "parquet 54.0.0",
1126
1168
  "rb-sys",
1127
1169
  "serde",
1128
1170
  "serde_magnus",
1129
1171
  "thiserror",
1130
- "xxhash-rust",
1131
1172
  ]
1132
1173
 
1133
1174
  [[package]]
@@ -1796,12 +1837,6 @@ version = "0.5.5"
1796
1837
  source = "registry+https://github.com/rust-lang/crates.io-index"
1797
1838
  checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1798
1839
 
1799
- [[package]]
1800
- name = "xxhash-rust"
1801
- version = "0.8.14"
1802
- source = "registry+https://github.com/rust-lang/crates.io-index"
1803
- checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
1804
-
1805
1840
  [[package]]
1806
1841
  name = "yoke"
1807
1842
  version = "0.7.5"
data/Gemfile CHANGED
@@ -6,7 +6,12 @@ gem "rake"
6
6
  # Use local version of parquet
7
7
  gemspec
8
8
 
9
- group :development, :test do
10
- gem "minitest", "~> 5.0"
9
+ group :development do
11
10
  gem "benchmark-ips", "~> 2.12"
11
+ # gem "polars-df"
12
+ # gem "duckdb"
13
+ end
14
+
15
+ group :test do
16
+ gem "minitest", "~> 5.0"
12
17
  end
@@ -7,6 +7,7 @@ edition = "2021"
7
7
  crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
+ ahash = "0.8"
10
11
  parquet = { version = "^54.0", features = ["json", "object_store"] }
11
12
  bytes = "^1.9"
12
13
  kanal = "0.1.0-pre8"
@@ -15,4 +16,9 @@ rb-sys = "^0.9"
15
16
  serde = { version = "1.0", features = ["derive"] }
16
17
  serde_magnus = "0.8.1"
17
18
  thiserror = "2.0"
18
- xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
19
+
20
+ [target.'cfg(target_os = "linux")'.dependencies]
21
+ jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
22
+
23
+ [target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
24
+ mimalloc = { version = "0.1", default-features = false }
@@ -0,0 +1,13 @@
1
+ #[cfg(target_os = "linux")]
2
+ use jemallocator::Jemalloc;
3
+
4
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
5
+ use mimalloc::MiMalloc;
6
+
7
+ #[global_allocator]
8
+ #[cfg(target_os = "linux")]
9
+ static ALLOC: Jemalloc = Jemalloc;
10
+
11
+ #[global_allocator]
12
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
13
+ static ALLOC: MiMalloc = MiMalloc;
@@ -0,0 +1,28 @@
1
+ use ahash::RandomState;
2
+ use magnus::{
3
+ block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
+ };
5
+
6
+ use crate::Record;
7
+
8
+ pub struct EnumeratorArgs {
9
+ pub rb_self: Value,
10
+ pub to_read: Value,
11
+ pub result_type: String,
12
+ pub columns: Option<Vec<String>>,
13
+ }
14
+
15
+ #[inline]
16
+ pub fn create_enumerator(
17
+ args: EnumeratorArgs,
18
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
19
+ let kwargs = RHash::new();
20
+ kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
21
+ if let Some(columns) = args.columns {
22
+ kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
23
+ }
24
+ let enumerator = args
25
+ .rb_self
26
+ .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
27
+ Ok(Yield::Enumerator(enumerator))
28
+ }
@@ -6,7 +6,7 @@
6
6
  /// so this optimization could be removed if any issues arise.
7
7
  use std::{
8
8
  collections::HashMap,
9
- sync::{atomic::AtomicU32, LazyLock, Mutex},
9
+ sync::{atomic::AtomicU32, LazyLock, Mutex, OnceLock},
10
10
  };
11
11
  use thiserror::Error;
12
12
 
@@ -79,3 +79,25 @@ impl StringCache {
79
79
  Ok(())
80
80
  }
81
81
  }
82
+
83
+ pub struct HeaderCacheCleanupIter<I> {
84
+ pub inner: I,
85
+ pub headers: OnceLock<Vec<&'static str>>,
86
+ }
87
+
88
+ impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
89
+ type Item = I::Item;
90
+
91
+ #[inline(always)]
92
+ fn next(&mut self) -> Option<Self::Item> {
93
+ self.inner.next()
94
+ }
95
+ }
96
+
97
+ impl<I> Drop for HeaderCacheCleanupIter<I> {
98
+ fn drop(&mut self) {
99
+ if let Some(headers) = self.headers.get() {
100
+ StringCache::clear(&headers).unwrap();
101
+ }
102
+ }
103
+ }
@@ -1,9 +1,16 @@
1
+ mod allocator;
2
+ mod enumerator;
1
3
  pub mod header_cache;
2
4
  mod reader;
5
+ mod ruby_integration;
3
6
  mod ruby_reader;
7
+ mod types;
4
8
  mod utils;
5
9
 
10
+ use crate::enumerator::*;
6
11
  use crate::reader::*;
12
+ use crate::ruby_integration::*;
13
+ use crate::types::*;
7
14
 
8
15
  use magnus::{Error, Ruby};
9
16
 
@@ -1,106 +1,39 @@
1
- use crate::header_cache::{CacheError, StringCache};
2
- use crate::ruby_reader::{build_ruby_reader, SeekableRead};
3
- use crate::utils::*;
4
- use bytes::Bytes;
1
+ // =============================================================================
2
+ // Imports and Dependencies
3
+ // =============================================================================
4
+ use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
+ use crate::{
6
+ create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
7
+ SeekableRubyValue,
8
+ };
9
+ use ahash::RandomState;
5
10
  use magnus::rb_sys::AsRawValue;
6
11
  use magnus::value::{Opaque, ReprValue};
7
- use magnus::IntoValue;
8
- use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
9
- use parquet::errors::ParquetError;
10
- use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
11
- use parquet::record::Field;
12
+ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
13
+ use parquet::file::reader::FileReader;
14
+ use parquet::file::reader::SerializedFileReader;
15
+ use parquet::record::reader::RowIter as ParquetRowIter;
16
+ use parquet::schema::types::{Type as SchemaType, TypePtr};
12
17
  use std::collections::HashMap;
13
18
  use std::fs::File;
14
- use std::io::{self, BufReader, Read, Seek, SeekFrom};
19
+ use std::io::{self};
15
20
  use std::mem::ManuallyDrop;
16
21
  use std::os::fd::FromRawFd;
17
22
  use std::sync::OnceLock;
18
- use std::{borrow::Cow, hash::BuildHasher};
19
23
  use thiserror::Error;
20
- use xxhash_rust::xxh3::Xxh3Builder;
21
-
22
- use parquet::record::reader::RowIter as ParquetRowIter;
23
-
24
- #[derive(Error, Debug)]
25
- pub enum ReaderError {
26
- #[error("Failed to get file descriptor: {0}")]
27
- FileDescriptor(String),
28
- #[error("Invalid file descriptor")]
29
- InvalidFileDescriptor,
30
- #[error("Failed to open file: {0}")]
31
- FileOpen(#[from] io::Error),
32
- #[error("Failed to intern headers: {0}")]
33
- HeaderIntern(#[from] CacheError),
34
- #[error("Ruby error: {0}")]
35
- Ruby(String),
36
- }
37
-
38
- impl From<MagnusError> for ReaderError {
39
- fn from(err: MagnusError) -> Self {
40
- Self::Ruby(err.to_string())
41
- }
42
- }
43
-
44
- impl From<ReaderError> for MagnusError {
45
- fn from(err: ReaderError) -> Self {
46
- MagnusError::new(
47
- Ruby::get().unwrap().exception_runtime_error(),
48
- err.to_string(),
49
- )
50
- }
51
- }
52
-
53
- struct ForgottenFileHandle(ManuallyDrop<File>);
54
-
55
- impl Length for ForgottenFileHandle {
56
- fn len(&self) -> u64 {
57
- self.0.len()
58
- }
59
- }
60
-
61
- impl ChunkReader for ForgottenFileHandle {
62
- type T = BufReader<File>;
63
-
64
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
65
- self.0.get_read(start)
66
- }
67
-
68
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
69
- self.0.get_bytes(start, length)
70
- }
71
- }
72
-
73
- struct HeaderCacheCleanupIter<I> {
74
- inner: I,
75
- headers: OnceLock<Vec<&'static str>>,
76
- }
77
-
78
- impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
79
- type Item = I::Item;
80
-
81
- fn next(&mut self) -> Option<Self::Item> {
82
- self.inner.next()
83
- }
84
- }
85
-
86
- impl<I> Drop for HeaderCacheCleanupIter<I> {
87
- fn drop(&mut self) {
88
- if let Some(headers) = self.headers.get() {
89
- StringCache::clear(&headers).unwrap();
90
- }
91
- }
92
- }
93
24
 
25
+ #[inline]
94
26
  pub fn parse_parquet<'a>(
95
27
  rb_self: Value,
96
28
  args: &[Value],
97
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
29
+ ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
98
30
  let original = unsafe { Ruby::get_unchecked() };
99
31
  let ruby: &'static Ruby = Box::leak(Box::new(original));
100
32
 
101
33
  let ParquetArgs {
102
34
  to_read,
103
35
  result_type,
36
+ columns,
104
37
  } = parse_parquet_args(&ruby, args)?;
105
38
 
106
39
  if !ruby.block_given() {
@@ -108,15 +41,18 @@ pub fn parse_parquet<'a>(
108
41
  rb_self,
109
42
  to_read,
110
43
  result_type,
44
+ columns,
111
45
  });
112
46
  }
113
47
 
114
- let iter = if to_read.is_kind_of(ruby.class_string()) {
48
+ let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
115
49
  let path_string = to_read.to_r_string()?;
116
50
  let file_path = unsafe { path_string.as_str()? };
117
51
  let file = File::open(file_path).unwrap();
118
52
  let reader = SerializedFileReader::new(file).unwrap();
119
- ParquetRowIter::from_file_into(Box::new(reader))
53
+ let schema = reader.metadata().file_metadata().schema().clone();
54
+
55
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
120
56
  } else if to_read.is_kind_of(ruby.class_io()) {
121
57
  let raw_value = to_read.as_raw();
122
58
  let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
@@ -131,14 +67,28 @@ pub fn parse_parquet<'a>(
131
67
  let file = unsafe { File::from_raw_fd(fd) };
132
68
  let file = ForgottenFileHandle(ManuallyDrop::new(file));
133
69
  let reader = SerializedFileReader::new(file).unwrap();
134
- ParquetRowIter::from_file_into(Box::new(reader))
70
+ let schema = reader.metadata().file_metadata().schema().clone();
71
+
72
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
135
73
  } else {
136
74
  let readable = SeekableRubyValue(Opaque::from(to_read));
137
75
  let reader = SerializedFileReader::new(readable).unwrap();
138
- ParquetRowIter::from_file_into(Box::new(reader))
76
+ let schema = reader.metadata().file_metadata().schema().clone();
77
+
78
+ (schema, ParquetRowIter::from_file_into(Box::new(reader)))
139
79
  };
140
80
 
141
- let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
81
+ if let Some(cols) = columns {
82
+ let projection = create_projection_schema(&schema, &cols);
83
+ iter = iter.project(Some(projection.to_owned())).map_err(|e| {
84
+ MagnusError::new(
85
+ ruby.exception_runtime_error(),
86
+ format!("Failed to create projection: {}", e),
87
+ )
88
+ })?;
89
+ }
90
+
91
+ let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
142
92
  "hash" => {
143
93
  let headers = OnceLock::new();
144
94
  let headers_clone = headers.clone();
@@ -146,21 +96,23 @@ pub fn parse_parquet<'a>(
146
96
  .filter_map(move |row| {
147
97
  row.ok().map(|row| {
148
98
  let headers = headers_clone.get_or_init(|| {
149
- row.get_column_iter()
150
- .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
151
- .collect::<Vec<_>>()
99
+ let column_count = row.get_column_iter().count();
100
+ let mut headers = Vec::with_capacity(column_count);
101
+ row.get_column_iter().for_each(|(k, _)| {
102
+ headers.push(StringCache::intern(k.to_owned()).unwrap())
103
+ });
104
+ headers
152
105
  });
153
106
 
154
- row.get_column_iter()
155
- .enumerate()
156
- .map(|(i, (_, v))| {
157
- let key = headers[i];
158
- (key, ParquetField(v.clone()))
159
- })
160
- .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
107
+ let mut map =
108
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
109
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
110
+ map.insert(headers[i], ParquetField(v.clone()));
111
+ });
112
+ map
161
113
  })
162
114
  })
163
- .map(|row| Record::Map(row));
115
+ .map(Record::Map);
164
116
 
165
117
  Box::new(HeaderCacheCleanupIter {
166
118
  inner: iter,
@@ -170,12 +122,14 @@ pub fn parse_parquet<'a>(
170
122
  "array" => Box::new(
171
123
  iter.filter_map(|row| {
172
124
  row.ok().map(|row| {
125
+ let column_count = row.get_column_iter().count();
126
+ let mut vec = Vec::with_capacity(column_count);
173
127
  row.get_column_iter()
174
- .map(|(_, v)| ParquetField(v.clone()))
175
- .collect::<Vec<ParquetField>>()
128
+ .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
129
+ vec
176
130
  })
177
131
  })
178
- .map(|row| Record::Vec(row)),
132
+ .map(Record::Vec),
179
133
  ),
180
134
  _ => {
181
135
  return Err(MagnusError::new(
@@ -188,150 +142,49 @@ pub fn parse_parquet<'a>(
188
142
  Ok(Yield::Iter(iter))
189
143
  }
190
144
 
191
- struct EnumeratorArgs {
192
- rb_self: Value,
193
- to_read: Value,
194
- result_type: String,
195
- }
196
-
197
- fn create_enumerator(
198
- args: EnumeratorArgs,
199
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
200
- let kwargs = RHash::new();
201
-
202
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
203
-
204
- let enumerator = args
205
- .rb_self
206
- .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
207
- Ok(Yield::Enumerator(enumerator))
208
- }
209
-
210
- #[derive(Debug)]
211
- pub enum Record<S: BuildHasher + Default> {
212
- Vec(Vec<ParquetField>),
213
- Map(HashMap<&'static str, ParquetField, S>),
214
- }
145
+ fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
146
+ if let SchemaType::GroupType { fields, .. } = schema {
147
+ let projected_fields: Vec<TypePtr> = fields
148
+ .iter()
149
+ .filter(|field| columns.contains(&field.name().to_string()))
150
+ .cloned()
151
+ .collect();
215
152
 
216
- impl<S: BuildHasher + Default> IntoValue for Record<S> {
217
- #[inline]
218
- fn into_value_with(self, handle: &Ruby) -> Value {
219
- match self {
220
- Record::Vec(vec) => {
221
- let ary = handle.ary_new_capa(vec.len());
222
- vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
223
- ary.into_value_with(handle)
224
- }
225
- Record::Map(map) => {
226
- // Pre-allocate the hash with the known size
227
- let hash = handle.hash_new_capa(map.len());
228
- map.into_iter()
229
- .try_for_each(|(k, v)| hash.aset(k, v))
230
- .unwrap();
231
- hash.into_value_with(handle)
232
- }
153
+ SchemaType::GroupType {
154
+ basic_info: schema.get_basic_info().clone(),
155
+ fields: projected_fields,
233
156
  }
157
+ } else {
158
+ // Return original schema if not a group type
159
+ schema.clone()
234
160
  }
235
161
  }
236
162
 
237
- #[derive(Debug, Clone)]
238
- pub struct CowValue<'a>(pub Cow<'a, str>);
239
-
240
- impl<'a> IntoValue for CowValue<'a> {
241
- fn into_value_with(self, handle: &Ruby) -> Value {
242
- self.0.into_value_with(handle)
243
- }
244
- }
245
-
246
- #[derive(Debug)]
247
- pub struct ParquetField(Field);
248
-
249
- impl<'a> IntoValue for ParquetField {
250
- fn into_value_with(self, handle: &Ruby) -> Value {
251
- match self.0 {
252
- Field::Byte(b) => b.into_value_with(handle),
253
- Field::Bool(b) => b.into_value_with(handle),
254
- Field::Short(s) => s.into_value_with(handle),
255
- Field::Int(i) => i.into_value_with(handle),
256
- Field::Long(l) => l.into_value_with(handle),
257
- Field::UByte(ub) => ub.into_value_with(handle),
258
- Field::UShort(us) => us.into_value_with(handle),
259
- Field::UInt(ui) => ui.into_value_with(handle),
260
- Field::ULong(ul) => ul.into_value_with(handle),
261
- Field::Float16(f) => f32::from(f).into_value_with(handle),
262
- Field::Float(f) => f.into_value_with(handle),
263
- Field::Double(d) => d.into_value_with(handle),
264
-
265
- Field::Str(s) => s.into_value_with(handle),
266
- Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
267
- Field::Date(d) => d.into_value_with(handle),
268
- Field::TimestampMillis(ts) => ts.into_value_with(handle),
269
- Field::TimestampMicros(ts) => ts.into_value_with(handle),
270
- Field::ListInternal(list) => {
271
- let ary = handle.ary_new_capa(list.elements().len());
272
- list.elements()
273
- .iter()
274
- .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
275
- .unwrap();
276
- ary.into_value_with(handle)
277
- }
278
- Field::MapInternal(map) => {
279
- let hash = handle.hash_new_capa(map.entries().len());
280
- map.entries()
281
- .iter()
282
- .try_for_each(|(k, v)| {
283
- hash.aset(
284
- ParquetField(k.clone()).into_value_with(handle),
285
- ParquetField(v.clone()).into_value_with(handle),
286
- )
287
- })
288
- .unwrap();
289
- hash.into_value_with(handle)
290
- }
291
- // Field::Decimal(d) => d.to_string().into_value_with(handle),
292
- // Field::Group(row) => row.into_value_with(handle),
293
- Field::Null => handle.qnil().as_value(),
294
- _ => panic!("Unsupported field type"),
295
- }
296
- }
163
+ #[derive(Error, Debug)]
164
+ pub enum ReaderError {
165
+ #[error("Failed to get file descriptor: {0}")]
166
+ FileDescriptor(String),
167
+ #[error("Invalid file descriptor")]
168
+ InvalidFileDescriptor,
169
+ #[error("Failed to open file: {0}")]
170
+ FileOpen(#[from] io::Error),
171
+ #[error("Failed to intern headers: {0}")]
172
+ HeaderIntern(#[from] CacheError),
173
+ #[error("Ruby error: {0}")]
174
+ Ruby(String),
297
175
  }
298
176
 
299
- struct SeekableRubyValue(Opaque<Value>);
300
-
301
- impl Length for SeekableRubyValue {
302
- fn len(&self) -> u64 {
303
- let ruby = unsafe { Ruby::get_unchecked() };
304
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
305
- let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
306
- let file_len = reader.seek(SeekFrom::End(0)).unwrap();
307
- reader.seek(SeekFrom::Start(current_pos)).unwrap();
308
- file_len
177
+ impl From<MagnusError> for ReaderError {
178
+ fn from(err: MagnusError) -> Self {
179
+ Self::Ruby(err.to_string())
309
180
  }
310
181
  }
311
182
 
312
- impl ChunkReader for SeekableRubyValue {
313
- type T = BufReader<Box<dyn SeekableRead>>;
314
-
315
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
316
- let ruby = unsafe { Ruby::get_unchecked() };
317
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
318
- reader.seek(SeekFrom::Start(start))?;
319
- Ok(BufReader::new(reader))
320
- }
321
-
322
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
323
- let ruby = unsafe { Ruby::get_unchecked() };
324
- let mut buffer = Vec::with_capacity(length);
325
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
326
- reader.seek(SeekFrom::Start(start))?;
327
- let read = reader.take(length as _).read_to_end(&mut buffer)?;
328
-
329
- if read != length {
330
- return Err(ParquetError::EOF(format!(
331
- "Expected to read {} bytes, read only {}",
332
- length, read
333
- )));
334
- }
335
- Ok(buffer.into())
183
+ impl From<ReaderError> for MagnusError {
184
+ fn from(err: ReaderError) -> Self {
185
+ MagnusError::new(
186
+ Ruby::get().unwrap().exception_runtime_error(),
187
+ err.to_string(),
188
+ )
336
189
  }
337
190
  }
@@ -0,0 +1,77 @@
1
+ use std::{
2
+ fs::File,
3
+ io::{BufReader, SeekFrom},
4
+ mem::ManuallyDrop,
5
+ };
6
+
7
+ use bytes::Bytes;
8
+ use magnus::{value::Opaque, Ruby, Value};
9
+ use parquet::{
10
+ errors::ParquetError,
11
+ file::reader::{ChunkReader, Length},
12
+ };
13
+ use std::io::Read;
14
+
15
+ use crate::ruby_reader::{build_ruby_reader, SeekableRead};
16
+
17
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
18
+
19
+ pub struct SeekableRubyValue(pub Opaque<Value>);
20
+
21
+ impl Length for SeekableRubyValue {
22
+ fn len(&self) -> u64 {
23
+ let ruby = unsafe { Ruby::get_unchecked() };
24
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
25
+ let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
26
+ let file_len = reader.seek(SeekFrom::End(0)).unwrap();
27
+ reader.seek(SeekFrom::Start(current_pos)).unwrap();
28
+ file_len
29
+ }
30
+ }
31
+
32
+ impl ChunkReader for SeekableRubyValue {
33
+ type T = BufReader<Box<dyn SeekableRead>>;
34
+
35
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
36
+ let ruby = unsafe { Ruby::get_unchecked() };
37
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
38
+ reader.seek(SeekFrom::Start(start))?;
39
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
40
+ }
41
+
42
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
43
+ let ruby = unsafe { Ruby::get_unchecked() };
44
+ let mut buffer = Vec::with_capacity(length);
45
+ let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
46
+ reader.seek(SeekFrom::Start(start))?;
47
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
48
+
49
+ if read != length {
50
+ return Err(ParquetError::EOF(format!(
51
+ "Expected to read {} bytes, read only {}",
52
+ length, read
53
+ )));
54
+ }
55
+ Ok(buffer.into())
56
+ }
57
+ }
58
+
59
+ pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
60
+
61
+ impl Length for ForgottenFileHandle {
62
+ fn len(&self) -> u64 {
63
+ self.0.len()
64
+ }
65
+ }
66
+
67
+ impl ChunkReader for ForgottenFileHandle {
68
+ type T = BufReader<File>;
69
+
70
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
71
+ self.0.get_read(start)
72
+ }
73
+
74
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
75
+ self.0.get_bytes(start, length)
76
+ }
77
+ }
@@ -2,30 +2,24 @@ use magnus::{
2
2
  value::{Opaque, ReprValue},
3
3
  RClass, RString, Ruby, Value,
4
4
  };
5
- use std::io::{self, Read, Seek};
5
+ use std::io::{self, Read, Seek, SeekFrom, Write};
6
6
  use std::sync::OnceLock;
7
7
 
8
8
  static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
9
 
10
- const READ_BUFFER_SIZE: usize = 16 * 1024;
11
-
12
10
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
13
11
  /// and provide a standard Read implementation for them.
14
12
  pub struct RubyReader<T> {
15
13
  inner: T,
16
- buffer: Option<Vec<u8>>,
17
14
  offset: usize,
18
- // Number of bytes that have been read into the buffer
19
- // Used as an upper bound for offset
20
- buffered_bytes: usize,
21
15
  }
22
16
 
23
17
  pub trait SeekableRead: std::io::Read + Seek {}
24
18
  impl SeekableRead for RubyReader<Value> {}
25
19
  impl SeekableRead for RubyReader<RString> {}
26
20
 
27
- pub fn build_ruby_reader<'a>(
28
- ruby: &'a Ruby,
21
+ pub fn build_ruby_reader(
22
+ ruby: &Ruby,
29
23
  input: Value,
30
24
  ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
31
25
  if RubyReader::is_string_io(ruby, &input) {
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
39
33
 
40
34
  impl Seek for RubyReader<Value> {
41
35
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
42
- let seek_to = match pos {
43
- io::SeekFrom::Start(offset) => {
44
- // SEEK_SET - absolute position
45
- offset as i64
46
- }
47
- io::SeekFrom::End(offset) => {
48
- // SEEK_END - from end of stream
49
- offset
50
- }
51
- io::SeekFrom::Current(offset) => {
52
- // SEEK_CUR - relative to current
53
- offset
54
- }
36
+ let (whence, offset) = match pos {
37
+ SeekFrom::Start(i) => (0, i as i64),
38
+ SeekFrom::Current(i) => (1, i),
39
+ SeekFrom::End(i) => (2, i),
55
40
  };
56
41
 
57
- let whence = match pos {
58
- io::SeekFrom::Start(_) => 0, // SEEK_SET
59
- io::SeekFrom::End(_) => 2, // SEEK_END
60
- io::SeekFrom::Current(_) => 1, // SEEK_CUR
61
- };
42
+ let new_position = self
43
+ .inner
44
+ .funcall("seek", (offset, whence))
45
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
46
+
47
+ Ok(new_position)
48
+ }
49
+ }
62
50
 
63
- // Call Ruby's seek method
64
- let _: u64 = self.inner.funcall("seek", (seek_to, whence)).unwrap();
51
+ impl Write for RubyReader<Value> {
52
+ fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
53
+ let ruby_bytes = RString::from_slice(buf);
65
54
 
66
- // Get current position
67
- let pos: u64 = self.inner.funcall("pos", ()).unwrap();
55
+ let bytes_written = self
56
+ .inner
57
+ .funcall::<_, _, usize>("write", (ruby_bytes,))
58
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
68
59
 
69
- Ok(pos)
60
+ Ok(bytes_written)
61
+ }
62
+
63
+ fn flush(&mut self) -> Result<(), io::Error> {
64
+ self.inner
65
+ .funcall::<_, _, Value>("flush", ())
66
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
67
+
68
+ Ok(())
70
69
  }
71
70
  }
72
71
 
73
72
  impl Seek for RubyReader<RString> {
74
73
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
75
74
  match pos {
76
- io::SeekFrom::Start(offset) => {
77
- self.offset = offset as usize;
78
- }
79
- io::SeekFrom::End(offset) => {
80
- self.offset = (self.inner.len() - offset as usize) as usize;
81
- }
82
- io::SeekFrom::Current(offset) => {
83
- self.offset = (self.offset as i64 + offset) as usize;
84
- }
75
+ io::SeekFrom::Start(offset) => self.offset = offset as usize,
76
+ io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
77
+ io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
85
78
  }
86
79
  Ok(self.offset as u64)
87
80
  }
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
106
99
  fn from_io_like(input: Value) -> Self {
107
100
  Self {
108
101
  inner: input,
109
- buffer: Some(vec![0; READ_BUFFER_SIZE]),
110
102
  offset: 0,
111
- buffered_bytes: 0,
112
103
  }
113
104
  }
114
-
115
- fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
116
- if let Some(from_buf) = &self.buffer {
117
- // If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
118
- if self.offset < self.buffered_bytes {
119
- let remaining = self.buffered_bytes - self.offset;
120
- let copy_size = remaining.min(to_buf.len());
121
- to_buf[..copy_size]
122
- .copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
123
- self.offset += copy_size;
124
- Some(Ok(copy_size))
125
- } else {
126
- None
127
- }
128
- } else {
129
- None
130
- }
131
- }
132
-
133
- fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
134
- let buffer = self.buffer.as_mut().unwrap();
135
- let result = self
136
- .inner
137
- .funcall::<_, _, RString>("read", (buffer.capacity(),))
138
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
139
-
140
- if result.is_nil() {
141
- return Ok(0); // EOF
142
- }
143
-
144
- let bytes = unsafe { result.as_slice() };
145
-
146
- // Update internal buffer
147
- let bytes_len = bytes.len();
148
- if bytes_len == 0 {
149
- return Ok(0);
150
- }
151
-
152
- // Only copy what we actually read
153
- buffer[..bytes_len].copy_from_slice(bytes);
154
- self.buffered_bytes = bytes_len;
155
-
156
- // Copy to output buffer
157
- let copy_size = bytes_len.min(buf.len());
158
- buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
159
- self.offset = copy_size;
160
- Ok(copy_size)
161
- }
162
105
  }
163
106
 
164
107
  impl RubyReader<RString> {
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
176
119
  let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
177
120
  Ok(Box::new(Self {
178
121
  inner: string_content,
179
- buffer: None,
180
122
  offset: 0,
181
- buffered_bytes: 0,
182
123
  }))
183
124
  }
184
125
 
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
197
138
  .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
198
139
  Ok(Box::new(Self {
199
140
  inner: string_content,
200
- buffer: None,
201
141
  offset: 0,
202
- buffered_bytes: 0,
203
142
  }))
204
143
  }
205
144
  }
206
145
 
207
146
  impl Read for RubyReader<Value> {
208
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
209
- if let Some(result) = self.read_from_buffer(buf) {
210
- result
211
- } else {
212
- // If the buffer is empty, read from Ruby
213
- self.read_from_ruby(buf)
214
- }
147
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
148
+ let bytes = self
149
+ .inner
150
+ .funcall::<_, _, RString>("read", (buf.len(),))
151
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
152
+
153
+ buf.write_all(unsafe { bytes.as_slice() })?;
154
+
155
+ Ok(bytes.len())
215
156
  }
216
157
  }
217
158
 
@@ -0,0 +1,90 @@
1
+ use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
2
+
3
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
4
+ use parquet::record::Field;
5
+
6
+ #[derive(Debug)]
7
+ pub enum Record<S: BuildHasher + Default> {
8
+ Vec(Vec<ParquetField>),
9
+ Map(HashMap<&'static str, ParquetField, S>),
10
+ }
11
+
12
+ impl<S: BuildHasher + Default> IntoValue for Record<S> {
13
+ fn into_value_with(self, handle: &Ruby) -> Value {
14
+ match self {
15
+ Record::Vec(vec) => {
16
+ let ary = handle.ary_new_capa(vec.len());
17
+ vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
18
+ ary.into_value_with(handle)
19
+ }
20
+ Record::Map(map) => {
21
+ let hash = handle.hash_new_capa(map.len());
22
+ map.into_iter()
23
+ .try_for_each(|(k, v)| hash.aset(k, v))
24
+ .unwrap();
25
+ hash.into_value_with(handle)
26
+ }
27
+ }
28
+ }
29
+ }
30
+
31
+ #[derive(Debug, Clone)]
32
+ pub struct CowValue<'a>(pub Cow<'a, str>);
33
+
34
+ impl<'a> IntoValue for CowValue<'a> {
35
+ fn into_value_with(self, handle: &Ruby) -> Value {
36
+ self.0.into_value_with(handle)
37
+ }
38
+ }
39
+
40
+ #[derive(Debug)]
41
+ pub struct ParquetField(pub Field);
42
+
43
+ impl IntoValue for ParquetField {
44
+ fn into_value_with(self, handle: &Ruby) -> Value {
45
+ match self.0 {
46
+ Field::Byte(b) => b.into_value_with(handle),
47
+ Field::Bool(b) => b.into_value_with(handle),
48
+ Field::Short(s) => s.into_value_with(handle),
49
+ Field::Int(i) => i.into_value_with(handle),
50
+ Field::Long(l) => l.into_value_with(handle),
51
+ Field::UByte(ub) => ub.into_value_with(handle),
52
+ Field::UShort(us) => us.into_value_with(handle),
53
+ Field::UInt(ui) => ui.into_value_with(handle),
54
+ Field::ULong(ul) => ul.into_value_with(handle),
55
+ Field::Float16(f) => f32::from(f).into_value_with(handle),
56
+ Field::Float(f) => f.into_value_with(handle),
57
+ Field::Double(d) => d.into_value_with(handle),
58
+ Field::Str(s) => s.into_value_with(handle),
59
+ Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
60
+ Field::Date(d) => d.into_value_with(handle),
61
+ Field::TimestampMillis(ts) => ts.into_value_with(handle),
62
+ Field::TimestampMicros(ts) => ts.into_value_with(handle),
63
+ Field::ListInternal(list) => {
64
+ let elements = list.elements();
65
+ let ary = handle.ary_new_capa(elements.len());
66
+ elements
67
+ .iter()
68
+ .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
69
+ .unwrap();
70
+ ary.into_value_with(handle)
71
+ }
72
+ Field::MapInternal(map) => {
73
+ let entries = map.entries();
74
+ let hash = handle.hash_new_capa(entries.len());
75
+ entries
76
+ .iter()
77
+ .try_for_each(|(k, v)| {
78
+ hash.aset(
79
+ ParquetField(k.clone()).into_value_with(handle),
80
+ ParquetField(v.clone()).into_value_with(handle),
81
+ )
82
+ })
83
+ .unwrap();
84
+ hash.into_value_with(handle)
85
+ }
86
+ Field::Null => handle.qnil().as_value(),
87
+ _ => panic!("Unsupported field type"),
88
+ }
89
+ }
90
+ }
@@ -29,6 +29,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
29
29
  pub struct ParquetArgs {
30
30
  pub to_read: Value,
31
31
  pub result_type: String,
32
+ pub columns: Option<Vec<String>>,
32
33
  }
33
34
 
34
35
  /// Parse common arguments for CSV parsing
@@ -36,8 +37,11 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
36
37
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
37
38
  let (to_read,) = parsed_args.required;
38
39
 
39
- let kwargs =
40
- get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
40
+ let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
41
+ parsed_args.keywords,
42
+ &[],
43
+ &["result_type", "columns"],
44
+ )?;
41
45
 
42
46
  let result_type = match kwargs
43
47
  .optional
@@ -66,5 +70,6 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
66
70
  Ok(ParquetArgs {
67
71
  to_read,
68
72
  result_type,
73
+ columns: kwargs.optional.1,
69
74
  })
70
75
  }
@@ -1,3 +1,3 @@
1
1
module Parquet
  # Gem version; keep in sync with the gemspec and ext/parquet/Cargo.toml.
  VERSION = "0.0.2"
end
data/lib/parquet.rbi CHANGED
@@ -1,17 +1,19 @@
1
1
# typed: strict
module Parquet
  # Iterates over the rows of a Parquet source.
  #
  # Options:
  # - `input`: String specifying the input file
  # - `result_type`: String specifying the output format
  #   ("hash" or "array" or :hash or :array)
  # - `columns`: When present, only the specified columns will be included in the output.
  #   This is useful for reducing how much data is read and improving performance.
  sig do
    params(
      input: T.any(String, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(T.any(Enumerator, T.untyped))
  end
  def self.each_row(input, result_type: nil, columns: nil, &blk)
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-02 00:00:00.000000000 Z
11
+ date: 2025-01-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,10 +56,14 @@ files:
56
56
  - Rakefile
57
57
  - ext/parquet/Cargo.toml
58
58
  - ext/parquet/extconf.rb
59
+ - ext/parquet/src/allocator.rs
60
+ - ext/parquet/src/enumerator.rs
59
61
  - ext/parquet/src/header_cache.rs
60
62
  - ext/parquet/src/lib.rs
61
63
  - ext/parquet/src/reader.rs
64
+ - ext/parquet/src/ruby_integration.rs
62
65
  - ext/parquet/src/ruby_reader.rs
66
+ - ext/parquet/src/types.rs
63
67
  - ext/parquet/src/utils.rs
64
68
  - lib/parquet.rb
65
69
  - lib/parquet.rbi