parquet 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
data/ext/parquet/src/utils.rs
CHANGED
@@ -26,18 +26,22 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
|
|
26
26
|
}
|
27
27
|
|
28
28
|
#[derive(Debug)]
|
29
|
-
pub struct ParquetArgs {
|
29
|
+
pub struct ParquetRowsArgs {
|
30
30
|
pub to_read: Value,
|
31
31
|
pub result_type: String,
|
32
|
+
pub columns: Option<Vec<String>>,
|
32
33
|
}
|
33
34
|
|
34
35
|
/// Parse common arguments for CSV parsing
|
35
|
-
pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
|
36
|
+
pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
|
36
37
|
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
37
38
|
let (to_read,) = parsed_args.required;
|
38
39
|
|
39
|
-
let kwargs =
|
40
|
-
|
40
|
+
let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
|
41
|
+
parsed_args.keywords,
|
42
|
+
&[],
|
43
|
+
&["result_type", "columns"],
|
44
|
+
)?;
|
41
45
|
|
42
46
|
let result_type = match kwargs
|
43
47
|
.optional
|
@@ -63,8 +67,63 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
|
|
63
67
|
None => String::from("hash"),
|
64
68
|
};
|
65
69
|
|
66
|
-
Ok(ParquetArgs {
|
70
|
+
Ok(ParquetRowsArgs {
|
67
71
|
to_read,
|
68
72
|
result_type,
|
73
|
+
columns: kwargs.optional.1,
|
74
|
+
})
|
75
|
+
}
|
76
|
+
|
77
|
+
#[derive(Debug)]
|
78
|
+
pub struct ParquetColumnsArgs {
|
79
|
+
pub to_read: Value,
|
80
|
+
pub result_type: String,
|
81
|
+
pub columns: Option<Vec<String>>,
|
82
|
+
pub batch_size: Option<usize>,
|
83
|
+
}
|
84
|
+
|
85
|
+
/// Parse common arguments for CSV parsing
|
86
|
+
pub fn parse_parquet_columns_args(
|
87
|
+
ruby: &Ruby,
|
88
|
+
args: &[Value],
|
89
|
+
) -> Result<ParquetColumnsArgs, Error> {
|
90
|
+
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
91
|
+
let (to_read,) = parsed_args.required;
|
92
|
+
|
93
|
+
let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
|
94
|
+
parsed_args.keywords,
|
95
|
+
&[],
|
96
|
+
&["result_type", "columns", "batch_size"],
|
97
|
+
)?;
|
98
|
+
|
99
|
+
let result_type = match kwargs
|
100
|
+
.optional
|
101
|
+
.0
|
102
|
+
.map(|value| parse_string_or_symbol(ruby, value))
|
103
|
+
{
|
104
|
+
Some(Ok(Some(parsed))) => match parsed.as_str() {
|
105
|
+
"hash" | "array" => parsed,
|
106
|
+
_ => {
|
107
|
+
return Err(Error::new(
|
108
|
+
magnus::exception::runtime_error(),
|
109
|
+
"result_type must be either 'hash' or 'array'",
|
110
|
+
))
|
111
|
+
}
|
112
|
+
},
|
113
|
+
Some(Ok(None)) => String::from("hash"),
|
114
|
+
Some(Err(_)) => {
|
115
|
+
return Err(Error::new(
|
116
|
+
magnus::exception::type_error(),
|
117
|
+
"result_type must be a String or Symbol",
|
118
|
+
))
|
119
|
+
}
|
120
|
+
None => String::from("hash"),
|
121
|
+
};
|
122
|
+
|
123
|
+
Ok(ParquetColumnsArgs {
|
124
|
+
to_read,
|
125
|
+
result_type,
|
126
|
+
columns: kwargs.optional.1,
|
127
|
+
batch_size: kwargs.optional.2,
|
69
128
|
})
|
70
129
|
}
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,17 +1,38 @@
|
|
1
1
|
# typed: strict
|
2
|
-
|
3
2
|
module Parquet
|
4
3
|
# Options:
|
5
|
-
# - `input`: String
|
4
|
+
# - `input`: String, File, or IO object containing parquet data
|
6
5
|
# - `result_type`: String specifying the output format
|
7
6
|
# ("hash" or "array" or :hash or :array)
|
7
|
+
# - `columns`: When present, only the specified columns will be included in the output.
|
8
|
+
# This is useful for reducing how much data is read and improving performance.
|
8
9
|
sig do
|
9
10
|
params(
|
10
|
-
input: T.any(String, IO),
|
11
|
+
input: T.any(String, File, StringIO, IO),
|
11
12
|
result_type: T.nilable(T.any(String, Symbol)),
|
13
|
+
columns: T.nilable(T::Array[String]),
|
12
14
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
|
13
|
-
).returns(T.any(Enumerator,
|
15
|
+
).returns(T.any(Enumerator, NilClass))
|
16
|
+
end
|
17
|
+
def self.each_row(input, result_type: nil, columns: nil, &blk)
|
18
|
+
end
|
19
|
+
|
20
|
+
# Options:
|
21
|
+
# - `input`: String, File, or IO object containing parquet data
|
22
|
+
# - `result_type`: String specifying the output format
|
23
|
+
# ("hash" or "array" or :hash or :array)
|
24
|
+
# - `columns`: When present, only the specified columns will be included in the output.
|
25
|
+
# - `batch_size`: When present, specifies the number of rows per batch
|
26
|
+
sig do
|
27
|
+
params(
|
28
|
+
input: T.any(String, File, StringIO, IO),
|
29
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
30
|
+
columns: T.nilable(T::Array[String]),
|
31
|
+
batch_size: T.nilable(Integer),
|
32
|
+
blk:
|
33
|
+
T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
|
34
|
+
).returns(T.any(Enumerator, NilClass))
|
14
35
|
end
|
15
|
-
def self.
|
36
|
+
def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
|
16
37
|
end
|
17
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -56,10 +56,14 @@ files:
|
|
56
56
|
- Rakefile
|
57
57
|
- ext/parquet/Cargo.toml
|
58
58
|
- ext/parquet/extconf.rb
|
59
|
+
- ext/parquet/src/allocator.rs
|
60
|
+
- ext/parquet/src/enumerator.rs
|
59
61
|
- ext/parquet/src/header_cache.rs
|
60
62
|
- ext/parquet/src/lib.rs
|
61
63
|
- ext/parquet/src/reader.rs
|
64
|
+
- ext/parquet/src/ruby_integration.rs
|
62
65
|
- ext/parquet/src/ruby_reader.rs
|
66
|
+
- ext/parquet/src/types.rs
|
63
67
|
- ext/parquet/src/utils.rs
|
64
68
|
- lib/parquet.rb
|
65
69
|
- lib/parquet.rbi
|