parquet 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/Cargo.lock +57 -0
- data/Gemfile +1 -1
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +5 -0
- data/ext/parquet/src/enumerator.rs +32 -6
- data/ext/parquet/src/header_cache.rs +85 -28
- data/ext/parquet/src/lib.rs +2 -1
- data/ext/parquet/src/reader.rs +218 -13
- data/ext/parquet/src/types.rs +647 -15
- data/ext/parquet/src/utils.rs +57 -3
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +22 -3
- metadata +2 -2
data/ext/parquet/src/utils.rs
CHANGED
@@ -26,14 +26,14 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
 }
 
 #[derive(Debug)]
-pub struct ParquetArgs {
+pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: String,
     pub columns: Option<Vec<String>>,
 }
 
 /// Parse common arguments for CSV parsing
-pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
+pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
@@ -67,9 +67,63 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
         None => String::from("hash"),
     };
 
-    Ok(ParquetArgs {
+    Ok(ParquetRowsArgs {
         to_read,
         result_type,
         columns: kwargs.optional.1,
     })
 }
+
+#[derive(Debug)]
+pub struct ParquetColumnsArgs {
+    pub to_read: Value,
+    pub result_type: String,
+    pub columns: Option<Vec<String>>,
+    pub batch_size: Option<usize>,
+}
+
+/// Parse common arguments for CSV parsing
+pub fn parse_parquet_columns_args(
+    ruby: &Ruby,
+    args: &[Value],
+) -> Result<ParquetColumnsArgs, Error> {
+    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+    let (to_read,) = parsed_args.required;
+
+    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
+        parsed_args.keywords,
+        &[],
+        &["result_type", "columns", "batch_size"],
+    )?;
+
+    let result_type = match kwargs
+        .optional
+        .0
+        .map(|value| parse_string_or_symbol(ruby, value))
+    {
+        Some(Ok(Some(parsed))) => match parsed.as_str() {
+            "hash" | "array" => parsed,
+            _ => {
+                return Err(Error::new(
+                    magnus::exception::runtime_error(),
+                    "result_type must be either 'hash' or 'array'",
+                ))
+            }
+        },
+        Some(Ok(None)) => String::from("hash"),
+        Some(Err(_)) => {
+            return Err(Error::new(
+                magnus::exception::type_error(),
+                "result_type must be a String or Symbol",
+            ))
+        }
+        None => String::from("hash"),
+    };
+
+    Ok(ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns: kwargs.optional.1,
+        batch_size: kwargs.optional.2,
+    })
+}
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,19 +1,38 @@
 # typed: strict
 module Parquet
   # Options:
-  #   - `input`: String
+  #   - `input`: String, File, or IO object containing parquet data
   #   - `result_type`: String specifying the output format
   #     ("hash" or "array" or :hash or :array)
   #   - `columns`: When present, only the specified columns will be included in the output.
   #     This is useful for reducing how much data is read and improving performance.
   sig do
     params(
-      input: T.any(String, IO),
+      input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(T.any(Enumerator,
+    ).returns(T.any(Enumerator, NilClass))
   end
   def self.each_row(input, result_type: nil, columns: nil, &blk)
   end
+
+  # Options:
+  #   - `input`: String, File, or IO object containing parquet data
+  #   - `result_type`: String specifying the output format
+  #     ("hash" or "array" or :hash or :array)
+  #   - `columns`: When present, only the specified columns will be included in the output.
+  #   - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer),
+      blk:
+        T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
+    ).returns(T.any(Enumerator, NilClass))
+  end
+  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
+  end
 end
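
Per these signatures, both methods yield to the block when one is given (returning nil) and otherwise return an Enumerator. A short usage sketch of the shapes the new `each_column` signature describes (file and column names are illustrative):

```ruby
require "parquet"

# With result_type "hash" (the default), each batch maps
# column name => array of values for that batch of rows.
Parquet.each_column("example.parquet", columns: ["id", "name"], batch_size: 500) do |batch|
  batch.each { |column, values| puts "#{column}: #{values.length} values" }
end

# With no block, an Enumerator is returned, so batches can be consumed lazily.
enum = Parquet.each_column("example.parquet", result_type: :array)
first_batch = enum.first
```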
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys