parquet 0.0.1 → 0.0.3

ext/parquet/src/utils.rs CHANGED
@@ -26,18 +26,22 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
 }
 
 #[derive(Debug)]
-pub struct ParquetArgs {
+pub struct ParquetRowsArgs {
     pub to_read: Value,
     pub result_type: String,
+    pub columns: Option<Vec<String>>,
 }
 
 /// Parse common arguments for Parquet row iteration
-pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
+pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (to_read,) = parsed_args.required;
 
-    let kwargs =
-        get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
+    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
+        parsed_args.keywords,
+        &[],
+        &["result_type", "columns"],
+    )?;
 
     let result_type = match kwargs
         .optional
@@ -63,8 +67,63 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
         None => String::from("hash"),
     };
 
-    Ok(ParquetArgs {
+    Ok(ParquetRowsArgs {
         to_read,
         result_type,
+        columns: kwargs.optional.1,
+    })
+}
+
+#[derive(Debug)]
+pub struct ParquetColumnsArgs {
+    pub to_read: Value,
+    pub result_type: String,
+    pub columns: Option<Vec<String>>,
+    pub batch_size: Option<usize>,
+}
+
+/// Parse common arguments for Parquet column iteration
+pub fn parse_parquet_columns_args(
+    ruby: &Ruby,
+    args: &[Value],
+) -> Result<ParquetColumnsArgs, Error> {
+    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+    let (to_read,) = parsed_args.required;
+
+    let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
+        parsed_args.keywords,
+        &[],
+        &["result_type", "columns", "batch_size"],
+    )?;
+
+    let result_type = match kwargs
+        .optional
+        .0
+        .map(|value| parse_string_or_symbol(ruby, value))
+    {
+        Some(Ok(Some(parsed))) => match parsed.as_str() {
+            "hash" | "array" => parsed,
+            _ => {
+                return Err(Error::new(
+                    magnus::exception::runtime_error(),
+                    "result_type must be either 'hash' or 'array'",
+                ))
+            }
+        },
+        Some(Ok(None)) => String::from("hash"),
+        Some(Err(_)) => {
+            return Err(Error::new(
+                magnus::exception::type_error(),
+                "result_type must be a String or Symbol",
+            ))
+        }
+        None => String::from("hash"),
+    };
+
+    Ok(ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns: kwargs.optional.1,
+        batch_size: kwargs.optional.2,
     })
 }
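Taken together, `parse_parquet_rows_args` backs the Ruby-level row reader: `result_type` and `columns` arrive as keyword arguments, an unrecognized `result_type` raises a `RuntimeError`, and a non-String/Symbol value raises a `TypeError`. A minimal usage sketch from the Ruby side (the file and column names here are hypothetical):

```ruby
require "parquet"

# result_type defaults to "hash"; columns: limits the read to the listed columns.
Parquet.each_row("events.parquet", columns: ["id", "name"]) do |row|
  puts row["id"]
end

# result_type: :array yields each row as an Array of values instead of a Hash.
enum = Parquet.each_row("events.parquet", result_type: :array) # no block: returns an Enumerator

# Parquet.each_row("events.parquet", result_type: :tuple) # => RuntimeError
```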
data/lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.0.1"
+  VERSION = "0.0.3"
 end
data/lib/parquet.rbi CHANGED
@@ -1,17 +1,38 @@
 # typed: strict
-
 module Parquet
   # Options:
-  #   - `input`: String specifying the input file
+  #   - `input`: String, File, or IO object containing parquet data
   #   - `result_type`: String specifying the output format
   #     ("hash" or "array" or :hash or :array)
+  #   - `columns`: When present, only the specified columns will be included in the output.
+  #     This is useful for reducing how much data is read and improving performance.
   sig do
     params(
-      input: T.any(String, IO),
+      input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-    ).returns(T.any(Enumerator, T.untyped))
+    ).returns(T.any(Enumerator, NilClass))
+  end
+  def self.each_row(input, result_type: nil, columns: nil, &blk)
+  end
+
+  # Options:
+  #   - `input`: String, File, or IO object containing parquet data
+  #   - `result_type`: String specifying the output format
+  #     ("hash" or "array" or :hash or :array)
+  #   - `columns`: When present, only the specified columns will be included in the output.
+  #   - `batch_size`: When present, specifies the number of rows per batch
+  sig do
+    params(
+      input: T.any(String, File, StringIO, IO),
+      result_type: T.nilable(T.any(String, Symbol)),
+      columns: T.nilable(T::Array[String]),
+      batch_size: T.nilable(Integer),
+      blk:
+        T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
+    ).returns(T.any(Enumerator, NilClass))
   end
-  def self.each_row(input, result_type: nil, &blk)
+  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
   end
 end
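A matching sketch for the new column-oriented reader (again with hypothetical names); per the signature above, with the default `result_type` of `"hash"` each yielded batch maps column names to arrays of values:

```ruby
require "parquet"

# Read column batches of at most 1024 rows each, restricted to two columns.
Parquet.each_column("events.parquet", batch_size: 1024, columns: ["id", "name"]) do |batch|
  # batch is shaped like { "id" => [...], "name" => [...] } for the "hash" result_type.
  batch.each { |column, values| puts "#{column}: #{values.length} values" }
end
```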
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.3
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-02 00:00:00.000000000 Z
+date: 2025-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -56,10 +56,14 @@ files:
 - Rakefile
 - ext/parquet/Cargo.toml
 - ext/parquet/extconf.rb
+- ext/parquet/src/allocator.rs
+- ext/parquet/src/enumerator.rs
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
 - ext/parquet/src/reader.rs
+- ext/parquet/src/ruby_integration.rs
 - ext/parquet/src/ruby_reader.rs
+- ext/parquet/src/types.rs
 - ext/parquet/src/utils.rs
 - lib/parquet.rb
 - lib/parquet.rbi