parquet 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,18 +26,22 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
26
26
  }
27
27
 
28
28
  #[derive(Debug)]
29
- pub struct ParquetArgs {
29
+ pub struct ParquetRowsArgs {
30
30
  pub to_read: Value,
31
31
  pub result_type: String,
32
+ pub columns: Option<Vec<String>>,
32
33
  }
33
34
 
34
35
  /// Parse the arguments for reading Parquet rows (input plus the optional `result_type` and `columns` keywords)
35
- pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
36
+ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRowsArgs, Error> {
36
37
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
37
38
  let (to_read,) = parsed_args.required;
38
39
 
39
- let kwargs =
40
- get_kwargs::<_, (), (Option<Value>,), ()>(parsed_args.keywords, &[], &["result_type"])?;
40
+ let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
41
+ parsed_args.keywords,
42
+ &[],
43
+ &["result_type", "columns"],
44
+ )?;
41
45
 
42
46
  let result_type = match kwargs
43
47
  .optional
@@ -63,8 +67,63 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
63
67
  None => String::from("hash"),
64
68
  };
65
69
 
66
- Ok(ParquetArgs {
70
+ Ok(ParquetRowsArgs {
67
71
  to_read,
68
72
  result_type,
73
+ columns: kwargs.optional.1,
74
+ })
75
+ }
76
+
77
+ #[derive(Debug)]
78
+ pub struct ParquetColumnsArgs {
79
+ pub to_read: Value,
80
+ pub result_type: String,
81
+ pub columns: Option<Vec<String>>,
82
+ pub batch_size: Option<usize>,
83
+ }
84
+
85
+ /// Parse the arguments for reading Parquet columns (input plus the optional `result_type`, `columns` and `batch_size` keywords)
86
+ pub fn parse_parquet_columns_args(
87
+ ruby: &Ruby,
88
+ args: &[Value],
89
+ ) -> Result<ParquetColumnsArgs, Error> {
90
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
91
+ let (to_read,) = parsed_args.required;
92
+
93
+ let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>, Option<usize>), ()>(
94
+ parsed_args.keywords,
95
+ &[],
96
+ &["result_type", "columns", "batch_size"],
97
+ )?;
98
+
99
+ let result_type = match kwargs
100
+ .optional
101
+ .0
102
+ .map(|value| parse_string_or_symbol(ruby, value))
103
+ {
104
+ Some(Ok(Some(parsed))) => match parsed.as_str() {
105
+ "hash" | "array" => parsed,
106
+ _ => {
107
+ return Err(Error::new(
108
+ magnus::exception::runtime_error(),
109
+ "result_type must be either 'hash' or 'array'",
110
+ ))
111
+ }
112
+ },
113
+ Some(Ok(None)) => String::from("hash"),
114
+ Some(Err(_)) => {
115
+ return Err(Error::new(
116
+ magnus::exception::type_error(),
117
+ "result_type must be a String or Symbol",
118
+ ))
119
+ }
120
+ None => String::from("hash"),
121
+ };
122
+
123
+ Ok(ParquetColumnsArgs {
124
+ to_read,
125
+ result_type,
126
+ columns: kwargs.optional.1,
127
+ batch_size: kwargs.optional.2,
69
128
  })
70
129
  }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,17 +1,38 @@
1
1
  # typed: strict
2
-
3
2
  module Parquet
4
3
  # Options:
5
- # - `input`: String specifying the input file
4
+ # - `input`: String, File, or IO object containing parquet data
6
5
  # - `result_type`: String specifying the output format
7
6
  # ("hash" or "array" or :hash or :array)
7
+ # - `columns`: When present, only the specified columns will be included in the output.
8
+ # This is useful for reducing how much data is read and improving performance.
8
9
  sig do
9
10
  params(
10
- input: T.any(String, IO),
11
+ input: T.any(String, File, StringIO, IO),
11
12
  result_type: T.nilable(T.any(String, Symbol)),
13
+ columns: T.nilable(T::Array[String]),
12
14
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
13
- ).returns(T.any(Enumerator, T.untyped))
15
+ ).returns(T.any(Enumerator, NilClass))
16
+ end
17
+ def self.each_row(input, result_type: nil, columns: nil, &blk)
18
+ end
19
+
20
+ # Options:
21
+ # - `input`: String, File, or IO object containing parquet data
22
+ # - `result_type`: String specifying the output format
23
+ # ("hash" or "array" or :hash or :array)
24
+ # - `columns`: When present, only the specified columns will be included in the output.
25
+ # - `batch_size`: When present, specifies the number of rows per batch
26
+ sig do
27
+ params(
28
+ input: T.any(String, File, StringIO, IO),
29
+ result_type: T.nilable(T.any(String, Symbol)),
30
+ columns: T.nilable(T::Array[String]),
31
+ batch_size: T.nilable(Integer),
32
+ blk:
33
+ T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
34
+ ).returns(T.any(Enumerator, NilClass))
14
35
  end
15
- def self.each_row(input, result_type: nil, &blk)
36
+ def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
16
37
  end
17
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-02 00:00:00.000000000 Z
11
+ date: 2025-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -56,10 +56,14 @@ files:
56
56
  - Rakefile
57
57
  - ext/parquet/Cargo.toml
58
58
  - ext/parquet/extconf.rb
59
+ - ext/parquet/src/allocator.rs
60
+ - ext/parquet/src/enumerator.rs
59
61
  - ext/parquet/src/header_cache.rs
60
62
  - ext/parquet/src/lib.rs
61
63
  - ext/parquet/src/reader.rs
64
+ - ext/parquet/src/ruby_integration.rs
62
65
  - ext/parquet/src/ruby_reader.rs
66
+ - ext/parquet/src/types.rs
63
67
  - ext/parquet/src/utils.rs
64
68
  - lib/parquet.rb
65
69
  - lib/parquet.rbi