parquet 0.5.2 → 0.5.4

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -11,12 +11,12 @@ use crate::{
  use crate::{types::PrimitiveType, SchemaNode};
  use arrow_array::{Array, RecordBatch};
  use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
- use std::sync::Arc;
+ use std::{rc::Rc, sync::Arc};

  #[inline]
  pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
  let ruby = unsafe { Ruby::get_unchecked() };
- write_columns_impl(Arc::new(ruby), args).map_err(|e| {
+ write_columns_impl(Rc::new(ruby), args).map_err(|e| {
  let z: MagnusError = e.into();
  z
  })?;
@@ -24,7 +24,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
  }

  #[inline]
- fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+ fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
  let ParquetWriteArgs {
  read_from,
  write_to,
@@ -94,7 +94,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
  };

  if batch_array.len() != schema_len {
- return Err(MagnusError::new(
+ Err(MagnusError::new(
  magnus::exception::type_error(),
  format!(
  "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
@@ -118,7 +118,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
  ))?,
  };
  if top_fields.len() != fields.len() {
- return Err(MagnusError::new(
+ Err(MagnusError::new(
  magnus::exception::runtime_error(),
  "Mismatch top-level DSL fields vs Arrow fields",
  ))?;
@@ -140,31 +140,34 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
  parquet_type,
  // Format is handled internally now
  ..
- } => match parquet_type {
- &PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
- &PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
- &PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
- &PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
- &PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
- &PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
- &PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
- &PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
- &PrimitiveType::Float32 => {
+ } => match *parquet_type {
+ PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
+ PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
+ PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
+ PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
+ PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
+ PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
+ PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
+ PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
+ PrimitiveType::Float32 => {
  PST::Primitive(PrimitiveType::Float32)
  }
- &PrimitiveType::Float64 => {
+ PrimitiveType::Float64 => {
  PST::Primitive(PrimitiveType::Float64)
  }
- &PrimitiveType::String => PST::Primitive(PrimitiveType::String),
- &PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
- &PrimitiveType::Boolean => {
+ PrimitiveType::Decimal128(precision, scale) => {
+ PST::Primitive(PrimitiveType::Decimal128(precision, scale))
+ }
+ PrimitiveType::String => PST::Primitive(PrimitiveType::String),
+ PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
+ PrimitiveType::Boolean => {
  PST::Primitive(PrimitiveType::Boolean)
  }
- &PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
- &PrimitiveType::TimestampMillis => {
+ PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
+ PrimitiveType::TimestampMillis => {
  PST::Primitive(PrimitiveType::TimestampMillis)
  }
- &PrimitiveType::TimestampMicros => {
+ PrimitiveType::TimestampMicros => {
  PST::Primitive(PrimitiveType::TimestampMicros)
  }
  },
@@ -205,12 +208,12 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
  if e.is_kind_of(ruby.exception_stop_iteration()) {
  break;
  }
- return Err(e)?;
+ Err(e)?;
  }
  }
  }
  } else {
- return Err(MagnusError::new(
+ Err(MagnusError::new(
  magnus::exception::type_error(),
  "read_from must be an Enumerator".to_string(),
  ))?;
@@ -16,14 +16,14 @@ use magnus::{
  value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
  };
  use rand::Rng;
- use std::sync::Arc;
+ use std::{rc::Rc, sync::Arc};

  const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;

  #[inline]
  pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
  let ruby = unsafe { Ruby::get_unchecked() };
- write_rows_impl(Arc::new(ruby), args).map_err(|e| {
+ write_rows_impl(Rc::new(ruby), args).map_err(|e| {
  let z: MagnusError = e.into();
  z
  })?;
@@ -31,7 +31,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
  }

  #[inline]
- fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+ fn write_rows_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
  let ParquetWriteArgs {
  read_from,
  write_to,
@@ -83,8 +83,8 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
  })?;
  let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
  size_samples.push(row_size);
- } else if rng.random_range(0..=total_rows) < sample_size as usize {
- let idx = rng.random_range(0..sample_size as usize);
+ } else if rng.random_range(0..=total_rows) < sample_size {
+ let idx = rng.random_range(0..sample_size);
  let row_array = RArray::from_value(row).ok_or_else(|| {
  MagnusError::new(ruby.exception_type_error(), "Row must be an array")
  })?;
@@ -115,12 +115,12 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
  }
  break;
  }
- return Err(e)?;
+ Err(e)?;
  }
  }
  }
  } else {
- return Err(MagnusError::new(
+ Err(MagnusError::new(
  magnus::exception::type_error(),
  "read_from must be an Enumerator".to_string(),
  ))?;
@@ -257,6 +257,7 @@ pub fn estimate_value_size(
  | PST::Primitive(PrimitiveType::UInt64)
  | PST::Primitive(PrimitiveType::Float64) => Ok(8),
  PST::Primitive(PrimitiveType::Boolean) => Ok(1),
+ PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
  PST::Primitive(PrimitiveType::Date32)
  | PST::Primitive(PrimitiveType::TimestampMillis)
  | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
@@ -429,15 +430,13 @@ pub fn estimate_value_size(
  if let Some(field_value) = hash.get(&*field.name) {
  total_fields_size +=
  estimate_value_size(field_value, &field.type_)?;
+ } else if field.nullable {
+ total_fields_size += 0;
  } else {
- if field.nullable {
- total_fields_size += 0;
- } else {
- return Err(MagnusError::new(
- magnus::exception::runtime_error(),
- format!("Missing field: {} in hash {:?}", field.name, hash),
- ));
- }
+ return Err(MagnusError::new(
+ magnus::exception::runtime_error(),
+ format!("Missing field: {} in hash {:?}", field.name, hash),
+ ));
  }
  }
  }
@@ -11,6 +11,9 @@ module Parquet
  # field :id, :int64, nullable: false # ID cannot be null
  # field :name, :string # Default nullable: true
  #
+ # # Decimal field with precision and scale
+ # field :price, :decimal, precision: 10, scale: 2
+ #
  # # List with non-nullable items
  # field :scores, :list, item: :float, item_nullable: false
  #
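As a usage note, the decimal support documented above can be exercised roughly as in the following sketch. Only the field calls are taken from the DSL comments in this diff; the Parquet::Schema.define entry point shown here is an assumption about how the builder is reached.

    # Hedged sketch: defining a schema that uses the new :decimal type.
    schema = Parquet::Schema.define do
      field :id, :int64, nullable: false
      field :name, :string
      # Decimal with explicit precision/scale, backed by Decimal128(10, 2) internally
      field :price, :decimal, precision: 10, scale: 2
    end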
@@ -45,7 +48,7 @@ module Parquet

  # Define a field in the schema
  # @param name [String, Symbol] field name
- # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
+ # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
  # @param nullable [Boolean] whether the field can be null (default: true)
  # @param kwargs [Hash] additional options depending on type
  #
@@ -55,6 +58,7 @@ module Parquet
  # - `key:, value:` if type == :map
  # - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
  # - `format:` if you want to store some format string
+ # - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
  # - `nullable:` default to true if not specified
  def field(name, type, nullable: true, **kwargs, &block)
  field_hash = { name: name.to_s, type: type, nullable: !!nullable }
@@ -73,7 +77,21 @@ module Parquet
  raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
  # Pass item_nullable if provided, otherwise use true as default
  item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]
- field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
+
+ # Pass precision and scale if type is decimal
+ if item_type == :decimal
+ precision = kwargs[:precision]
+ scale = kwargs[:scale]
+ field_hash[:item] = wrap_subtype(
+ item_type,
+ nullable: item_nullable,
+ precision: precision,
+ scale: scale,
+ &block
+ )
+ else
+ field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
+ end
  when :map
  # user must specify key:, value:
  key_type = kwargs[:key]
@@ -82,8 +100,47 @@ module Parquet
  # Pass key_nullable and value_nullable if provided, otherwise use true as default
  key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
  value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
+
  field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
- field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
+
+ # Pass precision and scale if value type is decimal
+ if value_type == :decimal
+ precision = kwargs[:precision]
+ scale = kwargs[:scale]
+ field_hash[:value] = wrap_subtype(
+ value_type,
+ nullable: value_nullable,
+ precision: precision,
+ scale: scale,
+ &block
+ )
+ else
+ field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
+ end
+ when :decimal
+ # Store precision and scale for decimal type according to rules:
+ # 1. When neither precision nor scale is provided, use maximum precision (38)
+ # 2. When only precision is provided, scale defaults to 0
+ # 3. When only scale is provided, use maximum precision (38)
+ # 4. When both are provided, use the provided values
+
+ if kwargs[:precision].nil? && kwargs[:scale].nil?
+ # No precision or scale provided - use maximum precision
+ field_hash[:precision] = 38
+ field_hash[:scale] = 0
+ elsif kwargs[:precision] && kwargs[:scale].nil?
+ # Precision only - scale defaults to 0
+ field_hash[:precision] = kwargs[:precision]
+ field_hash[:scale] = 0
+ elsif kwargs[:precision].nil? && kwargs[:scale]
+ # Scale only - use maximum precision
+ field_hash[:precision] = 38
+ field_hash[:scale] = kwargs[:scale]
+ else
+ # Both provided
+ field_hash[:precision] = kwargs[:precision]
+ field_hash[:scale] = kwargs[:scale]
+ end
  else
  # primitive type: :int32, :int64, :string, etc.
  # do nothing else special
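In practice, the four defaulting rules above resolve as in the following sketch. The field calls mirror the DSL comments elsewhere in this diff, and the annotated results are simply rules 1-4 applied; the field names are illustrative.

    # Hedged sketch of the decimal precision/scale defaults inside a schema block:
    field :a, :decimal                           # -> precision: 38, scale: 0  (rule 1)
    field :b, :decimal, precision: 10            # -> precision: 10, scale: 0  (rule 2)
    field :c, :decimal, scale: 2                 # -> precision: 38, scale: 2  (rule 3)
    field :d, :decimal, precision: 10, scale: 2  # -> precision: 10, scale: 2  (rule 4)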
@@ -122,7 +179,7 @@ module Parquet
  # If user said: field "something", :list, item: :struct do ... end
  # we want to recursively parse that sub-struct from the block.
  # So wrap_subtype might be:
- def wrap_subtype(t, nullable: true, &block)
+ def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
  if t == :struct
  sub_builder = SchemaBuilder.new
  sub_builder.instance_eval(&block) if block
@@ -144,6 +201,34 @@ module Parquet
  end

  { type: :list, nullable: nullable, name: "item", item: sub_builder.fields[0] }
+ elsif t == :decimal
+ # Handle decimal type with precision and scale
+ result = { type: t, nullable: nullable, name: "item" }
+
+ # Follow the same rules as in field() method:
+ # 1. When neither precision nor scale is provided, use maximum precision (38)
+ # 2. When only precision is provided, scale defaults to 0
+ # 3. When only scale is provided, use maximum precision (38)
+ # 4. When both are provided, use the provided values
+ if precision.nil? && scale.nil?
+ # No precision or scale provided - use maximum precision
+ result[:precision] = 38
+ result[:scale] = 0
+ elsif precision && scale.nil?
+ # Precision only - scale defaults to 0
+ result[:precision] = precision
+ result[:scale] = 0
+ elsif precision.nil? && scale
+ # Scale only - use maximum precision
+ result[:precision] = 38
+ result[:scale] = scale
+ else
+ # Both provided
+ result[:precision] = precision
+ result[:scale] = scale
+ end
+
+ result
  else
  # e.g. :int32 => { type: :int32, nullable: true }
  { type: t, nullable: nullable, name: "item" }
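Since field() now forwards precision: and scale: into wrap_subtype for decimal list items and map values, nested decimals can be declared roughly as below. This is a hedged sketch based only on the DSL comments in this diff; the field names are illustrative.

    # Hedged sketch: decimals nested inside list and map fields.
    field :prices, :list, item: :decimal, precision: 10, scale: 2
    field :rates,  :map,  key: :string, value: :decimal, precision: 38, scale: 9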
@@ -1,3 +1,3 @@
  module Parquet
- VERSION = "0.5.2"
+ VERSION = "0.5.4"
  end
data/lib/parquet.rbi CHANGED
@@ -1,6 +1,17 @@
  # typed: true

  module Parquet
+ # Returns metadata information about a Parquet file
+ #
+ # The returned hash contains information about:
+ # - Basic file metadata (num_rows, created_by)
+ # - Schema information (fields, types, etc.)
+ # - Row group details
+ # - Column chunk information (compression, encodings, statistics)
+ sig { params(path: String).returns(T::Hash[String, T.untyped]) }
+ def self.metadata(path)
+ end
+
  # Options:
  # - `input`: String, File, or IO object containing parquet data
  # - `result_type`: String specifying the output format
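The newly documented Parquet.metadata helper can be called as in this sketch. The file path is hypothetical, and beyond the num_rows and created_by entries named in the signature's documentation, the exact key names of the returned hash are assumptions rather than something this diff specifies.

    # Hedged sketch: inspecting a Parquet file with the newly documented helper.
    info = Parquet.metadata("data/example.parquet")  # hypothetical path
    info["num_rows"]    # total row count, per the documented contents
    info["created_by"]  # writer identification string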
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
- version: 0.5.2
+ version: 0.5.4
  platform: ruby
  authors:
  - Nathan Jaremko
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-03-17 00:00:00.000000000 Z
+ date: 2025-04-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rb_sys
@@ -55,6 +55,7 @@ files:
  - README.md
  - Rakefile
  - ext/parquet/Cargo.toml
+ - ext/parquet/build.rs
  - ext/parquet/extconf.rb
  - ext/parquet/src/allocator.rs
  - ext/parquet/src/enumerator.rs