parquet 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,12 +11,12 @@ use crate::{
11
11
  use crate::{types::PrimitiveType, SchemaNode};
12
12
  use arrow_array::{Array, RecordBatch};
13
13
  use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
14
- use std::sync::Arc;
14
+ use std::{rc::Rc, sync::Arc};
15
15
 
16
16
  #[inline]
17
17
  pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
18
18
  let ruby = unsafe { Ruby::get_unchecked() };
19
- write_columns_impl(Arc::new(ruby), args).map_err(|e| {
19
+ write_columns_impl(Rc::new(ruby), args).map_err(|e| {
20
20
  let z: MagnusError = e.into();
21
21
  z
22
22
  })?;
@@ -24,7 +24,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
24
24
  }
25
25
 
26
26
  #[inline]
27
- fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
27
+ fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
28
28
  let ParquetWriteArgs {
29
29
  read_from,
30
30
  write_to,
@@ -94,7 +94,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
94
94
  };
95
95
 
96
96
  if batch_array.len() != schema_len {
97
- return Err(MagnusError::new(
97
+ Err(MagnusError::new(
98
98
  magnus::exception::type_error(),
99
99
  format!(
100
100
  "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
@@ -118,7 +118,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
118
118
  ))?,
119
119
  };
120
120
  if top_fields.len() != fields.len() {
121
- return Err(MagnusError::new(
121
+ Err(MagnusError::new(
122
122
  magnus::exception::runtime_error(),
123
123
  "Mismatch top-level DSL fields vs Arrow fields",
124
124
  ))?;
@@ -140,31 +140,34 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
140
140
  parquet_type,
141
141
  // Format is handled internally now
142
142
  ..
143
- } => match parquet_type {
144
- &PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
145
- &PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
146
- &PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
147
- &PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
148
- &PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
149
- &PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
150
- &PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
151
- &PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
152
- &PrimitiveType::Float32 => {
143
+ } => match *parquet_type {
144
+ PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
145
+ PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
146
+ PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
147
+ PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
148
+ PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
149
+ PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
150
+ PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
151
+ PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
152
+ PrimitiveType::Float32 => {
153
153
  PST::Primitive(PrimitiveType::Float32)
154
154
  }
155
- &PrimitiveType::Float64 => {
155
+ PrimitiveType::Float64 => {
156
156
  PST::Primitive(PrimitiveType::Float64)
157
157
  }
158
- &PrimitiveType::String => PST::Primitive(PrimitiveType::String),
159
- &PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
160
- &PrimitiveType::Boolean => {
158
+ PrimitiveType::Decimal128(precision, scale) => {
159
+ PST::Primitive(PrimitiveType::Decimal128(precision, scale))
160
+ }
161
+ PrimitiveType::String => PST::Primitive(PrimitiveType::String),
162
+ PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
163
+ PrimitiveType::Boolean => {
161
164
  PST::Primitive(PrimitiveType::Boolean)
162
165
  }
163
- &PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
164
- &PrimitiveType::TimestampMillis => {
166
+ PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
167
+ PrimitiveType::TimestampMillis => {
165
168
  PST::Primitive(PrimitiveType::TimestampMillis)
166
169
  }
167
- &PrimitiveType::TimestampMicros => {
170
+ PrimitiveType::TimestampMicros => {
168
171
  PST::Primitive(PrimitiveType::TimestampMicros)
169
172
  }
170
173
  },
@@ -205,12 +208,12 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
205
208
  if e.is_kind_of(ruby.exception_stop_iteration()) {
206
209
  break;
207
210
  }
208
- return Err(e)?;
211
+ Err(e)?;
209
212
  }
210
213
  }
211
214
  }
212
215
  } else {
213
- return Err(MagnusError::new(
216
+ Err(MagnusError::new(
214
217
  magnus::exception::type_error(),
215
218
  "read_from must be an Enumerator".to_string(),
216
219
  ))?;
@@ -1,7 +1,7 @@
1
1
  use super::{
2
2
  build_column_collectors_from_dsl, copy_temp_file_to_io_like, create_writer,
3
3
  parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE,
4
- MIN_SAMPLES_FOR_ESTIMATE, SAMPLE_SIZE,
4
+ SAMPLE_SIZE,
5
5
  };
6
6
  use crate::{
7
7
  logger::RubyLogger,
@@ -16,12 +16,14 @@ use magnus::{
16
16
  value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
17
17
  };
18
18
  use rand::Rng;
19
- use std::sync::Arc;
19
+ use std::{rc::Rc, sync::Arc};
20
+
21
+ const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
20
22
 
21
23
  #[inline]
22
24
  pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
23
25
  let ruby = unsafe { Ruby::get_unchecked() };
24
- write_rows_impl(Arc::new(ruby), args).map_err(|e| {
26
+ write_rows_impl(Rc::new(ruby), args).map_err(|e| {
25
27
  let z: MagnusError = e.into();
26
28
  z
27
29
  })?;
@@ -29,7 +31,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
29
31
  }
30
32
 
31
33
  #[inline]
32
- fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
34
+ fn write_rows_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
33
35
  let ParquetWriteArgs {
34
36
  read_from,
35
37
  write_to,
@@ -81,8 +83,8 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
81
83
  })?;
82
84
  let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
83
85
  size_samples.push(row_size);
84
- } else if rng.random_range(0..=total_rows) < sample_size as usize {
85
- let idx = rng.random_range(0..sample_size as usize);
86
+ } else if rng.random_range(0..=total_rows) < sample_size {
87
+ let idx = rng.random_range(0..sample_size);
86
88
  let row_array = RArray::from_value(row).ok_or_else(|| {
87
89
  MagnusError::new(ruby.exception_type_error(), "Row must be an array")
88
90
  })?;
@@ -113,12 +115,12 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
113
115
  }
114
116
  break;
115
117
  }
116
- return Err(e)?;
118
+ Err(e)?;
117
119
  }
118
120
  }
119
121
  }
120
122
  } else {
121
- return Err(MagnusError::new(
123
+ Err(MagnusError::new(
122
124
  magnus::exception::type_error(),
123
125
  "read_from must be an Enumerator".to_string(),
124
126
  ))?;
@@ -255,6 +257,7 @@ pub fn estimate_value_size(
255
257
  | PST::Primitive(PrimitiveType::UInt64)
256
258
  | PST::Primitive(PrimitiveType::Float64) => Ok(8),
257
259
  PST::Primitive(PrimitiveType::Boolean) => Ok(1),
260
+ PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
258
261
  PST::Primitive(PrimitiveType::Date32)
259
262
  | PST::Primitive(PrimitiveType::TimestampMillis)
260
263
  | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
@@ -427,15 +430,13 @@ pub fn estimate_value_size(
427
430
  if let Some(field_value) = hash.get(&*field.name) {
428
431
  total_fields_size +=
429
432
  estimate_value_size(field_value, &field.type_)?;
433
+ } else if field.nullable {
434
+ total_fields_size += 0;
430
435
  } else {
431
- if field.nullable {
432
- total_fields_size += 0;
433
- } else {
434
- return Err(MagnusError::new(
435
- magnus::exception::runtime_error(),
436
- format!("Missing field: {} in hash {:?}", field.name, hash),
437
- ));
438
- }
436
+ return Err(MagnusError::new(
437
+ magnus::exception::runtime_error(),
438
+ format!("Missing field: {} in hash {:?}", field.name, hash),
439
+ ));
439
440
  }
440
441
  }
441
442
  }
@@ -11,6 +11,9 @@ module Parquet
11
11
  # field :id, :int64, nullable: false # ID cannot be null
12
12
  # field :name, :string # Default nullable: true
13
13
  #
14
+ # # Decimal field with precision and scale
15
+ # field :price, :decimal, precision: 10, scale: 2
16
+ #
14
17
  # # List with non-nullable items
15
18
  # field :scores, :list, item: :float, item_nullable: false
16
19
  #
@@ -45,7 +48,7 @@ module Parquet
45
48
 
46
49
  # Define a field in the schema
47
50
  # @param name [String, Symbol] field name
48
- # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
51
+ # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
49
52
  # @param nullable [Boolean] whether the field can be null (default: true)
50
53
  # @param kwargs [Hash] additional options depending on type
51
54
  #
@@ -55,6 +58,7 @@ module Parquet
55
58
  # - `key:, value:` if type == :map
56
59
  # - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
57
60
  # - `format:` if you want to store some format string
61
+ # - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
58
62
  # - `nullable:` default to true if not specified
59
63
  def field(name, type, nullable: true, **kwargs, &block)
60
64
  field_hash = { name: name.to_s, type: type, nullable: !!nullable }
@@ -73,7 +77,15 @@ module Parquet
73
77
  raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
74
78
  # Pass item_nullable if provided, otherwise use true as default
75
79
  item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]
76
- field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
80
+
81
+ # Pass precision and scale if type is decimal
82
+ if item_type == :decimal
83
+ precision = kwargs[:precision]
84
+ scale = kwargs[:scale]
85
+ field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, precision: precision, scale: scale, &block)
86
+ else
87
+ field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
88
+ end
77
89
  when :map
78
90
  # user must specify key:, value:
79
91
  key_type = kwargs[:key]
@@ -82,8 +94,41 @@ module Parquet
82
94
  # Pass key_nullable and value_nullable if provided, otherwise use true as default
83
95
  key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
84
96
  value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
97
+
85
98
  field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
86
- field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
99
+
100
+ # Pass precision and scale if value type is decimal
101
+ if value_type == :decimal
102
+ precision = kwargs[:precision]
103
+ scale = kwargs[:scale]
104
+ field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, precision: precision, scale: scale, &block)
105
+ else
106
+ field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
107
+ end
108
+ when :decimal
109
+ # Store precision and scale for decimal type according to rules:
110
+ # 1. When neither precision nor scale is provided, use maximum precision (38)
111
+ # 2. When only precision is provided, scale defaults to 0
112
+ # 3. When only scale is provided, use maximum precision (38)
113
+ # 4. When both are provided, use the provided values
114
+
115
+ if kwargs[:precision].nil? && kwargs[:scale].nil?
116
+ # No precision or scale provided - use maximum precision
117
+ field_hash[:precision] = 38
118
+ field_hash[:scale] = 0
119
+ elsif kwargs[:precision] && kwargs[:scale].nil?
120
+ # Precision only - scale defaults to 0
121
+ field_hash[:precision] = kwargs[:precision]
122
+ field_hash[:scale] = 0
123
+ elsif kwargs[:precision].nil? && kwargs[:scale]
124
+ # Scale only - use maximum precision
125
+ field_hash[:precision] = 38
126
+ field_hash[:scale] = kwargs[:scale]
127
+ else
128
+ # Both provided
129
+ field_hash[:precision] = kwargs[:precision]
130
+ field_hash[:scale] = kwargs[:scale]
131
+ end
87
132
  else
88
133
  # primitive type: :int32, :int64, :string, etc.
89
134
  # do nothing else special
@@ -122,7 +167,7 @@ module Parquet
122
167
  # If user said: field "something", :list, item: :struct do ... end
123
168
  # we want to recursively parse that sub-struct from the block.
124
169
  # So wrap_subtype might be:
125
- def wrap_subtype(t, nullable: true, &block)
170
+ def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
126
171
  if t == :struct
127
172
  sub_builder = SchemaBuilder.new
128
173
  sub_builder.instance_eval(&block) if block
@@ -144,6 +189,34 @@ module Parquet
144
189
  end
145
190
 
146
191
  { type: :list, nullable: nullable, name: "item", item: sub_builder.fields[0] }
192
+ elsif t == :decimal
193
+ # Handle decimal type with precision and scale
194
+ result = { type: t, nullable: nullable, name: "item" }
195
+
196
+ # Follow the same rules as in field() method:
197
+ # 1. When neither precision nor scale is provided, use maximum precision (38)
198
+ # 2. When only precision is provided, scale defaults to 0
199
+ # 3. When only scale is provided, use maximum precision (38)
200
+ # 4. When both are provided, use the provided values
201
+ if precision.nil? && scale.nil?
202
+ # No precision or scale provided - use maximum precision
203
+ result[:precision] = 38
204
+ result[:scale] = 0
205
+ elsif precision && scale.nil?
206
+ # Precision only - scale defaults to 0
207
+ result[:precision] = precision
208
+ result[:scale] = 0
209
+ elsif precision.nil? && scale
210
+ # Scale only - use maximum precision
211
+ result[:precision] = 38
212
+ result[:scale] = scale
213
+ else
214
+ # Both provided
215
+ result[:precision] = precision
216
+ result[:scale] = scale
217
+ end
218
+
219
+ result
147
220
  else
148
221
  # e.g. :int32 => { type: :int32, nullable: true }
149
222
  { type: t, nullable: nullable, name: "item" }
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.3"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -1,6 +1,17 @@
1
1
  # typed: true
2
2
 
3
3
  module Parquet
4
+ # Returns metadata information about a Parquet file
5
+ #
6
+ # The returned hash contains information about:
7
+ # - Basic file metadata (num_rows, created_by)
8
+ # - Schema information (fields, types, etc.)
9
+ # - Row group details
10
+ # - Column chunk information (compression, encodings, statistics)
11
+ sig { params(path: String).returns(T::Hash[String, T.untyped]) }
12
+ def self.metadata(path)
13
+ end
14
+
4
15
  # Options:
5
16
  # - `input`: String, File, or IO object containing parquet data
6
17
  # - `result_type`: String specifying the output format
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-27 00:00:00.000000000 Z
11
+ date: 2025-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -55,6 +55,7 @@ files:
55
55
  - README.md
56
56
  - Rakefile
57
57
  - ext/parquet/Cargo.toml
58
+ - ext/parquet/build.rs
58
59
  - ext/parquet/extconf.rb
59
60
  - ext/parquet/src/allocator.rs
60
61
  - ext/parquet/src/enumerator.rs