parquet 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +9 -1
- data/ext/parquet/Cargo.toml +4 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/reader/common.rs +7 -6
- data/ext/parquet/src/reader/mod.rs +204 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
- data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
- data/ext/parquet/src/ruby_reader.rs +11 -24
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +8 -5
- data/ext/parquet/src/types/parquet_value.rs +204 -7
- data/ext/parquet/src/types/record_types.rs +31 -8
- data/ext/parquet/src/types/schema_converter.rs +118 -11
- data/ext/parquet/src/types/schema_node.rs +83 -2
- data/ext/parquet/src/types/timestamp.rs +6 -10
- data/ext/parquet/src/types/type_conversion.rs +84 -11
- data/ext/parquet/src/types/writer_types.rs +40 -11
- data/ext/parquet/src/utils.rs +6 -6
- data/ext/parquet/src/writer/mod.rs +25 -18
- data/ext/parquet/src/writer/write_columns.rs +27 -24
- data/ext/parquet/src/writer/write_rows.rs +17 -16
- data/lib/parquet/schema.rb +77 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +11 -0
- metadata +3 -2
data/ext/parquet/src/writer/write_columns.rs CHANGED
@@ -11,12 +11,12 @@ use crate::{
 use crate::{types::PrimitiveType, SchemaNode};
 use arrow_array::{Array, RecordBatch};
 use magnus::{value::ReprValue, Error as MagnusError, RArray, Ruby, Value};
-use std::sync::Arc;
+use std::{rc::Rc, sync::Arc};
 
 #[inline]
 pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    write_columns_impl(Arc::new(ruby), args).map_err(|e| {
+    write_columns_impl(Rc::new(ruby), args).map_err(|e| {
         let z: MagnusError = e.into();
         z
     })?;
@@ -24,7 +24,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
 }
 
 #[inline]
-fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
     let ParquetWriteArgs {
         read_from,
         write_to,
@@ -94,7 +94,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
     };
 
     if batch_array.len() != schema_len {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             format!(
                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
@@ -118,7 +118,7 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
         ))?,
     };
     if top_fields.len() != fields.len() {
-
+        Err(MagnusError::new(
             magnus::exception::runtime_error(),
             "Mismatch top-level DSL fields vs Arrow fields",
         ))?;
@@ -140,31 +140,34 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
                 parquet_type,
                 // Format is handled internally now
                 ..
-            } => match parquet_type {
-
-
-
-
-
-
-
-
-
+            } => match *parquet_type {
+                PrimitiveType::Int8 => PST::Primitive(PrimitiveType::Int8),
+                PrimitiveType::Int16 => PST::Primitive(PrimitiveType::Int16),
+                PrimitiveType::Int32 => PST::Primitive(PrimitiveType::Int32),
+                PrimitiveType::Int64 => PST::Primitive(PrimitiveType::Int64),
+                PrimitiveType::UInt8 => PST::Primitive(PrimitiveType::UInt8),
+                PrimitiveType::UInt16 => PST::Primitive(PrimitiveType::UInt16),
+                PrimitiveType::UInt32 => PST::Primitive(PrimitiveType::UInt32),
+                PrimitiveType::UInt64 => PST::Primitive(PrimitiveType::UInt64),
+                PrimitiveType::Float32 => {
                     PST::Primitive(PrimitiveType::Float32)
                 }
-
+                PrimitiveType::Float64 => {
                     PST::Primitive(PrimitiveType::Float64)
                 }
-
-
-
+                PrimitiveType::Decimal128(precision, scale) => {
+                    PST::Primitive(PrimitiveType::Decimal128(precision, scale))
+                }
+                PrimitiveType::String => PST::Primitive(PrimitiveType::String),
+                PrimitiveType::Binary => PST::Primitive(PrimitiveType::Binary),
+                PrimitiveType::Boolean => {
                     PST::Primitive(PrimitiveType::Boolean)
                 }
-
-
+                PrimitiveType::Date32 => PST::Primitive(PrimitiveType::Date32),
+                PrimitiveType::TimestampMillis => {
                     PST::Primitive(PrimitiveType::TimestampMillis)
                 }
-
+                PrimitiveType::TimestampMicros => {
                     PST::Primitive(PrimitiveType::TimestampMicros)
                 }
             },
@@ -205,12 +208,12 @@ fn write_columns_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemE
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
                         break;
                     }
-
+                    Err(e)?;
                 }
             }
         }
     } else {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             "read_from must be an Enumerator".to_string(),
         ))?;
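The new `Decimal128` arm rounds out the primitive-type mapping used by the column writer. For orientation only, here is a hedged Ruby sketch of the corresponding user-facing call: `Parquet::Schema.define`, the `write_columns(batches, schema:, write_to:)` call shape, and the use of `BigDecimal` values are assumptions about the gem's public API and are not part of this diff; the Enumerator requirement and the one-column-array-per-schema-field batch shape follow from the checks shown above.

```ruby
require "parquet"
require "bigdecimal"

# Assumed schema DSL; the :decimal options come from the schema.rb changes below.
schema = Parquet::Schema.define do
  field :id, :int64, nullable: false
  field :price, :decimal, precision: 10, scale: 2
end

# Each yielded batch is an array of column arrays, one per schema field,
# matching the "Batch column count ... does not match schema length" check.
batches = Enumerator.new do |yielder|
  yielder << [[1, 2, 3], [BigDecimal("9.99"), BigDecimal("19.99"), BigDecimal("5.00")]]
end

# Assumed call shape for the column-based writer.
Parquet.write_columns(batches, schema: schema, write_to: "prices.parquet")
```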
data/ext/parquet/src/writer/write_rows.rs CHANGED
@@ -1,7 +1,7 @@
 use super::{
     build_column_collectors_from_dsl, copy_temp_file_to_io_like, create_writer,
     parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE,
-
+    SAMPLE_SIZE,
 };
 use crate::{
     logger::RubyLogger,
@@ -16,12 +16,14 @@ use magnus::{
     value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
 };
 use rand::Rng;
-use std::sync::Arc;
+use std::{rc::Rc, sync::Arc};
+
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
 
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-    write_rows_impl(Arc::new(ruby), args).map_err(|e| {
+    write_rows_impl(Rc::new(ruby), args).map_err(|e| {
         let z: MagnusError = e.into();
         z
     })?;
@@ -29,7 +31,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 }
 
 #[inline]
-fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+fn write_rows_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
     let ParquetWriteArgs {
         read_from,
         write_to,
@@ -81,8 +83,8 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
                 })?;
                 let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
                 size_samples.push(row_size);
-            } else if rng.random_range(0..=total_rows) < sample_size
-                let idx = rng.random_range(0..sample_size
+            } else if rng.random_range(0..=total_rows) < sample_size {
+                let idx = rng.random_range(0..sample_size);
                 let row_array = RArray::from_value(row).ok_or_else(|| {
                     MagnusError::new(ruby.exception_type_error(), "Row must be an array")
                 })?;
@@ -113,12 +115,12 @@ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemErro
                         }
                         break;
                     }
-
+                    Err(e)?;
                 }
             }
         }
     } else {
-
+        Err(MagnusError::new(
             magnus::exception::type_error(),
             "read_from must be an Enumerator".to_string(),
         ))?;
@@ -255,6 +257,7 @@ pub fn estimate_value_size(
         | PST::Primitive(PrimitiveType::UInt64)
         | PST::Primitive(PrimitiveType::Float64) => Ok(8),
         PST::Primitive(PrimitiveType::Boolean) => Ok(1),
+        PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
         PST::Primitive(PrimitiveType::Date32)
         | PST::Primitive(PrimitiveType::TimestampMillis)
         | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
@@ -427,15 +430,13 @@ pub fn estimate_value_size(
                     if let Some(field_value) = hash.get(&*field.name) {
                         total_fields_size +=
                             estimate_value_size(field_value, &field.type_)?;
+                    } else if field.nullable {
+                        total_fields_size += 0;
                     } else {
-
-
-
-
-                        magnus::exception::runtime_error(),
-                        format!("Missing field: {} in hash {:?}", field.name, hash),
-                        ));
-                    }
+                        return Err(MagnusError::new(
+                            magnus::exception::runtime_error(),
+                            format!("Missing field: {} in hash {:?}", field.name, hash),
+                        ));
                     }
                 }
             }
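On the row-based path, the same "read_from must be an Enumerator" check applies, and `estimate_value_size` now accounts for `Decimal128` values at 16 bytes. A hedged Ruby sketch of the corresponding call follows; the `write_rows(rows, schema:, write_to:)` shape and the `BigDecimal` values are assumptions about the gem's public API, not part of this diff.

```ruby
require "parquet"
require "bigdecimal"

# Assumed schema DSL; precision/scale handling is added in schema.rb below.
schema = Parquet::Schema.define do
  field :id, :int64, nullable: false
  field :amount, :decimal, precision: 18, scale: 4
end

# read_from must be an Enumerator; each row is an array with one value per field.
rows = Enumerator.new do |yielder|
  yielder << [1, BigDecimal("100.5")]
  yielder << [2, BigDecimal("0.0001")]
end

# Assumed call shape for the row-based writer.
Parquet.write_rows(rows, schema: schema, write_to: "amounts.parquet")
```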
data/lib/parquet/schema.rb CHANGED
@@ -11,6 +11,9 @@ module Parquet
 #   field :id, :int64, nullable: false  # ID cannot be null
 #   field :name, :string  # Default nullable: true
 #
+#   # Decimal field with precision and scale
+#   field :price, :decimal, precision: 10, scale: 2
+#
 #   # List with non-nullable items
 #   field :scores, :list, item: :float, item_nullable: false
 #
@@ -45,7 +48,7 @@ module Parquet
 
     # Define a field in the schema
     # @param name [String, Symbol] field name
-    # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
+    # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
     # @param nullable [Boolean] whether the field can be null (default: true)
     # @param kwargs [Hash] additional options depending on type
     #
@@ -55,6 +58,7 @@ module Parquet
     # - `key:, value:` if type == :map
     # - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
     # - `format:` if you want to store some format string
+    # - `precision:, scale:` if type == :decimal (precision defaults to 18, scale to 2)
     # - `nullable:` default to true if not specified
     def field(name, type, nullable: true, **kwargs, &block)
       field_hash = { name: name.to_s, type: type, nullable: !!nullable }
@@ -73,7 +77,15 @@ module Parquet
         raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
         # Pass item_nullable if provided, otherwise use true as default
         item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]
-
+
+        # Pass precision and scale if type is decimal
+        if item_type == :decimal
+          precision = kwargs[:precision]
+          scale = kwargs[:scale]
+          field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, precision: precision, scale: scale, &block)
+        else
+          field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
+        end
       when :map
         # user must specify key:, value:
         key_type = kwargs[:key]
@@ -82,8 +94,41 @@ module Parquet
         # Pass key_nullable and value_nullable if provided, otherwise use true as default
         key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
         value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
+
         field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
-
+
+        # Pass precision and scale if value type is decimal
+        if value_type == :decimal
+          precision = kwargs[:precision]
+          scale = kwargs[:scale]
+          field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, precision: precision, scale: scale, &block)
+        else
+          field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
+        end
+      when :decimal
+        # Store precision and scale for decimal type according to rules:
+        # 1. When neither precision nor scale is provided, use maximum precision (38)
+        # 2. When only precision is provided, scale defaults to 0
+        # 3. When only scale is provided, use maximum precision (38)
+        # 4. When both are provided, use the provided values
+
+        if kwargs[:precision].nil? && kwargs[:scale].nil?
+          # No precision or scale provided - use maximum precision
+          field_hash[:precision] = 38
+          field_hash[:scale] = 0
+        elsif kwargs[:precision] && kwargs[:scale].nil?
+          # Precision only - scale defaults to 0
+          field_hash[:precision] = kwargs[:precision]
+          field_hash[:scale] = 0
+        elsif kwargs[:precision].nil? && kwargs[:scale]
+          # Scale only - use maximum precision
+          field_hash[:precision] = 38
+          field_hash[:scale] = kwargs[:scale]
+        else
+          # Both provided
+          field_hash[:precision] = kwargs[:precision]
+          field_hash[:scale] = kwargs[:scale]
+        end
       else
         # primitive type: :int32, :int64, :string, etc.
         # do nothing else special
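The four defaulting rules added to the `:decimal` branch can be read off directly from the code above. An illustrative summary (the `Parquet::Schema.define` wrapper is an assumption; the stored `:precision`/`:scale` values follow the added branch):

```ruby
# Each declaration and the precision/scale it stores under the new :decimal branch.
Parquet::Schema.define do
  field :a, :decimal                           # precision: 38, scale: 0 (rule 1)
  field :b, :decimal, precision: 10            # precision: 10, scale: 0 (rule 2)
  field :c, :decimal, scale: 2                 # precision: 38, scale: 2 (rule 3)
  field :d, :decimal, precision: 10, scale: 2  # precision: 10, scale: 2 (rule 4)
end
```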
@@ -122,7 +167,7 @@ module Parquet
     # If user said: field "something", :list, item: :struct do ... end
     # we want to recursively parse that sub-struct from the block.
     # So wrap_subtype might be:
-    def wrap_subtype(t, nullable: true, &block)
+    def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
       if t == :struct
         sub_builder = SchemaBuilder.new
         sub_builder.instance_eval(&block) if block
@@ -144,6 +189,34 @@ module Parquet
         end
 
         { type: :list, nullable: nullable, name: "item", item: sub_builder.fields[0] }
+      elsif t == :decimal
+        # Handle decimal type with precision and scale
+        result = { type: t, nullable: nullable, name: "item" }
+
+        # Follow the same rules as in field() method:
+        # 1. When neither precision nor scale is provided, use maximum precision (38)
+        # 2. When only precision is provided, scale defaults to 0
+        # 3. When only scale is provided, use maximum precision (38)
+        # 4. When both are provided, use the provided values
+        if precision.nil? && scale.nil?
+          # No precision or scale provided - use maximum precision
+          result[:precision] = 38
+          result[:scale] = 0
+        elsif precision && scale.nil?
+          # Precision only - scale defaults to 0
+          result[:precision] = precision
+          result[:scale] = 0
+        elsif precision.nil? && scale
+          # Scale only - use maximum precision
+          result[:precision] = 38
+          result[:scale] = scale
+        else
+          # Both provided
+          result[:precision] = precision
+          result[:scale] = scale
+        end
+
+        result
       else
         # e.g. :int32 => { type: :int32, nullable: true }
         { type: t, nullable: nullable, name: "item" }
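Because `field` now forwards `precision:`/`scale:` to `wrap_subtype` for `:list` items and `:map` values, decimals can also appear as nested element types. An illustrative sketch (the `Parquet::Schema.define` wrapper is an assumption; the keyword arguments come from the changes above):

```ruby
Parquet::Schema.define do
  # List whose items are decimals; precision/scale are forwarded to wrap_subtype.
  field :prices, :list, item: :decimal, precision: 10, scale: 2

  # Map whose values are decimals; the :decimal value type picks up the same options.
  field :totals, :map, key: :string, value: :decimal, precision: 18, scale: 4
end
```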
data/lib/parquet/version.rb CHANGED
data/lib/parquet.rbi CHANGED
@@ -1,6 +1,17 @@
 # typed: true
 
 module Parquet
+  # Returns metadata information about a Parquet file
+  #
+  # The returned hash contains information about:
+  # - Basic file metadata (num_rows, created_by)
+  # - Schema information (fields, types, etc.)
+  # - Row group details
+  # - Column chunk information (compression, encodings, statistics)
+  sig { params(path: String).returns(T::Hash[String, T.untyped]) }
+  def self.metadata(path)
+  end
+
   # Options:
   # - `input`: String, File, or IO object containing parquet data
   # - `result_type`: String specifying the output format
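The new Sorbet signature documents a `Parquet.metadata` helper that takes a file path and returns a string-keyed hash. A hedged usage sketch follows; only `num_rows` and `created_by` are named in the comment above, and any other keys are not specified in this diff.

```ruby
require "parquet"

info = Parquet.metadata("prices.parquet")

# String-keyed hash per the sig: T::Hash[String, T.untyped]
puts info["num_rows"]
puts info["created_by"]
```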
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.3
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-04-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -55,6 +55,7 @@ files:
 - README.md
 - Rakefile
 - ext/parquet/Cargo.toml
+- ext/parquet/build.rs
 - ext/parquet/extconf.rb
 - ext/parquet/src/allocator.rs
 - ext/parquet/src/enumerator.rs