parquet 0.7.3-aarch64-linux → 0.8.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/README.md +10 -3
- data/lib/parquet/3.2/parquet.so +0 -0
- data/lib/parquet/3.3/parquet.so +0 -0
- data/lib/parquet/3.4/parquet.so +0 -0
- data/lib/parquet/4.0/parquet.so +0 -0
- data/lib/parquet/schema.rb +6 -2
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +54 -12
- metadata +18 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 89db55543839853aef62e3f11511ba4ff54ee1896c7085b83d9e6e44d2b10335
|
|
4
|
+
data.tar.gz: 07db9c35d4c4777d9339f92b22112760c9ff0e96091b7dc26940c54ed4df03b8
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3f78682dc2b7b4d5aa3fa186b7cf743b1808c93644ee2097d76051c21e900da23578493a3d0e637b4bc61dd9ce06a85865fdf46e33a24acf46ff059519b5199f
|
|
7
|
+
data.tar.gz: ca2f9c79cfd0faf50567c7df2d73cbbea508e5ade7234ac8d2791288d0f174d1b34f707e63ef29765cdd6573ca34ac8ee0694f21d3ba46d7a8fd0ee49eb2f3cc
|
data/Gemfile
CHANGED
data/README.md
CHANGED
|
@@ -166,7 +166,7 @@ end
|
|
|
166
166
|
Parquet.write_rows(rows,
|
|
167
167
|
schema: schema,
|
|
168
168
|
write_to: "output.parquet",
|
|
169
|
-
batch_size: 5000 #
|
|
169
|
+
batch_size: 5000 # Positive rows per batch (default: 1000)
|
|
170
170
|
)
|
|
171
171
|
```
|
|
172
172
|
|
|
@@ -200,6 +200,9 @@ Parquet.write_columns(batches.each,
|
|
|
200
200
|
)
|
|
201
201
|
```
|
|
202
202
|
|
|
203
|
+
`write_columns` also accepts `logger:` with the same Ruby logger interface as
|
|
204
|
+
row writes.
|
|
205
|
+
|
|
203
206
|
## Data Types
|
|
204
207
|
|
|
205
208
|
### Basic Types
|
|
@@ -340,7 +343,7 @@ Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
|
|
|
340
343
|
|
|
341
344
|
### Timezone Handling in Parquet
|
|
342
345
|
|
|
343
|
-
|
|
346
|
+
The Parquet specification has a fundamental limitation with timezone storage:
|
|
344
347
|
|
|
345
348
|
1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
|
|
346
349
|
2. **Local/unzoned**: Timestamps without timezone info are stored as-is
|
|
@@ -382,11 +385,15 @@ Control memory usage with flush thresholds:
|
|
|
382
385
|
Parquet.write_rows(huge_dataset.each,
|
|
383
386
|
schema: schema,
|
|
384
387
|
write_to: "output.parquet",
|
|
385
|
-
batch_size: 1000, #
|
|
388
|
+
batch_size: 1000, # Positive rows before considering flush
|
|
386
389
|
flush_threshold: 32 * 1024**2 # Flush if batch exceeds 32MB
|
|
387
390
|
)
|
|
388
391
|
```
|
|
389
392
|
|
|
393
|
+
Write batch and sample sizes are bounded before buffer allocation. Very large
|
|
394
|
+
batch sizes are rejected, and wide schemas have a lower effective batch cap so
|
|
395
|
+
the writer cannot reserve unbounded per-column value slots.
|
|
396
|
+
|
|
390
397
|
## Architecture
|
|
391
398
|
|
|
392
399
|
This gem uses a modular architecture:
|
data/lib/parquet/3.2/parquet.so
CHANGED
|
Binary file
|
data/lib/parquet/3.3/parquet.so
CHANGED
|
Binary file
|
data/lib/parquet/3.4/parquet.so
CHANGED
|
Binary file
|
data/lib/parquet/4.0/parquet.so
ADDED
|
Binary file
|
data/lib/parquet/schema.rb
CHANGED
|
@@ -116,8 +116,12 @@ module Parquet
|
|
|
116
116
|
key_type = kwargs[:key]
|
|
117
117
|
value_type = kwargs[:value]
|
|
118
118
|
raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
|
|
119
|
-
#
|
|
120
|
-
|
|
119
|
+
# Map keys are required by the Parquet spec. Reject an explicit nullable
|
|
120
|
+
# key at this boundary rather than letting it fail deep in the writer.
|
|
121
|
+
if kwargs[:key_nullable]
|
|
122
|
+
raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
|
|
123
|
+
end
|
|
124
|
+
key_nullable = false
|
|
121
125
|
value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
|
|
122
126
|
|
|
123
127
|
field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
|
@@ -18,12 +18,29 @@ module Parquet
|
|
|
18
18
|
# ("hash" or "array" or :hash or :array)
|
|
19
19
|
# - `columns`: When present, only the specified columns will be included in the output.
|
|
20
20
|
# This is useful for reducing how much data is read and improving performance.
|
|
21
|
+
# - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
|
|
22
|
+
# (struct field names and top-level column names) are always interned and
|
|
23
|
+
# reused regardless of this setting.
|
|
24
|
+
# - `:copy` allocates a fresh mutable String per value.
|
|
25
|
+
# - `:intern` deduplicates low-cardinality equal values into frozen interned
|
|
26
|
+
# Strings up to a bounded per-read cache, then falls back to frozen copies.
|
|
27
|
+
# A transient copy still happens per value, so it is not a per-value speedup.
|
|
28
|
+
# - `:shared` returns frozen, zero-copy strings backed by Rust memory for
|
|
29
|
+
# short, repeated, low-cardinality values. Each read returns at most the
|
|
30
|
+
# configured number of shared values and only values up to the configured
|
|
31
|
+
# byte size; values past those bounds become frozen copies. New process-wide
|
|
32
|
+
# leaks are also capped by the requested budget and hard process ceilings.
|
|
33
|
+
# All `:shared` results are frozen. Not recommended for high-cardinality or
|
|
34
|
+
# large-blob string columns.
|
|
35
|
+
# Pass a hash to set the `:shared` leak budget, e.g.
|
|
36
|
+
# `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
|
|
21
37
|
sig do
|
|
22
38
|
params(
|
|
23
39
|
input: T.any(String, File, StringIO, IO),
|
|
24
40
|
result_type: T.nilable(T.any(String, Symbol)),
|
|
25
41
|
columns: T.nilable(T::Array[String]),
|
|
26
|
-
strict: T.nilable(T::Boolean)
|
|
42
|
+
strict: T.nilable(T::Boolean),
|
|
43
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
|
|
27
44
|
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
28
45
|
end
|
|
29
46
|
sig do
|
|
@@ -32,10 +49,11 @@ module Parquet
|
|
|
32
49
|
result_type: T.nilable(T.any(String, Symbol)),
|
|
33
50
|
columns: T.nilable(T::Array[String]),
|
|
34
51
|
strict: T.nilable(T::Boolean),
|
|
52
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
|
|
35
53
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
|
|
36
54
|
).returns(NilClass)
|
|
37
55
|
end
|
|
38
|
-
def self.each_row(input, result_type: nil, columns: nil, strict: nil, &blk)
|
|
56
|
+
def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
|
|
39
57
|
end
|
|
40
58
|
|
|
41
59
|
# Options:
|
|
@@ -44,13 +62,16 @@ module Parquet
|
|
|
44
62
|
# ("hash" or "array" or :hash or :array)
|
|
45
63
|
# - `columns`: When present, only the specified columns will be included in the output.
|
|
46
64
|
# - `batch_size`: When present, specifies the number of rows per batch
|
|
65
|
+
# - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
|
|
66
|
+
# or `:shared`). See `each_row` for the semantics of each mode.
|
|
47
67
|
sig do
|
|
48
68
|
params(
|
|
49
69
|
input: T.any(String, File, StringIO, IO),
|
|
50
70
|
result_type: T.nilable(T.any(String, Symbol)),
|
|
51
71
|
columns: T.nilable(T::Array[String]),
|
|
52
72
|
batch_size: T.nilable(Integer),
|
|
53
|
-
strict: T.nilable(T::Boolean)
|
|
73
|
+
strict: T.nilable(T::Boolean),
|
|
74
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
|
|
54
75
|
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
55
76
|
end
|
|
56
77
|
sig do
|
|
@@ -60,11 +81,12 @@ module Parquet
|
|
|
60
81
|
columns: T.nilable(T::Array[String]),
|
|
61
82
|
batch_size: T.nilable(Integer),
|
|
62
83
|
strict: T.nilable(T::Boolean),
|
|
84
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
|
|
63
85
|
blk:
|
|
64
86
|
T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
|
|
65
87
|
).returns(NilClass)
|
|
66
88
|
end
|
|
67
|
-
def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, &blk)
|
|
89
|
+
def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
|
|
68
90
|
end
|
|
69
91
|
|
|
70
92
|
# Options:
|
|
@@ -79,11 +101,19 @@ module Parquet
|
|
|
79
101
|
# - `date32`
|
|
80
102
|
# - `timestamp_millis`, `timestamp_micros`
|
|
81
103
|
# - `write_to`: String path or IO object to write the parquet file to
|
|
82
|
-
# - `batch_size`: Optional batch size for writing (defaults to 1000
|
|
83
|
-
#
|
|
104
|
+
# - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
|
|
105
|
+
# for one-column schemas; wide schemas may have a lower safety cap)
|
|
106
|
+
# - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
|
|
107
|
+
# buffer before a row group is flushed (defaults to 100MB)
|
|
84
108
|
# - `compression`: Optional compression type to use (defaults to "zstd")
|
|
85
109
|
# Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
|
|
86
|
-
# - `sample_size`: Optional number of rows to sample for size estimation
|
|
110
|
+
# - `sample_size`: Optional positive number of rows to sample for size estimation
|
|
111
|
+
# (defaults to 100, at most 10_000)
|
|
112
|
+
# - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
|
|
113
|
+
# disables it, `true` enables it with a default capacity, and an Integer
|
|
114
|
+
# enables it with that many retained distinct strings (at most 65_536).
|
|
115
|
+
# Retention also skips values larger than 4KB and stops after 16MB of
|
|
116
|
+
# cached string content.
|
|
87
117
|
sig do
|
|
88
118
|
params(
|
|
89
119
|
read_from: T::Enumerator[T::Array[T.untyped]],
|
|
@@ -92,7 +122,8 @@ module Parquet
|
|
|
92
122
|
batch_size: T.nilable(Integer),
|
|
93
123
|
flush_threshold: T.nilable(Integer),
|
|
94
124
|
compression: T.nilable(String),
|
|
95
|
-
sample_size: T.nilable(Integer)
|
|
125
|
+
sample_size: T.nilable(Integer),
|
|
126
|
+
string_cache: T.nilable(T.any(T::Boolean, Integer))
|
|
96
127
|
).void
|
|
97
128
|
end
|
|
98
129
|
def self.write_rows(
|
|
@@ -102,7 +133,8 @@ module Parquet
|
|
|
102
133
|
batch_size: nil,
|
|
103
134
|
flush_threshold: nil,
|
|
104
135
|
compression: nil,
|
|
105
|
-
sample_size: nil
|
|
136
|
+
sample_size: nil,
|
|
137
|
+
string_cache: nil
|
|
106
138
|
)
|
|
107
139
|
end
|
|
108
140
|
|
|
@@ -119,18 +151,28 @@ module Parquet
|
|
|
119
151
|
# - `timestamp_millis`, `timestamp_micros`
|
|
120
152
|
# - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
|
|
121
153
|
# - `write_to`: String path or IO object to write the parquet file to
|
|
122
|
-
# - `flush_threshold`: Optional
|
|
154
|
+
# - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
|
|
155
|
+
# buffer before a row group is flushed (defaults to 100MB)
|
|
123
156
|
# - `compression`: Optional compression type to use (defaults to "zstd")
|
|
124
157
|
# Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
|
|
158
|
+
# - `logger`: Optional Ruby logger for column-write progress messages
|
|
125
159
|
sig do
|
|
126
160
|
params(
|
|
127
161
|
read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
|
|
128
162
|
schema: T::Array[T::Hash[String, String]],
|
|
129
163
|
write_to: T.any(String, IO),
|
|
130
164
|
flush_threshold: T.nilable(Integer),
|
|
131
|
-
compression: T.nilable(String)
|
|
165
|
+
compression: T.nilable(String),
|
|
166
|
+
logger: T.nilable(T.untyped)
|
|
132
167
|
).void
|
|
133
168
|
end
|
|
134
|
-
def self.write_columns(
|
|
169
|
+
def self.write_columns(
|
|
170
|
+
read_from,
|
|
171
|
+
schema:,
|
|
172
|
+
write_to:,
|
|
173
|
+
flush_threshold: nil,
|
|
174
|
+
compression: nil,
|
|
175
|
+
logger: nil
|
|
176
|
+
)
|
|
135
177
|
end
|
|
136
178
|
end
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parquet
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.7.3
|
|
4
|
+
version: 0.8.0
|
|
5
5
|
platform: aarch64-linux
|
|
6
6
|
authors:
|
|
7
7
|
- Nathan Jaremko
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-06-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bigdecimal
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: rake-compiler
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -42,6 +56,7 @@ files:
|
|
|
42
56
|
- lib/parquet/3.2/parquet.so
|
|
43
57
|
- lib/parquet/3.3/parquet.so
|
|
44
58
|
- lib/parquet/3.4/parquet.so
|
|
59
|
+
- lib/parquet/4.0/parquet.so
|
|
45
60
|
- lib/parquet/schema.rb
|
|
46
61
|
- lib/parquet/version.rb
|
|
47
62
|
homepage: https://github.com/njaremko/parquet-ruby
|
|
@@ -65,7 +80,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
65
80
|
version: '3.2'
|
|
66
81
|
- - "<"
|
|
67
82
|
- !ruby/object:Gem::Version
|
|
68
|
-
version:
|
|
83
|
+
version: 4.1.dev
|
|
69
84
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
70
85
|
requirements:
|
|
71
86
|
- - ">="
|