parquet-tyfoom 0.8.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/lib/parquet/3.2/parquet.bundle +0 -0
- data/lib/parquet/3.3/parquet.bundle +0 -0
- data/lib/parquet/3.4/parquet.bundle +0 -0
- data/lib/parquet/4.0/parquet.bundle +0 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +100 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 80b41a76b9346c3366df0dba155290ce22cba5c3e1e7df203e04c0c0b356d5f1
|
|
4
|
+
data.tar.gz: c02f028d0a73de7f646ed6c2587f2e0143631f4d2c9af1e1745e4fbbdef00c92
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 7eca7c5e77388632071783d6fcc4cf0d5e043886ce91a9a674cb99a22710ffa6209b65c1a44ff733aadf5b44f4dda2097158582be1045ff22c8765c4b951920e
|
|
7
|
+
data.tar.gz: c57c21105860993cf7ad098bb26c25a098cc5dc7a5097d85df1edef52cfb1cce9850ae2ddae7619e913d2ae5bd428d61cde47dd784a367e8336a83aa2f4d84ed
|
data/Gemfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
source "https://rubygems.org"

# Build and runtime tooling shared by every environment.
gem "rb_sys", "~> 0.9.56"
gem "rake"
gem "bigdecimal"

# Pull the gem's own dependencies from the local gemspec.
gemspec

# Development-only helpers. The commented entries are kept for reference
# and are currently disabled.
# gem "benchmark-ips", "~> 2.12", group: :development
# gem "polars-df", group: :development
# gem "duckdb", group: :development
gem "benchmark-memory", group: :development

# Test-only dependencies.
gem "csv", group: :test
gem "logger", group: :test
gem "minitest", "~> 5.0", group: :test
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Nathan Jaremko
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
# parquet-ruby
|
|
2
|
+
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
|
|
4
|
+
|
|
5
|
+
Read and write [Apache Parquet](https://parquet.apache.org/) files from Ruby. This gem wraps the official Apache [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) rust crate, providing:
|
|
6
|
+
|
|
7
|
+
- **High performance** columnar data storage and retrieval
|
|
8
|
+
- **Memory-efficient** streaming APIs for large datasets
|
|
9
|
+
- **Full compatibility** with the Apache Parquet specification
|
|
10
|
+
- **Simple, Ruby-native** APIs that feel natural
|
|
11
|
+
|
|
12
|
+
## Why Use This Library?
|
|
13
|
+
|
|
14
|
+
Apache Parquet is the de facto standard for analytical data storage, offering:
|
|
15
|
+
- **Efficient compression** - typically 2-10x smaller than CSV
|
|
16
|
+
- **Fast columnar access** - read only the columns you need
|
|
17
|
+
- **Rich type system** - preserves data types, including nested structures
|
|
18
|
+
- **Wide ecosystem support** - works with Spark, Pandas, DuckDB, and more
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Add this line to your application's Gemfile:
|
|
23
|
+
|
|
24
|
+
```ruby
|
|
25
|
+
gem 'parquet'
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Then execute:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
$ bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or install it directly:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
$ gem install parquet
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### Reading Data
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
require "parquet"
|
|
46
|
+
|
|
47
|
+
# Read Parquet files row by row
|
|
48
|
+
Parquet.each_row("data.parquet") do |row|
|
|
49
|
+
puts row # => {"id" => 1, "name" => "Alice", "score" => 95.5}
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Or column by column for better performance
|
|
53
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
54
|
+
puts batch # => {"id" => [1, 2, ...], "name" => ["Alice", "Bob", ...]}
|
|
55
|
+
end
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Writing Data
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
# Define your schema
|
|
62
|
+
schema = [
|
|
63
|
+
{ "id" => "int64" },
|
|
64
|
+
{ "name" => "string" },
|
|
65
|
+
{ "score" => "double" }
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# Write row by row
|
|
69
|
+
rows = [
|
|
70
|
+
[1, "Alice", 95.5],
|
|
71
|
+
[2, "Bob", 82.3]
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
Parquet.write_rows(rows.each, schema: schema, write_to: "output.parquet")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Reading Parquet Files
|
|
78
|
+
|
|
79
|
+
The library provides two APIs for reading data, each optimized for different use cases:
|
|
80
|
+
|
|
81
|
+
### Row-wise Reading (Sequential Access)
|
|
82
|
+
|
|
83
|
+
Best for: Processing records one at a time, data transformations, ETL pipelines
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
# Basic usage - returns hashes
|
|
87
|
+
Parquet.each_row("data.parquet") do |row|
|
|
88
|
+
puts row # => {"id" => 1, "name" => "Alice"}
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Memory-efficient array format
|
|
92
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
|
93
|
+
puts row # => [1, "Alice"]
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Read specific columns only
|
|
97
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
|
98
|
+
# Only requested columns are loaded from disk
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Works with IO objects
|
|
102
|
+
File.open("data.parquet", "rb") do |file|
|
|
103
|
+
Parquet.each_row(file) do |row|
|
|
104
|
+
# Process row
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Column-wise Reading (Analytical Access)
|
|
110
|
+
|
|
111
|
+
Best for: Analytics, aggregations, when you need few columns from wide tables
|
|
112
|
+
|
|
113
|
+
```ruby
|
|
114
|
+
# Process data in column batches
|
|
115
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
116
|
+
# batch is a hash of column_name => array_of_values
|
|
117
|
+
ids = batch["id"] # => [1, 2, 3, ..., 1000]
|
|
118
|
+
names = batch["name"] # => ["Alice", "Bob", ...]
|
|
119
|
+
|
|
120
|
+
# Perform columnar operations
|
|
121
|
+
avg_id = ids.sum.to_f / ids.length
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
# Array format for more control
|
|
125
|
+
Parquet.each_column("data.parquet",
|
|
126
|
+
result_type: :array,
|
|
127
|
+
columns: ["id", "name"]) do |batch|
|
|
128
|
+
# batch is an array of arrays
|
|
129
|
+
# [[1, 2, ...], ["Alice", "Bob", ...]]
|
|
130
|
+
end
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### File Metadata
|
|
134
|
+
|
|
135
|
+
Inspect file structure without reading data:
|
|
136
|
+
|
|
137
|
+
```ruby
|
|
138
|
+
metadata = Parquet.metadata("data.parquet")
|
|
139
|
+
|
|
140
|
+
puts metadata["num_rows"] # Total row count
|
|
141
|
+
puts metadata["created_by"] # Writer identification
|
|
142
|
+
puts metadata["schema"]["fields"] # Column definitions
|
|
143
|
+
puts metadata["row_groups"].size # Number of row groups
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Writing Parquet Files
|
|
147
|
+
|
|
148
|
+
### Row-wise Writing
|
|
149
|
+
|
|
150
|
+
Best for: Streaming data, converting from other formats, memory-constrained environments
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
# Basic schema definition
|
|
154
|
+
schema = [
|
|
155
|
+
{ "id" => "int64" },
|
|
156
|
+
{ "name" => "string" },
|
|
157
|
+
{ "active" => "boolean" },
|
|
158
|
+
{ "balance" => "double" }
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
# Stream data from any enumerable
|
|
162
|
+
rows = CSV.foreach("input.csv").map do |row|
|
|
163
|
+
[row[0].to_i, row[1], row[2] == "true", row[3].to_f]
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
Parquet.write_rows(rows,
|
|
167
|
+
schema: schema,
|
|
168
|
+
write_to: "output.parquet",
|
|
169
|
+
batch_size: 5000 # Rows per batch; must be positive (default: 1000)
|
|
170
|
+
)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Column-wise Writing
|
|
174
|
+
|
|
175
|
+
Best for: Pre-columnar data, better compression, higher performance
|
|
176
|
+
|
|
177
|
+
```ruby
|
|
178
|
+
# Prepare columnar data
|
|
179
|
+
ids = [1, 2, 3, 4, 5]
|
|
180
|
+
names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]
|
|
181
|
+
scores = [95.5, 82.3, 88.7, 91.2, 79.8]
|
|
182
|
+
|
|
183
|
+
# Create batches
|
|
184
|
+
batches = [[
|
|
185
|
+
ids, # First column
|
|
186
|
+
names, # Second column
|
|
187
|
+
scores # Third column
|
|
188
|
+
]]
|
|
189
|
+
|
|
190
|
+
schema = [
|
|
191
|
+
{ "id" => "int64" },
|
|
192
|
+
{ "name" => "string" },
|
|
193
|
+
{ "score" => "double" }
|
|
194
|
+
]
|
|
195
|
+
|
|
196
|
+
Parquet.write_columns(batches.each,
|
|
197
|
+
schema: schema,
|
|
198
|
+
write_to: "output.parquet",
|
|
199
|
+
compression: "snappy" # Options: none, snappy, gzip, lz4, zstd
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`write_columns` also accepts `logger:` with the same Ruby logger interface as
|
|
204
|
+
row writes.
|
|
205
|
+
|
|
206
|
+
## Data Types
|
|
207
|
+
|
|
208
|
+
### Basic Types
|
|
209
|
+
|
|
210
|
+
```ruby
|
|
211
|
+
schema = [
|
|
212
|
+
# Integers
|
|
213
|
+
{ "tiny" => "int8" }, # -128 to 127
|
|
214
|
+
{ "small" => "int16" }, # -32,768 to 32,767
|
|
215
|
+
{ "medium" => "int32" }, # ±2 billion
|
|
216
|
+
{ "large" => "int64" }, # ±9 quintillion
|
|
217
|
+
|
|
218
|
+
# Unsigned integers
|
|
219
|
+
{ "ubyte" => "uint8" }, # 0 to 255
|
|
220
|
+
{ "ushort" => "uint16" }, # 0 to 65,535
|
|
221
|
+
{ "uint" => "uint32" }, # 0 to 4 billion
|
|
222
|
+
{ "ulong" => "uint64" }, # 0 to 18 quintillion
|
|
223
|
+
|
|
224
|
+
# Floating point
|
|
225
|
+
{ "price" => "float" }, # 32-bit precision
|
|
226
|
+
{ "amount" => "double" }, # 64-bit precision
|
|
227
|
+
|
|
228
|
+
# Other basics
|
|
229
|
+
{ "name" => "string" },
|
|
230
|
+
{ "data" => "binary" },
|
|
231
|
+
{ "active" => "boolean" }
|
|
232
|
+
]
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Date and Time Types
|
|
236
|
+
|
|
237
|
+
```ruby
|
|
238
|
+
schema = [
|
|
239
|
+
# Date (days since Unix epoch)
|
|
240
|
+
{ "date" => "date32" },
|
|
241
|
+
|
|
242
|
+
# Timestamps (with different precisions)
|
|
243
|
+
{ "created_sec" => "timestamp_second" },
|
|
244
|
+
{ "created_ms" => "timestamp_millis" }, # Most common
|
|
245
|
+
{ "created_us" => "timestamp_micros" },
|
|
246
|
+
{ "created_ns" => "timestamp_nanos" },
|
|
247
|
+
|
|
248
|
+
# Time of day (without date)
|
|
249
|
+
{ "time_ms" => "time_millis" }, # Milliseconds since midnight
|
|
250
|
+
{ "time_us" => "time_micros" } # Microseconds since midnight
|
|
251
|
+
]
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Decimal Type (Financial Data)
|
|
255
|
+
|
|
256
|
+
For exact decimal arithmetic (no floating-point errors):
|
|
257
|
+
|
|
258
|
+
```ruby
|
|
259
|
+
require "bigdecimal"
|
|
260
|
+
|
|
261
|
+
schema = [
|
|
262
|
+
# Financial amounts with 2 decimal places
|
|
263
|
+
{ "price" => "decimal", "precision" => 10, "scale" => 2 }, # Up to 99,999,999.99
|
|
264
|
+
{ "balance" => "decimal", "precision" => 15, "scale" => 2 }, # Larger amounts
|
|
265
|
+
|
|
266
|
+
# High-precision calculations
|
|
267
|
+
{ "rate" => "decimal", "precision" => 10, "scale" => 8 } # 8 decimal places
|
|
268
|
+
]
|
|
269
|
+
|
|
270
|
+
# Use BigDecimal for exact values
|
|
271
|
+
data = [[
|
|
272
|
+
BigDecimal("19.99"),
|
|
273
|
+
BigDecimal("1234567.89"),
|
|
274
|
+
BigDecimal("0.00000123")
|
|
275
|
+
]]
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Complex Data Structures
|
|
279
|
+
|
|
280
|
+
The library includes a powerful Schema DSL for defining nested data:
|
|
281
|
+
|
|
282
|
+
### Using the Schema DSL
|
|
283
|
+
|
|
284
|
+
```ruby
|
|
285
|
+
schema = Parquet::Schema.define do
|
|
286
|
+
# Simple fields
|
|
287
|
+
field :id, :int64, nullable: false # Required field
|
|
288
|
+
field :name, :string # Optional by default
|
|
289
|
+
|
|
290
|
+
# Nested structure
|
|
291
|
+
field :address, :struct do
|
|
292
|
+
field :street, :string
|
|
293
|
+
field :city, :string
|
|
294
|
+
field :location, :struct do
|
|
295
|
+
field :lat, :double
|
|
296
|
+
field :lng, :double
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
# Lists
|
|
301
|
+
field :tags, :list, item: :string
|
|
302
|
+
field :scores, :list, item: :int32
|
|
303
|
+
|
|
304
|
+
# Maps (dictionaries)
|
|
305
|
+
field :metadata, :map, key: :string, value: :string
|
|
306
|
+
|
|
307
|
+
# Complex combinations
|
|
308
|
+
field :contacts, :list, item: :struct do
|
|
309
|
+
field :name, :string
|
|
310
|
+
field :email, :string
|
|
311
|
+
field :primary, :boolean
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
### Writing Complex Data
|
|
317
|
+
|
|
318
|
+
```ruby
|
|
319
|
+
data = [[
|
|
320
|
+
1, # id
|
|
321
|
+
"Alice Johnson", # name
|
|
322
|
+
{ # address
|
|
323
|
+
"street" => "123 Main St",
|
|
324
|
+
"city" => "Springfield",
|
|
325
|
+
"location" => {
|
|
326
|
+
"lat" => 40.7128,
|
|
327
|
+
"lng" => -74.0060
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
["ruby", "parquet", "data"], # tags
|
|
331
|
+
[85, 92, 88], # scores
|
|
332
|
+
{ "dept" => "Engineering" }, # metadata
|
|
333
|
+
[ # contacts
|
|
334
|
+
{ "name" => "Bob", "email" => "bob@example.com", "primary" => true },
|
|
335
|
+
{ "name" => "Carol", "email" => "carol@example.com", "primary" => false }
|
|
336
|
+
]
|
|
337
|
+
]]
|
|
338
|
+
|
|
339
|
+
Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## ⚠️ Important Limitations
|
|
343
|
+
|
|
344
|
+
### Timezone Handling in Parquet
|
|
345
|
+
|
|
346
|
+
The Parquet specification has a fundamental limitation with timezone storage:
|
|
347
|
+
|
|
348
|
+
1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
|
|
349
|
+
2. **Local/unzoned**: Timestamps without timezone info are stored as-is
|
|
350
|
+
|
|
351
|
+
**The original timezone information is permanently lost.** This is not a limitation of this library but of the Parquet format itself.
|
|
352
|
+
|
|
353
|
+
```ruby
|
|
354
|
+
schema = Parquet::Schema.define do
|
|
355
|
+
# These BOTH store in UTC - timezone info is lost!
|
|
356
|
+
field :timestamp_utc, :timestamp_millis, timezone: "UTC"
|
|
357
|
+
field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00"
|
|
358
|
+
|
|
359
|
+
# This stores as local time (no timezone)
|
|
360
|
+
field :timestamp_local, :timestamp_millis, has_timezone: false
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
# If you need timezone preservation, store it separately:
|
|
364
|
+
schema = Parquet::Schema.define do
|
|
365
|
+
field :timestamp, :timestamp_millis, has_timezone: true # UTC storage
|
|
366
|
+
field :original_tz, :string # "America/New_York"
|
|
367
|
+
end
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
## Performance Tips
|
|
371
|
+
|
|
372
|
+
1. **Use column-wise reading** when you need only a few columns from wide tables
|
|
373
|
+
2. **Specify columns parameter** to avoid reading unnecessary data
|
|
374
|
+
3. **Choose appropriate batch sizes**:
|
|
375
|
+
- Larger batches = better throughput but more memory
|
|
376
|
+
- Smaller batches = less memory but more overhead
|
|
377
|
+
4. **Pre-sort data** by commonly filtered columns for better compression
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
## Memory Management
|
|
381
|
+
|
|
382
|
+
Writes are streamed: an Enumerator (or any Enumerable) passed to `write_rows`
|
|
383
|
+
is consumed in bounded slices rather than materialized up front, and completed
|
|
384
|
+
row groups are flushed to the destination while the input is still being
|
|
385
|
+
enumerated. Peak memory is bounded by `batch_size` and `flush_threshold`, not
|
|
386
|
+
by the total dataset size:
|
|
387
|
+
|
|
388
|
+
```ruby
|
|
389
|
+
Parquet.write_rows(huge_dataset.each,
|
|
390
|
+
schema: schema,
|
|
391
|
+
write_to: "output.parquet",
|
|
392
|
+
batch_size: 1000, # Rows buffered per write batch (also the
|
|
393
|
+
# slice size pulled from an Enumerator)
|
|
394
|
+
flush_threshold: 32 * 1024**2 # Flush a row group to the destination once
|
|
395
|
+
# ~32MB of raw row data is staged (default 100MB)
|
|
396
|
+
)
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
`flush_threshold` bounds both the raw bytes staged since the last flush and
|
|
400
|
+
the writer's encoded in-progress buffer, so row groups reach the destination
|
|
401
|
+
incrementally even when compression shrinks the encoded data dramatically.
|
|
402
|
+
`write_columns` flushes the same way after each batch of columns.
|
|
403
|
+
|
|
404
|
+
When `write_to:` is an IO object instead of a file path, output is staged in a
|
|
405
|
+
temporary file on disk (memory stays bounded) and copied to the IO after the
|
|
406
|
+
write completes, so the IO receives its bytes only at the end.
|
|
407
|
+
|
|
408
|
+
Write batch and sample sizes are bounded before buffer allocation. Very large
|
|
409
|
+
batch sizes are rejected, and wide schemas have a lower effective batch cap so
|
|
410
|
+
the writer cannot reserve unbounded per-column value slots.
|
|
411
|
+
|
|
412
|
+
## Architecture
|
|
413
|
+
|
|
414
|
+
This gem uses a modular architecture:
|
|
415
|
+
|
|
416
|
+
- **parquet-core**: Language-agnostic Rust core for Parquet operations
|
|
417
|
+
- **parquet-ruby-adapter**: Ruby-specific FFI adapter layer
|
|
418
|
+
- **parquet gem**: High-level Ruby API
|
|
419
|
+
|
|
420
|
+
Take a look at [ARCH.md](./ARCH.md)
|
|
421
|
+
|
|
422
|
+
## Contributing
|
|
423
|
+
|
|
424
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/njaremko/parquet-ruby.
|
|
425
|
+
|
|
426
|
+
## License
|
|
427
|
+
|
|
428
|
+
The gem is available as open source under the terms of the MIT License.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rake/testtask"
|
|
4
|
+
require "rb_sys/extensiontask"
|
|
5
|
+
|
|
6
|
+
# Running plain `rake` executes the test suite.
task default: :test

GEMSPEC = Gem::Specification.load("parquet.gemspec")

# Target platforms for the precompiled native extension.
platforms = %w[
  x86_64-linux
  x86_64-linux-musl
  aarch64-linux
  aarch64-linux-musl
  x86_64-darwin
  arm64-darwin
]

# Native extension build, including cross-compilation for every platform above.
RbSys::ExtensionTask.new("parquet", GEMSPEC) do |extension|
  extension.lib_dir = "lib/parquet"
  extension.ext_dir = "ext/parquet"
  extension.cross_compile = true
  extension.cross_platform = platforms

  # For cross-compiled gems, drop the rb_sys dependency and any files
  # matching "ext/*" from the gem specification.
  extension.cross_compiling do |cross_spec|
    cross_spec.dependencies.reject! { |dependency| dependency.name == "rb_sys" }
    cross_spec.files.reject! { |path| File.fnmatch?("ext/*", path, File::FNM_EXTGLOB) }
  end
end

# Test task: compiles the extension first, then runs test/*_test.rb.
Rake::TestTask.new do |test_task|
  test_task.deps << :compile
  test_task.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
  test_task.libs << "lib"
  test_task.libs << "test"
end

# Release: run the tests, build the gem into pkg/, then push it.
task :release do
  sh "bundle exec rake test"
  sh "mkdir -p pkg"
  sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
  sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
end
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Parquet
  # Schema definition for Parquet files.
  #
  # `Schema.define` evaluates a DSL block and returns a plain nested Hash
  # describing the schema; no validation against a file happens here.
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with `instance_eval` on a fresh SchemaBuilder, so
    # `field` calls inside it need no explicit receiver.
    #
    # @yield DSL block containing `field` declarations
    # @return [Hash] `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string                # Default nullable: true
    #
    #     # Decimal field with precision and scale
    #     field :price, :decimal, precision: 10, scale: 2
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #                      key: :string,
    #                      value: :string,
    #                      value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard, instance_eval(&nil) raises a confusing
      # "wrong number of arguments" ArgumentError.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods.
    class SchemaBuilder
      # Types for which `field` applies the timezone handling.
      TIMESTAMP_TYPES = %i[timestamp_second timestamp_millis timestamp_micros timestamp_nanos].freeze

      # Defaults applied when a decimal omits `precision:` / `scale:`.
      DEFAULT_DECIMAL_PRECISION = 38
      DEFAULT_DECIMAL_SCALE = 0

      # @return [Array<Hash>] field definitions collected so far
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema.
      #
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @return [Array<Hash>] the builder's accumulated field list
      # @raise [ArgumentError] if a list lacks `item:`, a map lacks `key:`/`value:`,
      #   a map is given a truthy `key_nullable:`, or a struct item/value has no fields
      #
      # Additional keyword args:
      #   - `item:` if type == :list
      #   - `item_nullable:` controls nullability of list items (default: true)
      #   - `key:, value:` if type == :map
      #   - `value_nullable:` controls nullability of map values (default: true)
      #   - `key_nullable:` map keys are always non-nullable; a truthy value raises
      #   - `format:` if you want to store some format string
      #   - `precision:, scale:` if type (or list item / map value type) is :decimal
      #     (precision defaults to 38, scale to 0)
      #   - `has_timezone:` if type is timestamp - true means UTC storage (default),
      #     false means local/unzoned
      #   - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Possibly store a format if provided
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        if TIMESTAMP_TYPES.include?(type)
          # Only an explicit falsy `has_timezone:` selects local/unzoned storage.
          # `has_timezone: true`, the legacy `timezone:` option (whatever its
          # value), and the default all record "UTC".
          unzoned = kwargs.key?(:has_timezone) && !kwargs[:has_timezone]
          field_hash[:timezone] = "UTC" unless unzoned
        end

        case type
        when :struct
          # Subfields come from the block; without a block the list is empty.
          sub_builder = SchemaBuilder.new
          sub_builder.instance_eval(&block) if block
          field_hash[:fields] = sub_builder.fields
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          # precision/scale are only used by wrap_subtype for decimal items.
          field_hash[:item] = wrap_subtype(
            item_type,
            nullable: nullable_flag(kwargs[:item_nullable]),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?

          # Map keys are required by the Parquet spec. Reject an explicit nullable
          # key at this boundary rather than letting it fail deep in the writer.
          if kwargs[:key_nullable]
            raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
          end

          field_hash[:key] = wrap_subtype(key_type, nullable: false)

          # precision/scale are only used by wrap_subtype for decimal values.
          field_hash[:value] = wrap_subtype(
            value_type,
            nullable: nullable_flag(kwargs[:value_nullable]),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :decimal
          field_hash.merge!(decimal_options(kwargs[:precision], kwargs[:scale]))
        end
        # Any other type is treated as a primitive and needs nothing extra.

        @fields << field_hash
      end

      # Build a standalone map description as a list-style `key_value` struct.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] nullability recorded for the key (default: false)
      # @param value_nullable [Boolean] nullability recorded for the value (default: true)
      # @param nullable [Boolean] nullability recorded for the map itself (default: true)
      # @yield optional block describing a :struct or :list value type
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... } }`
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        # Wrap the key type (maps typically use non-nullable keys)
        key = wrap_subtype(key_type, nullable: key_nullable)

        # The block is only forwarded when the value is a complex type
        # (:struct or :list); otherwise it is ignored.
        value =
          if (value_type == :struct || value_type == :list) && block
            wrap_subtype(value_type, nullable: value_nullable, &block)
          else
            wrap_subtype(value_type, nullable: value_nullable)
          end

        # Map is represented as a list of key/value pairs in Parquet
        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Normalise an optional nullability flag: nil means "not specified" and
      # defaults to true; anything else is coerced to a strict boolean.
      def nullable_flag(value)
        value.nil? ? true : !!value
      end

      # Resolve decimal precision/scale, filling in defaults for omitted values:
      # precision falls back to 38 and scale to 0.
      #
      # @return [Hash] `{ precision:, scale: }`
      def decimal_options(precision, scale)
        {
          precision: precision.nil? ? DEFAULT_DECIMAL_PRECISION : precision,
          scale: scale.nil? ? DEFAULT_DECIMAL_SCALE : scale
        }
      end

      # Describe the element type of a list item or map key/value. Every
      # result is named "item".
      #
      # For `field "something", :list, item: :struct do ... end` the sub-struct
      # is parsed recursively from the block. A :list subtype with a block
      # describes a nested list, whose block must declare exactly one field
      # named "item".
      #
      # @raise [ArgumentError] for a struct with no fields, or a nested list
      #   block that does not define exactly one field named 'item'
      def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
        if t == :struct
          sub_builder = SchemaBuilder.new
          sub_builder.instance_eval(&block) if block

          # Validate that the struct has at least one field
          if sub_builder.fields.empty?
            raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
          end

          { type: :struct, nullable: nullable, name: "item", fields: sub_builder.fields }
        elsif t == :list && block
          # Handle nested lists by processing the block to define the item type
          sub_builder = SchemaBuilder.new
          sub_builder.instance_eval(&block)

          # We expect a single field named "item" that defines the inner list's item type
          inner = sub_builder.fields
          if inner.length != 1 || inner.first[:name] != "item"
            raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
          end

          { type: :list, nullable: nullable, name: "item", item: inner.first }
        elsif t == :decimal
          # Same defaulting rules as a top-level decimal field.
          { type: t, nullable: nullable, name: "item" }.merge(decimal_options(precision, scale))
        else
          # e.g. :int32 => { type: :int32, nullable: true, name: "item" }
          { type: t, nullable: nullable, name: "item" }
        end
      end
    end
  end
end
|
data/lib/parquet.rb
ADDED
data/lib/parquet.rbi
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# typed: true
|
|
2
|
+
|
|
3
|
+
module Parquet
|
|
4
|
+
# Returns metadata information about a Parquet file
|
|
5
|
+
#
|
|
6
|
+
# The returned hash contains information about:
|
|
7
|
+
# - Basic file metadata (num_rows, created_by)
|
|
8
|
+
# - Schema information (fields, types, etc.)
|
|
9
|
+
# - Row group details
|
|
10
|
+
# - Column chunk information (compression, encodings, statistics)
|
|
11
|
+
sig { params(path: String).returns(T::Hash[String, T.untyped]) }
|
|
12
|
+
def self.metadata(path)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# Options:
|
|
16
|
+
# - `input`: String, File, StringIO, or IO object containing parquet data
|
|
17
|
+
# - `result_type`: String or Symbol specifying the output format
|
|
18
|
+
# ("hash" or "array" or :hash or :array)
|
|
19
|
+
# - `columns`: When present, only the specified columns will be included in the output.
|
|
20
|
+
# This is useful for reducing how much data is read and improving performance.
|
|
21
|
+
# - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
|
|
22
|
+
# (struct field names and top-level column names) are always interned and
|
|
23
|
+
# reused regardless of this setting.
|
|
24
|
+
# - `:copy` allocates a fresh mutable String per value.
|
|
25
|
+
# - `:intern` deduplicates low-cardinality equal values into frozen interned
|
|
26
|
+
# Strings up to a bounded per-read cache, then falls back to frozen copies.
|
|
27
|
+
# A transient copy still happens per value, so it is not a per-value speedup.
|
|
28
|
+
# - `:shared` returns frozen, zero-copy strings backed by Rust memory for
|
|
29
|
+
# short, repeated, low-cardinality values. Each read returns at most the
|
|
30
|
+
# configured number of shared values and only values up to the configured
|
|
31
|
+
# byte size; values past those bounds become frozen copies. New process-wide
|
|
32
|
+
# leaks are also capped by the requested budget and hard process ceilings.
|
|
33
|
+
# All `:shared` results are frozen. Not recommended for high-cardinality or
|
|
34
|
+
# large-blob string columns.
|
|
35
|
+
# Pass a hash to set the `:shared` leak budget, e.g.
|
|
36
|
+
# `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
|
|
37
|
+
sig do
|
|
38
|
+
params(
|
|
39
|
+
input: T.any(String, File, StringIO, IO),
|
|
40
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
41
|
+
columns: T.nilable(T::Array[String]),
|
|
42
|
+
strict: T.nilable(T::Boolean),
|
|
43
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
|
|
44
|
+
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
45
|
+
end
|
|
46
|
+
sig do
|
|
47
|
+
params(
|
|
48
|
+
input: T.any(String, File, StringIO, IO),
|
|
49
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
50
|
+
columns: T.nilable(T::Array[String]),
|
|
51
|
+
strict: T.nilable(T::Boolean),
|
|
52
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
|
|
53
|
+
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
|
|
54
|
+
).returns(NilClass)
|
|
55
|
+
end
|
|
56
|
+
def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Options:
|
|
60
|
+
# - `input`: String, File, StringIO, or IO object containing parquet data
|
|
61
|
+
# - `result_type`: String or Symbol specifying the output format
|
|
62
|
+
# ("hash" or "array" or :hash or :array)
|
|
63
|
+
# - `columns`: When present, only the specified columns will be included in the output.
|
|
64
|
+
# - `batch_size`: When present, specifies the number of rows per batch
|
|
65
|
+
# - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
|
|
66
|
+
# or `:shared`). See `each_row` for the semantics of each mode.
|
|
67
|
+
sig do
|
|
68
|
+
params(
|
|
69
|
+
input: T.any(String, File, StringIO, IO),
|
|
70
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
71
|
+
columns: T.nilable(T::Array[String]),
|
|
72
|
+
batch_size: T.nilable(Integer),
|
|
73
|
+
strict: T.nilable(T::Boolean),
|
|
74
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
|
|
75
|
+
).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
|
|
76
|
+
end
|
|
77
|
+
sig do
|
|
78
|
+
params(
|
|
79
|
+
input: T.any(String, File, StringIO, IO),
|
|
80
|
+
result_type: T.nilable(T.any(String, Symbol)),
|
|
81
|
+
columns: T.nilable(T::Array[String]),
|
|
82
|
+
batch_size: T.nilable(Integer),
|
|
83
|
+
strict: T.nilable(T::Boolean),
|
|
84
|
+
string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
|
|
85
|
+
blk:
|
|
86
|
+
T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
|
|
87
|
+
).returns(NilClass)
|
|
88
|
+
end
|
|
89
|
+
def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
# Options:
|
|
93
|
+
# - `read_from`: An Enumerator yielding arrays of values representing each row
|
|
94
|
+
# - `schema`: Array of hashes specifying column names and types. Supported types:
|
|
95
|
+
# - `int8`, `int16`, `int32`, `int64`
|
|
96
|
+
# - `uint8`, `uint16`, `uint32`, `uint64`
|
|
97
|
+
# - `float`, `double`
|
|
98
|
+
# - `string`
|
|
99
|
+
# - `binary`
|
|
100
|
+
# - `boolean`
|
|
101
|
+
# - `date32`
|
|
102
|
+
# - `timestamp_millis`, `timestamp_micros`
|
|
103
|
+
# - `write_to`: String path or IO object to write the parquet file to
|
|
104
|
+
# - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
|
|
105
|
+
# for one-column schemas; wide schemas may have a lower safety cap). Enumerator
|
|
106
|
+
# inputs are consumed in slices of this many rows, never materialized in full.
|
|
107
|
+
# - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
|
|
108
|
+
# destination; bounds both the raw bytes staged since the last flush
|
|
109
|
+
# and the writer's encoded in-progress buffer (defaults to 100MB)
|
|
110
|
+
# - `compression`: Optional compression type to use (defaults to "zstd")
|
|
111
|
+
# Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
|
|
112
|
+
# - `sample_size`: Optional positive number of rows to sample for size estimation
|
|
113
|
+
# (defaults to 100, at most 10_000)
|
|
114
|
+
# - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
|
|
115
|
+
# disables it, `true` enables it with a default capacity, and an Integer
|
|
116
|
+
# enables it with that many retained distinct strings (at most 65_536).
|
|
117
|
+
# Retention also skips values larger than 4KB and stops after 16MB of
|
|
118
|
+
# cached string content.
|
|
119
|
+
sig do
|
|
120
|
+
params(
|
|
121
|
+
read_from: T::Enumerator[T::Array[T.untyped]],
|
|
122
|
+
schema: T::Array[T::Hash[String, String]],
|
|
123
|
+
write_to: T.any(String, IO),
|
|
124
|
+
batch_size: T.nilable(Integer),
|
|
125
|
+
flush_threshold: T.nilable(Integer),
|
|
126
|
+
compression: T.nilable(String),
|
|
127
|
+
sample_size: T.nilable(Integer),
|
|
128
|
+
string_cache: T.nilable(T.any(T::Boolean, Integer))
|
|
129
|
+
).void
|
|
130
|
+
end
|
|
131
|
+
def self.write_rows(
|
|
132
|
+
read_from,
|
|
133
|
+
schema:,
|
|
134
|
+
write_to:,
|
|
135
|
+
batch_size: nil,
|
|
136
|
+
flush_threshold: nil,
|
|
137
|
+
compression: nil,
|
|
138
|
+
sample_size: nil,
|
|
139
|
+
string_cache: nil
|
|
140
|
+
)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
# Options:
|
|
144
|
+
# - `read_from`: An Enumerator yielding arrays of column batches
|
|
145
|
+
# - `schema`: Array of hashes specifying column names and types. Supported types:
|
|
146
|
+
# - `int8`, `int16`, `int32`, `int64`
|
|
147
|
+
# - `uint8`, `uint16`, `uint32`, `uint64`
|
|
148
|
+
# - `float`, `double`
|
|
149
|
+
# - `string`
|
|
150
|
+
# - `binary`
|
|
151
|
+
# - `boolean`
|
|
152
|
+
# - `date32`
|
|
153
|
+
# - `timestamp_millis`, `timestamp_micros`
|
|
154
|
+
# - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
|
|
155
|
+
# - `write_to`: String path or IO object to write the parquet file to
|
|
156
|
+
# - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
|
|
157
|
+
# destination; bounds both the raw bytes staged since the last flush
|
|
158
|
+
# and the writer's encoded in-progress buffer (defaults to 100MB)
|
|
159
|
+
# - `compression`: Optional compression type to use (defaults to "zstd")
|
|
160
|
+
# Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
|
|
161
|
+
# - `logger`: Optional Ruby logger for column-write progress messages
|
|
162
|
+
sig do
|
|
163
|
+
params(
|
|
164
|
+
read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
|
|
165
|
+
schema: T::Array[T::Hash[String, String]],
|
|
166
|
+
write_to: T.any(String, IO),
|
|
167
|
+
flush_threshold: T.nilable(Integer),
|
|
168
|
+
compression: T.nilable(String),
|
|
169
|
+
logger: T.nilable(T.untyped)
|
|
170
|
+
).void
|
|
171
|
+
end
|
|
172
|
+
def self.write_columns(
|
|
173
|
+
read_from,
|
|
174
|
+
schema:,
|
|
175
|
+
write_to:,
|
|
176
|
+
flush_threshold: nil,
|
|
177
|
+
compression: nil,
|
|
178
|
+
logger: nil
|
|
179
|
+
)
|
|
180
|
+
end
|
|
181
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: parquet-tyfoom
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.8.0
|
|
5
|
+
platform: arm64-darwin
|
|
6
|
+
authors:
|
|
7
|
+
- Nathan Jaremko
|
|
8
|
+
- Cameron McCord
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2026-07-02 00:00:00.000000000 Z
|
|
13
|
+
dependencies:
|
|
14
|
+
- !ruby/object:Gem::Dependency
|
|
15
|
+
name: bigdecimal
|
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
|
17
|
+
requirements:
|
|
18
|
+
- - ">="
|
|
19
|
+
- !ruby/object:Gem::Version
|
|
20
|
+
version: '0'
|
|
21
|
+
type: :runtime
|
|
22
|
+
prerelease: false
|
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - ">="
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
version: '0'
|
|
28
|
+
- !ruby/object:Gem::Dependency
|
|
29
|
+
name: rake-compiler
|
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
|
31
|
+
requirements:
|
|
32
|
+
- - "~>"
|
|
33
|
+
- !ruby/object:Gem::Version
|
|
34
|
+
version: 1.2.0
|
|
35
|
+
type: :development
|
|
36
|
+
prerelease: false
|
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
38
|
+
requirements:
|
|
39
|
+
- - "~>"
|
|
40
|
+
- !ruby/object:Gem::Version
|
|
41
|
+
version: 1.2.0
|
|
42
|
+
description: |2
|
|
43
|
+
Tyfoom's fork of the `parquet` gem (github.com/njaremko/parquet-ruby), published while the
|
|
44
|
+
incremental streaming-write fix is pending upstream. It wraps the official Apache Rust
|
|
45
|
+
implementation and bounds write memory by streaming row groups to disk instead of buffering the
|
|
46
|
+
whole file. Drop-in compatible with the upstream gem: the library is still required as
|
|
47
|
+
`require "parquet"` and exposes the same `Parquet` API.
|
|
48
|
+
email:
|
|
49
|
+
- nathan@jaremko.ca
|
|
50
|
+
- cameron.mccord@tyfoom.com
|
|
51
|
+
executables: []
|
|
52
|
+
extensions: []
|
|
53
|
+
extra_rdoc_files: []
|
|
54
|
+
files:
|
|
55
|
+
- Gemfile
|
|
56
|
+
- LICENSE
|
|
57
|
+
- README.md
|
|
58
|
+
- Rakefile
|
|
59
|
+
- lib/parquet.rb
|
|
60
|
+
- lib/parquet.rbi
|
|
61
|
+
- lib/parquet/3.2/parquet.bundle
|
|
62
|
+
- lib/parquet/3.3/parquet.bundle
|
|
63
|
+
- lib/parquet/3.4/parquet.bundle
|
|
64
|
+
- lib/parquet/4.0/parquet.bundle
|
|
65
|
+
- lib/parquet/schema.rb
|
|
66
|
+
- lib/parquet/version.rb
|
|
67
|
+
homepage: https://github.com/cameronmccord2/parquet-ruby
|
|
68
|
+
licenses:
|
|
69
|
+
- MIT
|
|
70
|
+
metadata:
|
|
71
|
+
homepage_uri: https://github.com/cameronmccord2/parquet-ruby
|
|
72
|
+
source_code_uri: https://github.com/cameronmccord2/parquet-ruby
|
|
73
|
+
readme_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/README.md
|
|
74
|
+
changelog_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/CHANGELOG.md
|
|
75
|
+
documentation_uri: https://www.rubydoc.info/gems/parquet-tyfoom
|
|
76
|
+
funding_uri: https://github.com/sponsors/njaremko
|
|
77
|
+
allowed_push_host: https://rubygems.org
|
|
78
|
+
post_install_message:
|
|
79
|
+
rdoc_options: []
|
|
80
|
+
require_paths:
|
|
81
|
+
- lib
|
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
83
|
+
requirements:
|
|
84
|
+
- - ">="
|
|
85
|
+
- !ruby/object:Gem::Version
|
|
86
|
+
version: '3.2'
|
|
87
|
+
- - "<"
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: 4.1.dev
|
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
91
|
+
requirements:
|
|
92
|
+
- - ">="
|
|
93
|
+
- !ruby/object:Gem::Version
|
|
94
|
+
version: '0'
|
|
95
|
+
requirements: []
|
|
96
|
+
rubygems_version: 3.5.23
|
|
97
|
+
signing_key:
|
|
98
|
+
specification_version: 4
|
|
99
|
+
summary: Tyfoom fork of the parquet gem (Rust), with incremental streaming writes
|
|
100
|
+
test_files: []
|