parquet 0.7.0-x86_64-linux-musl → 0.7.3-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +264 -475
- data/lib/parquet/3.2/parquet.so +0 -0
- data/lib/parquet/3.3/parquet.so +0 -0
- data/lib/parquet/3.4/parquet.so +0 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bd5affcc32a0e9da20f7748f8032596e11c89de1d12c2b7cb068f4eaf494dfb2
+  data.tar.gz: f95f2a908908526f30c375350d6ccbb7b4132d9fa96ca1ab638bfb0943b00063
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f563161d350cef2669c05e546e166da4c5ee605a58d773f9513945c330df270cebd638fc6156d734ca96fe17caad934b30b05d8b4420a6c7361866af5504c805
+  data.tar.gz: 2ea749210e8be31fba1ec9b57ca6b39f5ac1049c3189f46b2ce956c2d00c6a245ddeb971e5967fed6f4347effe03e3dd6a98317ddbfcee3f126d3059a6d22c28
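To check these published digests locally, a minimal sketch (assuming the standard RubyGems CLI and GNU coreutils; the fetched filename is illustrative):

```bash
# Fetch the exact platform gem; a .gem file is a plain tar archive
# containing metadata.gz, data.tar.gz, and checksums.yaml.gz
gem fetch parquet --version 0.7.3 --platform x86_64-linux-musl

# Extract the data archive and hash it; the output should match the
# SHA256 data.tar.gz value recorded above
tar -xf parquet-0.7.3-x86_64-linux-musl.gem data.tar.gz
sha256sum data.tar.gz
```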
data/README.md
CHANGED
@@ -2,616 +2,405 @@
 
 [![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
 
-
+Read and write [Apache Parquet](https://parquet.apache.org/) files from Ruby. This gem wraps the official Apache [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) rust crate, providing:
 
-
+- **High performance** columnar data storage and retrieval
+- **Memory-efficient** streaming APIs for large datasets
+- **Full compatibility** with the Apache Parquet specification
+- **Simple, Ruby-native** APIs that feel natural
 
-
+## Why Use This Library?
 
-
+Apache Parquet is the de facto standard for analytical data storage, offering:
+- **Efficient compression** - typically 2-10x smaller than CSV
+- **Fast columnar access** - read only the columns you need
+- **Rich type system** - preserves data types, including nested structures
+- **Wide ecosystem support** - works with Spark, Pandas, DuckDB, and more
 
-
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'parquet'
+```
+
+Then execute:
+
+```bash
+$ bundle install
+```
+
+Or install it directly:
+
+```bash
+$ gem install parquet
+```
+
+## Quick Start
+
+### Reading Data
 
 ```ruby
 require "parquet"
 
-#
-
+# Read Parquet files row by row
+Parquet.each_row("data.parquet") do |row|
+  puts row # => {"id" => 1, "name" => "Alice", "score" => 95.5}
+end
 
-# Or
-
-
+# Or column by column for better performance
+Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
+  puts batch # => {"id" => [1, 2, ...], "name" => ["Alice", "Bob", ...]}
 end
+```
 
-
-
-
-#
-
-
-
-
-
-
-#
-
-
-
-
-
-
-#       "repetition" => "OPTIONAL",
-#       "converted_type" => "NONE"
-#     },
-#     # ... other fields
-#   ]
-# },
-# "row_groups" => [
-#   {
-#     "num_columns" => 5,
-#     "num_rows" => 3,
-#     "total_byte_size" => 379,
-#     "columns" => [
-#       {
-#         "column_path" => "id",
-#         "num_values" => 3,
-#         "compression" => "UNCOMPRESSED",
-#         "total_compressed_size" => 91,
-#         "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
-#         "statistics" => {
-#           "min_is_exact" => true,
-#           "max_is_exact" => true
-#         }
-#       },
-#       # ... other columns
-#     ]
-#   }
-# ]
-# }
+### Writing Data
+
+```ruby
+# Define your schema
+schema = [
+  { "id" => "int64" },
+  { "name" => "string" },
+  { "score" => "double" }
+]
+
+# Write row by row
+rows = [
+  [1, "Alice", 95.5],
+  [2, "Bob", 82.3]
+]
+
+Parquet.write_rows(rows.each, schema: schema, write_to: "output.parquet")
 ```
 
-
-- Total number of rows
-- File creation information
-- Key-value metadata (including Arrow schema)
-- Detailed schema information for each column
-- Row group information including:
-  - Number of columns and rows
-  - Total byte size
-  - Column-level details (compression, encodings, statistics)
+## Reading Parquet Files
 
-
+The library provides two APIs for reading data, each optimized for different use cases:
 
-
+### Row-wise Reading (Sequential Access)
 
-
-require "parquet"
+Best for: Processing records one at a time, data transformations, ETL pipelines
 
-
+```ruby
+# Basic usage - returns hashes
 Parquet.each_row("data.parquet") do |row|
-  puts row
+  puts row # => {"id" => 1, "name" => "Alice"}
 end
 
-#
+# Memory-efficient array format
 Parquet.each_row("data.parquet", result_type: :array) do |row|
-  puts row
+  puts row # => [1, "Alice"]
 end
 
-#
+# Read specific columns only
 Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
-
+  # Only requested columns are loaded from disk
 end
 
-#
+# Works with IO objects
 File.open("data.parquet", "rb") do |file|
   Parquet.each_row(file) do |row|
-
+    # Process row
   end
 end
 ```
 
-### Column-wise
+### Column-wise Reading (Analytical Access)
 
-
+Best for: Analytics, aggregations, when you need few columns from wide tables
 
 ```ruby
-
-
-#
-
-#
-
-#
-
-#   "name" => ["name_1", "name_2", ..., "name_1024"]
-# }
+# Process data in column batches
+Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
+  # batch is a hash of column_name => array_of_values
+  ids = batch["id"]     # => [1, 2, 3, ..., 1000]
+  names = batch["name"] # => ["Alice", "Bob", ...]
+
+  # Perform columnar operations
+  avg_id = ids.sum.to_f / ids.length
 end
 
-# Array
+# Array format for more control
 Parquet.each_column("data.parquet",
-  columns: ["id", "name"],
   result_type: :array,
-
-
-# [
-#   [1, 2, ..., 1024],        # id column
-#   ["name_1", "name_2", ...] # name column
-# ]
+  columns: ["id", "name"]) do |batch|
+  # batch is an array of arrays
+  # [[1, 2, ...], ["Alice", "Bob", ...]]
 end
 ```
 
-###
+### File Metadata
 
-
+Inspect file structure without reading data:
 
-
-
-- `columns`: Optional array of column names to read (improves performance)
-
-Additional arguments for `each_column`:
+```ruby
+metadata = Parquet.metadata("data.parquet")
 
-
+puts metadata["num_rows"]         # Total row count
+puts metadata["created_by"]       # Writer identification
+puts metadata["schema"]["fields"] # Column definitions
+puts metadata["row_groups"].size  # Number of row groups
+```
 
-
+## Writing Parquet Files
 
-###
+### Row-wise Writing
 
-
+Best for: Streaming data, converting from other formats, memory-constrained environments
 
 ```ruby
-
-
-# Define the schema for your data
+# Basic schema definition
 schema = [
   { "id" => "int64" },
   { "name" => "string" },
-  { "
+  { "active" => "boolean" },
+  { "balance" => "double" }
 ]
 
-#
-rows =
-  [1, "
-  [2, "Bob", 82.3],
-  [3, "Charlie", 88.7]
-].each
-
-# Write to a file
-Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
-
-# Write to an IO object
-File.open("data.parquet", "wb") do |file|
-  Parquet.write_rows(rows, schema: schema, write_to: file)
+# Stream data from any enumerable
+rows = CSV.foreach("input.csv").map do |row|
+  [row[0].to_i, row[1], row[2] == "true", row[3].to_f]
 end
 
-# Optionally specify batch size (default is 1000)
-Parquet.write_rows(rows,
-  schema: schema,
-  write_to: "data.parquet",
-  batch_size: 500
-)
-
-# Optionally specify memory threshold for flushing (default is 64MB)
-Parquet.write_rows(rows,
-  schema: schema,
-  write_to: "data.parquet",
-  flush_threshold: 32 * 1024 * 1024 # 32MB
-)
-
-# Optionally specify sample size for row size estimation (default is 100)
 Parquet.write_rows(rows,
   schema: schema,
-  write_to: "
-
+  write_to: "output.parquet",
+  batch_size: 5000 # Rows per batch (default: 1000)
 )
 ```
 
-###
+### Column-wise Writing
 
-
+Best for: Pre-columnar data, better compression, higher performance
 
 ```ruby
-
+# Prepare columnar data
+ids = [1, 2, 3, 4, 5]
+names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]
+scores = [95.5, 82.3, 88.7, 91.2, 79.8]
+
+# Create batches
+batches = [[
+  ids,    # First column
+  names,  # Second column
+  scores  # Third column
+]]
 
-# Define the schema
 schema = [
   { "id" => "int64" },
   { "name" => "string" },
   { "score" => "double" }
 ]
 
-
-batches = [
-  # First batch
-  [
-    [1, 2],           # id column
-    ["Alice", "Bob"], # name column
-    [95.5, 82.3]      # score column
-  ],
-  # Second batch
-  [
-    [3],         # id column
-    ["Charlie"], # name column
-    [88.7]       # score column
-  ]
-]
-
-# Create an enumerator from the batches
-columns = batches.each
-
-# Write to a parquet file with default ZSTD compression
-Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
-
-# Write to a parquet file with specific compression and memory threshold
-Parquet.write_columns(columns,
+Parquet.write_columns(batches.each,
   schema: schema,
-  write_to: "
-  compression: "snappy"
-  flush_threshold: 32 * 1024 * 1024 # 32MB
+  write_to: "output.parquet",
+  compression: "snappy" # Options: none, snappy, gzip, lz4, zstd
 )
-
-# Write to an IO object
-File.open("data.parquet", "wb") do |file|
-  Parquet.write_columns(columns, schema: schema, write_to: file)
-end
 ```
 
-
-
-
-- `uint8`, `uint16`, `uint32`, `uint64`
-- `float`, `double`
-- `string`
-- `binary`
-- `boolean`
-- `date32`
-- `timestamp_millis`, `timestamp_micros`, `timestamp_second`, `timestamp_nanos`
-- `time_millis`, `time_micros`
-
-### Timestamp Timezone Handling
-
-**CRITICAL PARQUET SPECIFICATION LIMITATION**: The Apache Parquet format specification only supports two types of timestamps:
-1. **UTC-normalized timestamps** (when ANY timezone is specified) - `isAdjustedToUTC = true`
-2. **Local/unzoned timestamps** (when NO timezone is specified) - `isAdjustedToUTC = false`
-
-This means that specific timezone offsets like "+09:00" or "America/New_York" CANNOT be preserved in Parquet files. This is not a limitation of this Ruby library, but of the Parquet format itself.
-
-**When Writing:**
-- If the schema specifies ANY timezone (whether it's "UTC", "+09:00", "America/New_York", etc.):
-  - Time values are converted to UTC before storing
-  - The file metadata sets `isAdjustedToUTC = true`
-  - The original timezone information is LOST
-- If the schema doesn't specify a timezone:
-  - Timestamps are stored as local/unzoned time (no conversion)
-  - The file metadata sets `isAdjustedToUTC = false`
-  - These represent "wall clock" times without timezone context
-
-**When Reading:**
-- If the Parquet file has `isAdjustedToUTC = true` (ANY timezone was specified during writing):
-  - Time objects are returned in UTC
-  - The original timezone (e.g., "+09:00") is NOT recoverable
-- If the file has `isAdjustedToUTC = false` (NO timezone was specified):
-  - Time objects are returned as local time in your system's timezone
-  - These are "wall clock" times without timezone information
+## Data Types
+
+### Basic Types
 
 ```ruby
-
-
-
-
-
-
+schema = [
+  # Integers
+  { "tiny" => "int8" },    # -128 to 127
+  { "small" => "int16" },  # -32,768 to 32,767
+  { "medium" => "int32" }, # ±2 billion
+  { "large" => "int64" },  # ±9 quintillion
+
+  # Unsigned integers
+  { "ubyte" => "uint8" },   # 0 to 255
+  { "ushort" => "uint16" }, # 0 to 65,535
+  { "uint" => "uint32" },   # 0 to 4 billion
+  { "ulong" => "uint64" },  # 0 to 18 quintillion
+
+  # Floating point
+  { "price" => "float" },   # 32-bit precision
+  { "amount" => "double" }, # 64-bit precision
+
+  # Other basics
+  { "name" => "string" },
+  { "data" => "binary" },
+  { "active" => "boolean" }
+]
+```
 
-
-schema_legacy = Parquet::Schema.define do
-  field :timestamp_utc, :timestamp_millis, timezone: "UTC" # Stored as UTC
-  field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00" # Also stored as UTC!
-  field :timestamp_local, :timestamp_millis # No timezone - local
-end
+### Date and Time Types
 
-
-
-
-
-
-
-
+```ruby
+schema = [
+  # Date (days since Unix epoch)
+  { "date" => "date32" },
+
+  # Timestamps (with different precisions)
+  { "created_sec" => "timestamp_second" },
+  { "created_ms" => "timestamp_millis" }, # Most common
+  { "created_us" => "timestamp_micros" },
+  { "created_ns" => "timestamp_nanos" },
+
+  # Time of day (without date)
+  { "time_ms" => "time_millis" }, # Milliseconds since midnight
+  { "time_us" => "time_micros" }  # Microseconds since midnight
 ]
+```
 
-
+### Decimal Type (Financial Data)
 
-
-Parquet.each_row("timestamps.parquet") do |row|
-  # row["timestamp_utc"] => Time object in UTC
-  # row["timestamp_local"] => Time object in local timezone
-  # row["timestamp_default"] => Time object in local timezone
-end
+For exact decimal arithmetic (no floating-point errors):
 
-
-
-  field :timestamp, :timestamp_millis, has_timezone: true # Store as UTC
-  field :original_timezone, :string # Store timezone as string
-end
-```
-
-## Architecture
+```ruby
+require "bigdecimal"
 
-
+schema = [
+  # Financial amounts with 2 decimal places
+  { "price" => "decimal", "precision" => 10, "scale" => 2 },   # Up to 99,999,999.99
+  { "balance" => "decimal", "precision" => 15, "scale" => 2 }, # Larger amounts
 
-
-
-
-- Efficient Arrow-based reader and writer implementations
-
-- **parquet-ruby-adapter**: Ruby-specific adapter layer
-  - Implements core traits for Ruby integration
-  - Handles Ruby value conversion through the `ValueConverter` trait
-  - Manages Ruby I/O objects through the `ChunkReader` trait
+  # High-precision calculations
+  { "rate" => "decimal", "precision" => 10, "scale" => 8 } # 8 decimal places
+]
 
-
-
-
-
+# Use BigDecimal for exact values
+data = [[
+  BigDecimal("19.99"),
+  BigDecimal("1234567.89"),
+  BigDecimal("0.00000123")
+]]
+```
 
-
-- Clear separation of concerns between core functionality and language bindings
-- Easy testing of core logic without Ruby dependencies
-- Potential reuse of core functionality for other language bindings
-- Type-safe interfaces through Rust's trait system
+## Complex Data Structures
 
-
+The library includes a powerful Schema DSL for defining nested data:
 
-
+### Using the Schema DSL
 
 ```ruby
-require "parquet"
-
-# Define a complex schema using the Schema DSL
 schema = Parquet::Schema.define do
-
-  field :
+  # Simple fields
+  field :id, :int64, nullable: false # Required field
+  field :name, :string # Optional by default
 
-  # Nested
+  # Nested structure
   field :address, :struct do
     field :street, :string
     field :city, :string
-    field :
-
-    field :
-    field :longitude, :double
+    field :location, :struct do
+      field :lat, :double
+      field :lng, :double
     end
   end
 
-  #
-  field :
+  # Lists
+  field :tags, :list, item: :string
+  field :scores, :list, item: :int32
+
+  # Maps (dictionaries)
+  field :metadata, :map, key: :string, value: :string
 
-  #
+  # Complex combinations
   field :contacts, :list, item: :struct do
     field :name, :string
-    field :
+    field :email, :string
     field :primary, :boolean
   end
-
-  # Map with string values
-  field :metadata, :map, key: :string, value: :string
-
-  # Map with struct values
-  field :properties, :map, key: :string, value: :struct do
-    field :count, :int32
-    field :description, :string
-  end
-
-  # Nested lists (list of lists of strings)
-  field :nested_lists, :list, item: :list do
-    field :item, :string # REQUIRED: Inner item field MUST be named 'item' for nested lists
-  end
-
-  # Map of lists
-  field :map_of_lists, :map, key: :string, value: :list do
-    field :item, :int32 # REQUIRED: List items in maps MUST be named 'item'
-  end
-end
-
-### Nested Lists
-
-When working with nested lists (a list of lists), there are specific requirements:
-
-1. Using the Schema DSL:
-```ruby
-# A list of lists of strings
-field :nested_lists, :list, item: :list do
-  field :item, :string # For nested lists, inner item MUST be named 'item'
 end
 ```
 
-
-```ruby
-# A list of lists of integers
-{ "nested_numbers" => "list<list<int32>>" }
-```
+### Writing Complex Data
 
-The data for nested lists is structured as an array of arrays:
 ```ruby
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+data = [[
+  1,               # id
+  "Alice Johnson", # name
+  {                # address
+    "street" => "123 Main St",
+    "city" => "Springfield",
+    "location" => {
+      "lat" => 40.7128,
+      "lng" => -74.0060
+    }
+  },
+  ["ruby", "parquet", "data"], # tags
+  [85, 92, 88],                # scores
+  { "dept" => "Engineering" }, # metadata
+  [                            # contacts
+    { "name" => "Bob", "email" => "bob@example.com", "primary" => true },
+    { "name" => "Carol", "email" => "carol@example.com", "primary" => false }
+  ]
+]]
 
-
-
-  field :amount1, :decimal # Equivalent to INTEGER with 38 digits
+Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
+```
 
-
-  field :amount2, :decimal, precision: 10 # 10 digits, no decimal places
+## ⚠️ Important Limitations
 
-
-  field :amount3, :decimal, scale: 2 # 38 digits with 2 decimal places
+### Timezone Handling in Parquet
 
-
-  field :amount4, :decimal, precision: 10, scale: 2 # 10 digits with 2 decimal places
-```
+**Critical**: The Parquet specification has a fundamental limitation with timezone storage:
 
-
+1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
+2. **Local/unzoned**: Timestamps without timezone info are stored as-is
 
-
+**The original timezone information is permanently lost.** This is not a limitation of this library but of the Parquet format itself.
 
 ```ruby
-require "parquet"
-require "bigdecimal"
-
-# Schema for financial transactions
 schema = Parquet::Schema.define do
-
-  field :
-  field :
-  field :balance, :decimal, precision: 16, scale: 2 # Larger precision for running balances
-  field :currency, :string
-  field :exchange_rate, :decimal, precision: 10, scale: 6 # 6 decimal places for forex rates
-  field :fee, :decimal, precision: 8, scale: 2, nullable: true # Optional fee
-  field :category, :string
-end
+  # These BOTH store in UTC - timezone info is lost!
+  field :timestamp_utc, :timestamp_millis, timezone: "UTC"
+  field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00"
 
-#
-
-
-  "T-12345",
-  Time.now,
-  BigDecimal("1256.99"),  # amount (directly using BigDecimal)
-  BigDecimal("10250.25"), # balance
-  "USD",
-  BigDecimal("1.0"),      # exchange_rate
-  BigDecimal("2.50"),     # fee
-  "Groceries"
-],
-[
-  "T-12346",
-  Time.now - 86400,       # yesterday
-  BigDecimal("-89.50"),   # negative amount for withdrawal
-  BigDecimal("10160.75"), # updated balance
-  "USD",
-  BigDecimal("1.0"),      # exchange_rate
-  nil,                    # no fee
-  "Transportation"
-],
-[
-  "T-12347",
-  Time.now - 172800,      # two days ago
-  BigDecimal("250.00"),   # amount
-  BigDecimal("10410.75"), # balance
-  "EUR",                  # different currency
-  BigDecimal("1.05463"),  # exchange_rate
-  BigDecimal("1.75"),     # fee
-  "Entertainment"
-]
-]
+  # This stores as local time (no timezone)
+  field :timestamp_local, :timestamp_millis
+end
 
-#
-
-
-
-Parquet.each_row("financial_data.parquet") do |transaction|
-  # Access decimal fields as BigDecimal objects
-  puts "Transaction: #{transaction['transaction_id']}"
-  puts "  Amount: #{transaction['currency']} #{transaction['amount']}"
-  puts "  Balance: $#{transaction['balance']}"
-  puts "  Fee: #{transaction['fee'] || 'No fee'}"
-
-  # You can perform precise decimal calculations
-  if transaction['currency'] != 'USD'
-    usd_amount = transaction['amount'] * transaction['exchange_rate']
-    puts "  USD Equivalent: $#{usd_amount.round(2)}"
-  end
+# If you need timezone preservation, store it separately:
+schema = Parquet::Schema.define do
+  field :timestamp, :timestamp_millis, has_timezone: true # UTC storage
+  field :original_tz, :string # "America/New_York"
 end
 ```
 
-
+## Performance Tips
 
-
-
-
--
+1. **Use column-wise reading** when you need only a few columns from wide tables
+2. **Specify columns parameter** to avoid reading unnecessary data
+3. **Choose appropriate batch sizes**:
+   - Larger batches = better throughput but more memory
+   - Smaller batches = less memory but more overhead
+4. **Pre-sort data** by commonly filtered columns for better compression
 
-Choose appropriate precision and scale for your data to optimize storage while ensuring adequate range:
 
-
-# Banking examples
-field :account_balance, :decimal, precision: 16, scale: 2 # Up to 14 digits before decimal point
-field :interest_rate, :decimal, precision: 8, scale: 6 # Rate with 6 decimal places (e.g., 0.015625)
+## Memory Management
 
-
-field :product_price, :decimal, precision: 10, scale: 2 # Product price
-field :shipping_weight, :decimal, precision: 6, scale: 3 # Weight in kg with 3 decimal places
+Control memory usage with flush thresholds:
 
-
-
-
+```ruby
+Parquet.write_rows(huge_dataset.each,
+  schema: schema,
+  write_to: "output.parquet",
+  batch_size: 1000,             # Rows before considering flush
+  flush_threshold: 32 * 1024**2 # Flush if batch exceeds 32MB
+)
 ```
 
-
+## Architecture
 
-
+This gem uses a modular architecture:
 
-
-
-
-[
-  1,          # id
-  "John Doe", # name
-  {           # address (struct)
-    "street" => "123 Main St",
-    "city" => "Springfield",
-    "zip" => "12345",
-    "coordinates" => {
-      "latitude" => 37.7749,
-      "longitude" => -122.4194
-    }
-  },
-  [85.5, 92.0, 78.5], # scores (list of floats)
-  [ # contacts (list of structs)
-    { "name" => "Contact 1", "phone" => "555-1234", "primary" => true },
-    { "name" => "Contact 2", "phone" => "555-5678", "primary" => false }
-  ],
-  { "created" => "2023-01-01", "status" => "active" }, # metadata (map)
-  { # properties (map of structs)
-    "feature1" => { "count" => 5, "description" => "Main feature" },
-    "feature2" => { "count" => 3, "description" => "Secondary feature" }
-  },
-  [["a", "b"], ["c", "d", "e"]], # nested_lists (a list of lists of strings)
-  { # map_of_lists
-    "group1" => [1, 2, 3],
-    "group2" => [4, 5, 6]
-  }
-]
-]
+- **parquet-core**: Language-agnostic Rust core for Parquet operations
+- **parquet-ruby-adapter**: Ruby-specific FFI adapter layer
+- **parquet gem**: High-level Ruby API
 
-
-Parquet.write_rows(data.each, schema: schema, write_to: "complex_data.parquet")
+Take a look at [ARCH.md](./ARCH.md)
 
-
-Parquet.each_row("complex_data.parquet") do |row|
-  puts row.inspect
-end
-```
+## Contributing
 
-
+Bug reports and pull requests are welcome on GitHub at https://github.com/njaremko/parquet-ruby.
 
-
-- **Complex types**: Structs, lists, and maps with arbitrary nesting
-- **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
-- **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
-- **Map key/value nullability**: Control whether map keys or values can be null with `value_nullable: false/true`
+## License
 
-
-- For lists: The `item:` parameter specifying the item type
-- For maps: Both `key:` and `value:` parameters specifying key and value types
+The gem is available as open source under the terms of the MIT License.
data/lib/parquet/3.2/parquet.so
CHANGED
Binary file
data/lib/parquet/3.3/parquet.so
CHANGED
Binary file
data/lib/parquet/3.4/parquet.so
CHANGED
Binary file
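The three changed `parquet.so` files are per-ABI builds for Ruby 3.2, 3.3, and 3.4. Precompiled gems typically pick the matching binary at require time; a sketch of that common pattern (illustrative only, not this gem's actual loader):

```ruby
# Select the prebuilt extension matching the running Ruby's minor version,
# e.g. lib/parquet/3.3/parquet.so on Ruby 3.3.x
ruby_abi = RUBY_VERSION[/\A\d+\.\d+/]
begin
  require_relative "parquet/#{ruby_abi}/parquet"
rescue LoadError
  # Fall back to a generic build if no prebuilt binary matches
  require_relative "parquet/parquet"
end
```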
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.7.3
 platform: x86_64-linux-musl
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-
+date: 2025-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake-compiler