parquet 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +124 -72
- data/Gemfile +1 -0
- data/README.md +271 -475
- data/ext/parquet/Cargo.toml +6 -6
- data/ext/parquet/src/adapter_ffi.rs +169 -34
- data/ext/parquet-core/Cargo.toml +6 -5
- data/ext/parquet-core/src/arrow_conversion.rs +354 -287
- data/ext/parquet-core/src/error.rs +28 -2
- data/ext/parquet-core/src/lib.rs +1 -1
- data/ext/parquet-core/src/reader.rs +115 -13
- data/ext/parquet-core/src/schema.rs +171 -4
- data/ext/parquet-core/src/test_utils.rs +1 -1
- data/ext/parquet-core/src/traits/schema.rs +46 -7
- data/ext/parquet-core/src/value.rs +7 -3
- data/ext/parquet-core/src/writer.rs +410 -52
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +97 -36
- data/ext/parquet-core/tests/binary_data.rs +1 -1
- data/ext/parquet-core/tests/column_projection.rs +1 -1
- data/ext/parquet-core/tests/complex_types.rs +1 -1
- data/ext/parquet-core/tests/compression_tests.rs +1 -1
- data/ext/parquet-core/tests/concurrent_access.rs +10 -9
- data/ext/parquet-core/tests/decimal_tests.rs +7 -7
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +1 -1
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +18 -25
- data/ext/parquet-core/tests/null_handling_tests.rs +1 -1
- data/ext/parquet-core/tests/primitive_types.rs +1 -1
- data/ext/parquet-core/tests/real_world_patterns.rs +2 -2
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +1 -1
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +8 -0
- data/ext/parquet-core/tests/temporal_tests.rs +1 -1
- data/ext/parquet-core/tests/test_helpers.rs +1 -1
- data/ext/parquet-core/tests/writer_tests.rs +1 -1
- data/ext/parquet-ruby-adapter/Cargo.toml +6 -5
- data/ext/parquet-ruby-adapter/src/converter.rs +20 -23
- data/ext/parquet-ruby-adapter/src/error.rs +14 -21
- data/ext/parquet-ruby-adapter/src/lib.rs +6 -5
- data/ext/parquet-ruby-adapter/src/logger.rs +5 -2
- data/ext/parquet-ruby-adapter/src/metadata.rs +15 -15
- data/ext/parquet-ruby-adapter/src/reader.rs +67 -52
- data/ext/parquet-ruby-adapter/src/schema.rs +132 -87
- data/ext/parquet-ruby-adapter/src/string_cache.rs +72 -62
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/types.rs +5 -1
- data/ext/parquet-ruby-adapter/src/utils.rs +144 -74
- data/ext/parquet-ruby-adapter/src/writer.rs +82 -95
- data/lib/parquet/schema.rb +6 -2
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +54 -12
- metadata +19 -3
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +0 -116
data/README.md
CHANGED
|
@@ -2,616 +2,412 @@
|
|
|
2
2
|
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Read and write [Apache Parquet](https://parquet.apache.org/) files from Ruby. This gem wraps the official Apache [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) Rust crate, providing:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
- **High performance** columnar data storage and retrieval
|
|
8
|
+
- **Memory-efficient** streaming APIs for large datasets
|
|
9
|
+
- **Full compatibility** with the Apache Parquet specification
|
|
10
|
+
- **Simple, Ruby-native** APIs that feel natural
|
|
8
11
|
|
|
9
|
-
|
|
12
|
+
## Why Use This Library?
|
|
10
13
|
|
|
11
|
-
|
|
14
|
+
Apache Parquet is the de facto standard for analytical data storage, offering:
|
|
15
|
+
- **Efficient compression** - typically 2-10x smaller than CSV
|
|
16
|
+
- **Fast columnar access** - read only the columns you need
|
|
17
|
+
- **Rich type system** - preserves data types, including nested structures
|
|
18
|
+
- **Wide ecosystem support** - works with Spark, Pandas, DuckDB, and more
|
|
12
19
|
|
|
13
|
-
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Add this line to your application's Gemfile:
|
|
23
|
+
|
|
24
|
+
```ruby
|
|
25
|
+
gem 'parquet'
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Then execute:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
$ bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or install it directly:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
$ gem install parquet
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### Reading Data
|
|
14
43
|
|
|
15
44
|
```ruby
|
|
16
45
|
require "parquet"
|
|
17
46
|
|
|
18
|
-
#
|
|
19
|
-
|
|
47
|
+
# Read Parquet files row by row
|
|
48
|
+
Parquet.each_row("data.parquet") do |row|
|
|
49
|
+
puts row # => {"id" => 1, "name" => "Alice", "score" => 95.5}
|
|
50
|
+
end
|
|
20
51
|
|
|
21
|
-
# Or
|
|
22
|
-
|
|
23
|
-
|
|
52
|
+
# Or column by column for better performance
|
|
53
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
54
|
+
puts batch # => {"id" => [1, 2, ...], "name" => ["Alice", "Bob", ...]}
|
|
24
55
|
end
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Writing Data
|
|
25
59
|
|
|
26
|
-
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
# "type" => "primitive",
|
|
42
|
-
# "physical_type" => "INT64",
|
|
43
|
-
# "repetition" => "OPTIONAL",
|
|
44
|
-
# "converted_type" => "NONE"
|
|
45
|
-
# },
|
|
46
|
-
# # ... other fields
|
|
47
|
-
# ]
|
|
48
|
-
# },
|
|
49
|
-
# "row_groups" => [
|
|
50
|
-
# {
|
|
51
|
-
# "num_columns" => 5,
|
|
52
|
-
# "num_rows" => 3,
|
|
53
|
-
# "total_byte_size" => 379,
|
|
54
|
-
# "columns" => [
|
|
55
|
-
# {
|
|
56
|
-
# "column_path" => "id",
|
|
57
|
-
# "num_values" => 3,
|
|
58
|
-
# "compression" => "UNCOMPRESSED",
|
|
59
|
-
# "total_compressed_size" => 91,
|
|
60
|
-
# "encodings" => ["PLAIN", "RLE", "RLE_DICTIONARY"],
|
|
61
|
-
# "statistics" => {
|
|
62
|
-
# "min_is_exact" => true,
|
|
63
|
-
# "max_is_exact" => true
|
|
64
|
-
# }
|
|
65
|
-
# },
|
|
66
|
-
# # ... other columns
|
|
67
|
-
# ]
|
|
68
|
-
# }
|
|
69
|
-
# ]
|
|
70
|
-
# }
|
|
60
|
+
```ruby
|
|
61
|
+
# Define your schema
|
|
62
|
+
schema = [
|
|
63
|
+
{ "id" => "int64" },
|
|
64
|
+
{ "name" => "string" },
|
|
65
|
+
{ "score" => "double" }
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# Write row by row
|
|
69
|
+
rows = [
|
|
70
|
+
[1, "Alice", 95.5],
|
|
71
|
+
[2, "Bob", 82.3]
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
Parquet.write_rows(rows.each, schema: schema, write_to: "output.parquet")
|
|
71
75
|
```
|
|
72
76
|
|
|
73
|
-
|
|
74
|
-
- Total number of rows
|
|
75
|
-
- File creation information
|
|
76
|
-
- Key-value metadata (including Arrow schema)
|
|
77
|
-
- Detailed schema information for each column
|
|
78
|
-
- Row group information including:
|
|
79
|
-
- Number of columns and rows
|
|
80
|
-
- Total byte size
|
|
81
|
-
- Column-level details (compression, encodings, statistics)
|
|
77
|
+
## Reading Parquet Files
|
|
82
78
|
|
|
83
|
-
|
|
79
|
+
The library provides two APIs for reading data, each optimized for different use cases:
|
|
84
80
|
|
|
85
|
-
|
|
81
|
+
### Row-wise Reading (Sequential Access)
|
|
86
82
|
|
|
87
|
-
|
|
88
|
-
require "parquet"
|
|
83
|
+
Best for: Processing records one at a time, data transformations, ETL pipelines
|
|
89
84
|
|
|
90
|
-
|
|
85
|
+
```ruby
|
|
86
|
+
# Basic usage - returns hashes
|
|
91
87
|
Parquet.each_row("data.parquet") do |row|
|
|
92
|
-
puts row
|
|
88
|
+
puts row # => {"id" => 1, "name" => "Alice"}
|
|
93
89
|
end
|
|
94
90
|
|
|
95
|
-
#
|
|
91
|
+
# Memory-efficient array format
|
|
96
92
|
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
|
97
|
-
puts row
|
|
93
|
+
puts row # => [1, "Alice"]
|
|
98
94
|
end
|
|
99
95
|
|
|
100
|
-
#
|
|
96
|
+
# Read specific columns only
|
|
101
97
|
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
|
102
|
-
|
|
98
|
+
# Only requested columns are loaded from disk
|
|
103
99
|
end
|
|
104
100
|
|
|
105
|
-
#
|
|
101
|
+
# Works with IO objects
|
|
106
102
|
File.open("data.parquet", "rb") do |file|
|
|
107
103
|
Parquet.each_row(file) do |row|
|
|
108
|
-
|
|
104
|
+
# Process row
|
|
109
105
|
end
|
|
110
106
|
end
|
|
111
107
|
```
|
|
112
108
|
|
|
113
|
-
### Column-wise
|
|
109
|
+
### Column-wise Reading (Analytical Access)
|
|
114
110
|
|
|
115
|
-
|
|
111
|
+
Best for: Analytics, aggregations, when you need few columns from wide tables
|
|
116
112
|
|
|
117
113
|
```ruby
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
#
|
|
121
|
-
|
|
122
|
-
#
|
|
123
|
-
|
|
124
|
-
#
|
|
125
|
-
|
|
126
|
-
# "name" => ["name_1", "name_2", ..., "name_1024"]
|
|
127
|
-
# }
|
|
114
|
+
# Process data in column batches
|
|
115
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
116
|
+
# batch is a hash of column_name => array_of_values
|
|
117
|
+
ids = batch["id"] # => [1, 2, 3, ..., 1000]
|
|
118
|
+
names = batch["name"] # => ["Alice", "Bob", ...]
|
|
119
|
+
|
|
120
|
+
# Perform columnar operations
|
|
121
|
+
avg_id = ids.sum.to_f / ids.length
|
|
128
122
|
end
|
|
129
123
|
|
|
130
|
-
# Array
|
|
124
|
+
# Array format for more control
|
|
131
125
|
Parquet.each_column("data.parquet",
|
|
132
|
-
columns: ["id", "name"],
|
|
133
126
|
result_type: :array,
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
# [
|
|
137
|
-
# [1, 2, ..., 1024], # id column
|
|
138
|
-
# ["name_1", "name_2", ...] # name column
|
|
139
|
-
# ]
|
|
127
|
+
columns: ["id", "name"]) do |batch|
|
|
128
|
+
# batch is an array of arrays
|
|
129
|
+
# [[1, 2, ...], ["Alice", "Bob", ...]]
|
|
140
130
|
end
|
|
141
131
|
```
|
|
142
132
|
|
|
143
|
-
###
|
|
144
|
-
|
|
145
|
-
Both methods accept these common arguments:
|
|
133
|
+
### File Metadata
|
|
146
134
|
|
|
147
|
-
|
|
148
|
-
- `result_type`: Output format (`:hash` or `:array`, defaults to `:hash`)
|
|
149
|
-
- `columns`: Optional array of column names to read (improves performance)
|
|
135
|
+
Inspect file structure without reading data:
|
|
150
136
|
|
|
151
|
-
|
|
137
|
+
```ruby
|
|
138
|
+
metadata = Parquet.metadata("data.parquet")
|
|
152
139
|
|
|
153
|
-
|
|
140
|
+
puts metadata["num_rows"] # Total row count
|
|
141
|
+
puts metadata["created_by"] # Writer identification
|
|
142
|
+
puts metadata["schema"]["fields"] # Column definitions
|
|
143
|
+
puts metadata["row_groups"].size # Number of row groups
|
|
144
|
+
```
|
|
154
145
|
|
|
155
|
-
|
|
146
|
+
## Writing Parquet Files
|
|
156
147
|
|
|
157
|
-
###
|
|
148
|
+
### Row-wise Writing
|
|
158
149
|
|
|
159
|
-
|
|
150
|
+
Best for: Streaming data, converting from other formats, memory-constrained environments
|
|
160
151
|
|
|
161
152
|
```ruby
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
# Define the schema for your data
|
|
153
|
+
# Basic schema definition
|
|
165
154
|
schema = [
|
|
166
155
|
{ "id" => "int64" },
|
|
167
156
|
{ "name" => "string" },
|
|
168
|
-
{ "
|
|
157
|
+
{ "active" => "boolean" },
|
|
158
|
+
{ "balance" => "double" }
|
|
169
159
|
]
|
|
170
160
|
|
|
171
|
-
#
|
|
172
|
-
rows =
|
|
173
|
-
[1, "
|
|
174
|
-
[2, "Bob", 82.3],
|
|
175
|
-
[3, "Charlie", 88.7]
|
|
176
|
-
].each
|
|
177
|
-
|
|
178
|
-
# Write to a file
|
|
179
|
-
Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
|
|
180
|
-
|
|
181
|
-
# Write to an IO object
|
|
182
|
-
File.open("data.parquet", "wb") do |file|
|
|
183
|
-
Parquet.write_rows(rows, schema: schema, write_to: file)
|
|
161
|
+
# Stream data from any enumerable
|
|
162
|
+
rows = CSV.foreach("input.csv").map do |row|
|
|
163
|
+
[row[0].to_i, row[1], row[2] == "true", row[3].to_f]
|
|
184
164
|
end
|
|
185
165
|
|
|
186
|
-
# Optionally specify batch size (default is 1000)
|
|
187
|
-
Parquet.write_rows(rows,
|
|
188
|
-
schema: schema,
|
|
189
|
-
write_to: "data.parquet",
|
|
190
|
-
batch_size: 500
|
|
191
|
-
)
|
|
192
|
-
|
|
193
|
-
# Optionally specify memory threshold for flushing (default is 64MB)
|
|
194
|
-
Parquet.write_rows(rows,
|
|
195
|
-
schema: schema,
|
|
196
|
-
write_to: "data.parquet",
|
|
197
|
-
flush_threshold: 32 * 1024 * 1024 # 32MB
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
# Optionally specify sample size for row size estimation (default is 100)
|
|
201
166
|
Parquet.write_rows(rows,
|
|
202
167
|
schema: schema,
|
|
203
|
-
write_to: "
|
|
204
|
-
|
|
168
|
+
write_to: "output.parquet",
|
|
169
|
+
batch_size: 5000 # Rows per batch; must be positive (default: 1000)
|
|
205
170
|
)
|
|
206
171
|
```
|
|
207
172
|
|
|
208
|
-
###
|
|
173
|
+
### Column-wise Writing
|
|
209
174
|
|
|
210
|
-
|
|
175
|
+
Best for: Data already in columnar form, better compression, higher performance
|
|
211
176
|
|
|
212
177
|
```ruby
|
|
213
|
-
|
|
178
|
+
# Prepare columnar data
|
|
179
|
+
ids = [1, 2, 3, 4, 5]
|
|
180
|
+
names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]
|
|
181
|
+
scores = [95.5, 82.3, 88.7, 91.2, 79.8]
|
|
182
|
+
|
|
183
|
+
# Create batches
|
|
184
|
+
batches = [[
|
|
185
|
+
ids, # First column
|
|
186
|
+
names, # Second column
|
|
187
|
+
scores # Third column
|
|
188
|
+
]]
|
|
214
189
|
|
|
215
|
-
# Define the schema
|
|
216
190
|
schema = [
|
|
217
191
|
{ "id" => "int64" },
|
|
218
192
|
{ "name" => "string" },
|
|
219
193
|
{ "score" => "double" }
|
|
220
194
|
]
|
|
221
195
|
|
|
222
|
-
|
|
223
|
-
batches = [
|
|
224
|
-
# First batch
|
|
225
|
-
[
|
|
226
|
-
[1, 2], # id column
|
|
227
|
-
["Alice", "Bob"], # name column
|
|
228
|
-
[95.5, 82.3] # score column
|
|
229
|
-
],
|
|
230
|
-
# Second batch
|
|
231
|
-
[
|
|
232
|
-
[3], # id column
|
|
233
|
-
["Charlie"], # name column
|
|
234
|
-
[88.7] # score column
|
|
235
|
-
]
|
|
236
|
-
]
|
|
237
|
-
|
|
238
|
-
# Create an enumerator from the batches
|
|
239
|
-
columns = batches.each
|
|
240
|
-
|
|
241
|
-
# Write to a parquet file with default ZSTD compression
|
|
242
|
-
Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
|
|
243
|
-
|
|
244
|
-
# Write to a parquet file with specific compression and memory threshold
|
|
245
|
-
Parquet.write_columns(columns,
|
|
196
|
+
Parquet.write_columns(batches.each,
|
|
246
197
|
schema: schema,
|
|
247
|
-
write_to: "
|
|
248
|
-
compression: "snappy"
|
|
249
|
-
flush_threshold: 32 * 1024 * 1024 # 32MB
|
|
198
|
+
write_to: "output.parquet",
|
|
199
|
+
compression: "snappy" # Options: none, snappy, gzip, lz4, zstd
|
|
250
200
|
)
|
|
251
|
-
|
|
252
|
-
# Write to an IO object
|
|
253
|
-
File.open("data.parquet", "wb") do |file|
|
|
254
|
-
Parquet.write_columns(columns, schema: schema, write_to: file)
|
|
255
|
-
end
|
|
256
201
|
```
|
|
257
202
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
- `int8`, `int16`, `int32`, `int64`
|
|
261
|
-
- `uint8`, `uint16`, `uint32`, `uint64`
|
|
262
|
-
- `float`, `double`
|
|
263
|
-
- `string`
|
|
264
|
-
- `binary`
|
|
265
|
-
- `boolean`
|
|
266
|
-
- `date32`
|
|
267
|
-
- `timestamp_millis`, `timestamp_micros`, `timestamp_second`, `timestamp_nanos`
|
|
268
|
-
- `time_millis`, `time_micros`
|
|
269
|
-
|
|
270
|
-
### Timestamp Timezone Handling
|
|
271
|
-
|
|
272
|
-
**CRITICAL PARQUET SPECIFICATION LIMITATION**: The Apache Parquet format specification only supports two types of timestamps:
|
|
273
|
-
1. **UTC-normalized timestamps** (when ANY timezone is specified) - `isAdjustedToUTC = true`
|
|
274
|
-
2. **Local/unzoned timestamps** (when NO timezone is specified) - `isAdjustedToUTC = false`
|
|
275
|
-
|
|
276
|
-
This means that specific timezone offsets like "+09:00" or "America/New_York" CANNOT be preserved in Parquet files. This is not a limitation of this Ruby library, but of the Parquet format itself.
|
|
277
|
-
|
|
278
|
-
**When Writing:**
|
|
279
|
-
- If the schema specifies ANY timezone (whether it's "UTC", "+09:00", "America/New_York", etc.):
|
|
280
|
-
- Time values are converted to UTC before storing
|
|
281
|
-
- The file metadata sets `isAdjustedToUTC = true`
|
|
282
|
-
- The original timezone information is LOST
|
|
283
|
-
- If the schema doesn't specify a timezone:
|
|
284
|
-
- Timestamps are stored as local/unzoned time (no conversion)
|
|
285
|
-
- The file metadata sets `isAdjustedToUTC = false`
|
|
286
|
-
- These represent "wall clock" times without timezone context
|
|
287
|
-
|
|
288
|
-
**When Reading:**
|
|
289
|
-
- If the Parquet file has `isAdjustedToUTC = true` (ANY timezone was specified during writing):
|
|
290
|
-
- Time objects are returned in UTC
|
|
291
|
-
- The original timezone (e.g., "+09:00") is NOT recoverable
|
|
292
|
-
- If the file has `isAdjustedToUTC = false` (NO timezone was specified):
|
|
293
|
-
- Time objects are returned as local time in your system's timezone
|
|
294
|
-
- These are "wall clock" times without timezone information
|
|
203
|
+
`write_columns` also accepts `logger:` with the same Ruby logger interface as
|
|
204
|
+
row writes.
|
|
295
205
|
|
|
296
|
-
|
|
297
|
-
# Preferred approach: use has_timezone to be explicit about UTC vs local storage
|
|
298
|
-
schema = Parquet::Schema.define do
|
|
299
|
-
field :timestamp_utc, :timestamp_millis, has_timezone: true # Stored as UTC (default)
|
|
300
|
-
field :timestamp_local, :timestamp_millis, has_timezone: false # Stored as local/unzoned
|
|
301
|
-
field :timestamp_default, :timestamp_millis # Default: UTC storage
|
|
302
|
-
end
|
|
206
|
+
## Data Types
|
|
303
207
|
|
|
304
|
-
|
|
305
|
-
schema_legacy = Parquet::Schema.define do
|
|
306
|
-
field :timestamp_utc, :timestamp_millis, timezone: "UTC" # Stored as UTC
|
|
307
|
-
field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00" # Also stored as UTC!
|
|
308
|
-
field :timestamp_local, :timestamp_millis # No timezone - local
|
|
309
|
-
end
|
|
208
|
+
### Basic Types
|
|
310
209
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
210
|
+
```ruby
|
|
211
|
+
schema = [
|
|
212
|
+
# Integers
|
|
213
|
+
{ "tiny" => "int8" }, # -128 to 127
|
|
214
|
+
{ "small" => "int16" }, # -32,768 to 32,767
|
|
215
|
+
{ "medium" => "int32" }, # ±2 billion
|
|
216
|
+
{ "large" => "int64" }, # ±9 quintillion
|
|
217
|
+
|
|
218
|
+
# Unsigned integers
|
|
219
|
+
{ "ubyte" => "uint8" }, # 0 to 255
|
|
220
|
+
{ "ushort" => "uint16" }, # 0 to 65,535
|
|
221
|
+
{ "uint" => "uint32" }, # 0 to 4 billion
|
|
222
|
+
{ "ulong" => "uint64" }, # 0 to 18 quintillion
|
|
223
|
+
|
|
224
|
+
# Floating point
|
|
225
|
+
{ "price" => "float" }, # 32-bit precision
|
|
226
|
+
{ "amount" => "double" }, # 64-bit precision
|
|
227
|
+
|
|
228
|
+
# Other basics
|
|
229
|
+
{ "name" => "string" },
|
|
230
|
+
{ "data" => "binary" },
|
|
231
|
+
{ "active" => "boolean" }
|
|
318
232
|
]
|
|
233
|
+
```
|
|
319
234
|
|
|
320
|
-
|
|
235
|
+
### Date and Time Types
|
|
321
236
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
#
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
237
|
+
```ruby
|
|
238
|
+
schema = [
|
|
239
|
+
# Date (days since Unix epoch)
|
|
240
|
+
{ "date" => "date32" },
|
|
241
|
+
|
|
242
|
+
# Timestamps (with different precisions)
|
|
243
|
+
{ "created_sec" => "timestamp_second" },
|
|
244
|
+
{ "created_ms" => "timestamp_millis" }, # Most common
|
|
245
|
+
{ "created_us" => "timestamp_micros" },
|
|
246
|
+
{ "created_ns" => "timestamp_nanos" },
|
|
247
|
+
|
|
248
|
+
# Time of day (without date)
|
|
249
|
+
{ "time_ms" => "time_millis" }, # Milliseconds since midnight
|
|
250
|
+
{ "time_us" => "time_micros" } # Microseconds since midnight
|
|
251
|
+
]
|
|
334
252
|
```
|
|
335
253
|
|
|
336
|
-
|
|
254
|
+
### Decimal Type (Financial Data)
|
|
337
255
|
|
|
338
|
-
|
|
256
|
+
For exact decimal arithmetic (no floating-point errors):
|
|
339
257
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
- Handles Ruby value conversion through the `ValueConverter` trait
|
|
348
|
-
- Manages Ruby I/O objects through the `ChunkReader` trait
|
|
258
|
+
```ruby
|
|
259
|
+
require "bigdecimal"
|
|
260
|
+
|
|
261
|
+
schema = [
|
|
262
|
+
# Financial amounts with 2 decimal places
|
|
263
|
+
{ "price" => "decimal", "precision" => 10, "scale" => 2 }, # Up to 99,999,999.99
|
|
264
|
+
{ "balance" => "decimal", "precision" => 15, "scale" => 2 }, # Larger amounts
|
|
349
265
|
|
|
350
|
-
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
- Supports both file-based and IO-based operations
|
|
266
|
+
# High-precision calculations
|
|
267
|
+
{ "rate" => "decimal", "precision" => 10, "scale" => 8 } # 8 decimal places
|
|
268
|
+
]
|
|
354
269
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
270
|
+
# Use BigDecimal for exact values
|
|
271
|
+
data = [[
|
|
272
|
+
BigDecimal("19.99"),
|
|
273
|
+
BigDecimal("1234567.89"),
|
|
274
|
+
BigDecimal("0.00000123")
|
|
275
|
+
]]
|
|
276
|
+
```
|
|
360
277
|
|
|
361
|
-
|
|
278
|
+
## Complex Data Structures
|
|
362
279
|
|
|
363
|
-
|
|
280
|
+
The library includes a powerful Schema DSL for defining nested data:
|
|
364
281
|
|
|
365
|
-
|
|
366
|
-
require "parquet"
|
|
282
|
+
### Using the Schema DSL
|
|
367
283
|
|
|
368
|
-
|
|
284
|
+
```ruby
|
|
369
285
|
schema = Parquet::Schema.define do
|
|
370
|
-
|
|
371
|
-
field :
|
|
286
|
+
# Simple fields
|
|
287
|
+
field :id, :int64, nullable: false # Required field
|
|
288
|
+
field :name, :string # Optional by default
|
|
372
289
|
|
|
373
|
-
# Nested
|
|
290
|
+
# Nested structure
|
|
374
291
|
field :address, :struct do
|
|
375
292
|
field :street, :string
|
|
376
293
|
field :city, :string
|
|
377
|
-
field :
|
|
378
|
-
|
|
379
|
-
field :
|
|
380
|
-
field :longitude, :double
|
|
294
|
+
field :location, :struct do
|
|
295
|
+
field :lat, :double
|
|
296
|
+
field :lng, :double
|
|
381
297
|
end
|
|
382
298
|
end
|
|
383
299
|
|
|
384
|
-
#
|
|
385
|
-
field :
|
|
300
|
+
# Lists
|
|
301
|
+
field :tags, :list, item: :string
|
|
302
|
+
field :scores, :list, item: :int32
|
|
303
|
+
|
|
304
|
+
# Maps (dictionaries)
|
|
305
|
+
field :metadata, :map, key: :string, value: :string
|
|
386
306
|
|
|
387
|
-
#
|
|
307
|
+
# Complex combinations
|
|
388
308
|
field :contacts, :list, item: :struct do
|
|
389
309
|
field :name, :string
|
|
390
|
-
field :
|
|
310
|
+
field :email, :string
|
|
391
311
|
field :primary, :boolean
|
|
392
312
|
end
|
|
393
|
-
|
|
394
|
-
# Map with string values
|
|
395
|
-
field :metadata, :map, key: :string, value: :string
|
|
396
|
-
|
|
397
|
-
# Map with struct values
|
|
398
|
-
field :properties, :map, key: :string, value: :struct do
|
|
399
|
-
field :count, :int32
|
|
400
|
-
field :description, :string
|
|
401
|
-
end
|
|
402
|
-
|
|
403
|
-
# Nested lists (list of lists of strings)
|
|
404
|
-
field :nested_lists, :list, item: :list do
|
|
405
|
-
field :item, :string # REQUIRED: Inner item field MUST be named 'item' for nested lists
|
|
406
|
-
end
|
|
407
|
-
|
|
408
|
-
# Map of lists
|
|
409
|
-
field :map_of_lists, :map, key: :string, value: :list do
|
|
410
|
-
field :item, :int32 # REQUIRED: List items in maps MUST be named 'item'
|
|
411
|
-
end
|
|
412
|
-
end
|
|
413
|
-
|
|
414
|
-
### Nested Lists
|
|
415
|
-
|
|
416
|
-
When working with nested lists (a list of lists), there are specific requirements:
|
|
417
|
-
|
|
418
|
-
1. Using the Schema DSL:
|
|
419
|
-
```ruby
|
|
420
|
-
# A list of lists of strings
|
|
421
|
-
field :nested_lists, :list, item: :list do
|
|
422
|
-
field :item, :string # For nested lists, inner item MUST be named 'item'
|
|
423
313
|
end
|
|
424
314
|
```
|
|
425
315
|
|
|
426
|
-
|
|
427
|
-
```ruby
|
|
428
|
-
# A list of lists of integers
|
|
429
|
-
{ "nested_numbers" => "list<list<int32>>" }
|
|
430
|
-
```
|
|
316
|
+
### Writing Complex Data
|
|
431
317
|
|
|
432
|
-
The data for nested lists is structured as an array of arrays:
|
|
433
318
|
```ruby
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
319
|
+
data = [[
|
|
320
|
+
1, # id
|
|
321
|
+
"Alice Johnson", # name
|
|
322
|
+
{ # address
|
|
323
|
+
"street" => "123 Main St",
|
|
324
|
+
"city" => "Springfield",
|
|
325
|
+
"location" => {
|
|
326
|
+
"lat" => 40.7128,
|
|
327
|
+
"lng" => -74.0060
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
["ruby", "parquet", "data"], # tags
|
|
331
|
+
[85, 92, 88], # scores
|
|
332
|
+
{ "dept" => "Engineering" }, # metadata
|
|
333
|
+
[ # contacts
|
|
334
|
+
{ "name" => "Bob", "email" => "bob@example.com", "primary" => true },
|
|
335
|
+
{ "name" => "Carol", "email" => "carol@example.com", "primary" => false }
|
|
336
|
+
]
|
|
337
|
+
]]
|
|
450
338
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
field :amount1, :decimal # Equivalent to INTEGER with 38 digits
|
|
339
|
+
Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
|
|
340
|
+
```
|
|
454
341
|
|
|
455
|
-
|
|
456
|
-
field :amount2, :decimal, precision: 10 # 10 digits, no decimal places
|
|
342
|
+
## ⚠️ Important Limitations
|
|
457
343
|
|
|
458
|
-
|
|
459
|
-
field :amount3, :decimal, scale: 2 # 38 digits with 2 decimal places
|
|
344
|
+
### Timezone Handling in Parquet
|
|
460
345
|
|
|
461
|
-
|
|
462
|
-
field :amount4, :decimal, precision: 10, scale: 2 # 10 digits with 2 decimal places
|
|
463
|
-
```
|
|
346
|
+
The Parquet specification has a fundamental limitation with timezone storage:
|
|
464
347
|
|
|
465
|
-
|
|
348
|
+
1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
|
|
349
|
+
2. **Local/unzoned**: Timestamps without timezone info are stored as-is
|
|
466
350
|
|
|
467
|
-
|
|
351
|
+
**The original timezone information is permanently lost.** This is not a limitation of this library but of the Parquet format itself.
|
|
468
352
|
|
|
469
353
|
```ruby
|
|
470
|
-
require "parquet"
|
|
471
|
-
require "bigdecimal"
|
|
472
|
-
|
|
473
|
-
# Schema for financial transactions
|
|
474
354
|
schema = Parquet::Schema.define do
|
|
475
|
-
|
|
476
|
-
field :
|
|
477
|
-
field :
|
|
478
|
-
field :balance, :decimal, precision: 16, scale: 2 # Larger precision for running balances
|
|
479
|
-
field :currency, :string
|
|
480
|
-
field :exchange_rate, :decimal, precision: 10, scale: 6 # 6 decimal places for forex rates
|
|
481
|
-
field :fee, :decimal, precision: 8, scale: 2, nullable: true # Optional fee
|
|
482
|
-
field :category, :string
|
|
483
|
-
end
|
|
355
|
+
# Both of these are stored in UTC - the original timezone info is lost!
|
|
356
|
+
field :timestamp_utc, :timestamp_millis, timezone: "UTC"
|
|
357
|
+
field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00"
|
|
484
358
|
|
|
485
|
-
#
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
"T-12345",
|
|
489
|
-
Time.now,
|
|
490
|
-
BigDecimal("1256.99"), # amount (directly using BigDecimal)
|
|
491
|
-
BigDecimal("10250.25"), # balance
|
|
492
|
-
"USD",
|
|
493
|
-
BigDecimal("1.0"), # exchange_rate
|
|
494
|
-
BigDecimal("2.50"), # fee
|
|
495
|
-
"Groceries"
|
|
496
|
-
],
|
|
497
|
-
[
|
|
498
|
-
"T-12346",
|
|
499
|
-
Time.now - 86400, # yesterday
|
|
500
|
-
BigDecimal("-89.50"), # negative amount for withdrawal
|
|
501
|
-
BigDecimal("10160.75"), # updated balance
|
|
502
|
-
"USD",
|
|
503
|
-
BigDecimal("1.0"), # exchange_rate
|
|
504
|
-
nil, # no fee
|
|
505
|
-
"Transportation"
|
|
506
|
-
],
|
|
507
|
-
[
|
|
508
|
-
"T-12347",
|
|
509
|
-
Time.now - 172800, # two days ago
|
|
510
|
-
BigDecimal("250.00"), # amount
|
|
511
|
-
BigDecimal("10410.75"), # balance
|
|
512
|
-
"EUR", # different currency
|
|
513
|
-
BigDecimal("1.05463"), # exchange_rate
|
|
514
|
-
BigDecimal("1.75"), # fee
|
|
515
|
-
"Entertainment"
|
|
516
|
-
]
|
|
517
|
-
]
|
|
359
|
+
# This is stored as local time (no timezone)
|
|
360
|
+
field :timestamp_local, :timestamp_millis
|
|
361
|
+
end
|
|
518
362
|
|
|
519
|
-
#
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
Parquet.each_row("financial_data.parquet") do |transaction|
|
|
524
|
-
# Access decimal fields as BigDecimal objects
|
|
525
|
-
puts "Transaction: #{transaction['transaction_id']}"
|
|
526
|
-
puts " Amount: #{transaction['currency']} #{transaction['amount']}"
|
|
527
|
-
puts " Balance: $#{transaction['balance']}"
|
|
528
|
-
puts " Fee: #{transaction['fee'] || 'No fee'}"
|
|
529
|
-
|
|
530
|
-
# You can perform precise decimal calculations
|
|
531
|
-
if transaction['currency'] != 'USD'
|
|
532
|
-
usd_amount = transaction['amount'] * transaction['exchange_rate']
|
|
533
|
-
puts " USD Equivalent: $#{usd_amount.round(2)}"
|
|
534
|
-
end
|
|
363
|
+
# If you need timezone preservation, store it separately:
|
|
364
|
+
schema = Parquet::Schema.define do
|
|
365
|
+
field :timestamp, :timestamp_millis, has_timezone: true # UTC storage
|
|
366
|
+
field :original_tz, :string # "America/New_York"
|
|
535
367
|
end
|
|
536
368
|
```
|
|
537
369
|
|
|
538
|
-
|
|
370
|
+
## Performance Tips
|
|
539
371
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
-
|
|
372
|
+
1. **Use column-wise reading** when you need only a few columns from wide tables
|
|
373
|
+
2. **Specify columns parameter** to avoid reading unnecessary data
|
|
374
|
+
3. **Choose appropriate batch sizes**:
|
|
375
|
+
- Larger batches = better throughput but more memory
|
|
376
|
+
- Smaller batches = less memory but more overhead
|
|
377
|
+
4. **Pre-sort data** by commonly filtered columns for better compression
|
|
544
378
|
|
|
545
|
-
Choose appropriate precision and scale for your data to optimize storage while ensuring adequate range:
|
|
546
379
|
|
|
547
|
-
|
|
548
|
-
# Banking examples
|
|
549
|
-
field :account_balance, :decimal, precision: 16, scale: 2 # Up to 14 digits before decimal point
|
|
550
|
-
field :interest_rate, :decimal, precision: 8, scale: 6 # Rate with 6 decimal places (e.g., 0.015625)
|
|
380
|
+
## Memory Management
|
|
551
381
|
|
|
552
|
-
|
|
553
|
-
field :product_price, :decimal, precision: 10, scale: 2 # Product price
|
|
554
|
-
field :shipping_weight, :decimal, precision: 6, scale: 3 # Weight in kg with 3 decimal places
|
|
382
|
+
Control memory usage with flush thresholds:
|
|
555
383
|
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
384
|
+
```ruby
|
|
385
|
+
Parquet.write_rows(huge_dataset.each,
|
|
386
|
+
schema: schema,
|
|
387
|
+
write_to: "output.parquet",
|
|
388
|
+
batch_size: 1000, # Rows per batch (must be positive) before a flush is considered
|
|
389
|
+
flush_threshold: 32 * 1024**2 # Flush if batch exceeds 32MB
|
|
390
|
+
)
|
|
559
391
|
```
|
|
560
392
|
|
|
561
|
-
|
|
393
|
+
Write batch and sample sizes are bounded before buffer allocation. Very large
|
|
394
|
+
batch sizes are rejected, and wide schemas have a lower effective batch cap so
|
|
395
|
+
the writer cannot reserve unbounded per-column value slots.
|
|
562
396
|
|
|
563
|
-
|
|
397
|
+
## Architecture
|
|
564
398
|
|
|
565
|
-
|
|
566
|
-
# Sample data with nested structures
|
|
567
|
-
data = [
|
|
568
|
-
[
|
|
569
|
-
1, # id
|
|
570
|
-
"John Doe", # name
|
|
571
|
-
{ # address (struct)
|
|
572
|
-
"street" => "123 Main St",
|
|
573
|
-
"city" => "Springfield",
|
|
574
|
-
"zip" => "12345",
|
|
575
|
-
"coordinates" => {
|
|
576
|
-
"latitude" => 37.7749,
|
|
577
|
-
"longitude" => -122.4194
|
|
578
|
-
}
|
|
579
|
-
},
|
|
580
|
-
[85.5, 92.0, 78.5], # scores (list of floats)
|
|
581
|
-
[ # contacts (list of structs)
|
|
582
|
-
{ "name" => "Contact 1", "phone" => "555-1234", "primary" => true },
|
|
583
|
-
{ "name" => "Contact 2", "phone" => "555-5678", "primary" => false }
|
|
584
|
-
],
|
|
585
|
-
{ "created" => "2023-01-01", "status" => "active" }, # metadata (map)
|
|
586
|
-
{ # properties (map of structs)
|
|
587
|
-
"feature1" => { "count" => 5, "description" => "Main feature" },
|
|
588
|
-
"feature2" => { "count" => 3, "description" => "Secondary feature" }
|
|
589
|
-
},
|
|
590
|
-
[["a", "b"], ["c", "d", "e"]], # nested_lists (a list of lists of strings)
|
|
591
|
-
{ # map_of_lists
|
|
592
|
-
"group1" => [1, 2, 3],
|
|
593
|
-
"group2" => [4, 5, 6]
|
|
594
|
-
}
|
|
595
|
-
]
|
|
596
|
-
]
|
|
399
|
+
This gem uses a modular architecture:
|
|
597
400
|
|
|
598
|
-
|
|
599
|
-
|
|
401
|
+
- **parquet-core**: Language-agnostic Rust core for Parquet operations
|
|
402
|
+
- **parquet-ruby-adapter**: Ruby-specific FFI adapter layer
|
|
403
|
+
- **parquet gem**: High-level Ruby API
|
|
600
404
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
end
|
|
605
|
-
```
|
|
405
|
+
For a detailed description of the architecture, see [ARCH.md](./ARCH.md).
|
|
406
|
+
|
|
407
|
+
## Contributing
|
|
606
408
|
|
|
607
|
-
|
|
409
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/njaremko/parquet-ruby.
|
|
608
410
|
|
|
609
|
-
|
|
610
|
-
- **Complex types**: Structs, lists, and maps with arbitrary nesting
|
|
611
|
-
- **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
|
|
612
|
-
- **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
|
|
613
|
-
- **Map key/value nullability**: Control whether map keys or values can be null with `value_nullable: false/true`
|
|
411
|
+
## License
|
|
614
412
|
|
|
615
|
-
|
|
616
|
-
- For lists: The `item:` parameter specifying the item type
|
|
617
|
-
- For maps: Both `key:` and `value:` parameters specifying key and value types
|
|
413
|
+
The gem is available as open source under the terms of the MIT License.
|