parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
data/Cargo.toml
ADDED
data/Gemfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
source "https://rubygems.org"
|
|
2
|
+
|
|
3
|
+
gem "rb_sys", "~> 0.9.56"
|
|
4
|
+
gem "rake"
|
|
5
|
+
gem "bigdecimal"
|
|
6
|
+
|
|
7
|
+
# Use local version of parquet
|
|
8
|
+
gemspec
|
|
9
|
+
|
|
10
|
+
group :development do
|
|
11
|
+
# gem "benchmark-ips", "~> 2.12"
|
|
12
|
+
# gem "polars-df"
|
|
13
|
+
# gem "duckdb"
|
|
14
|
+
gem "benchmark-memory"
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
group :test do
|
|
18
|
+
gem "csv"
|
|
19
|
+
gem "logger"
|
|
20
|
+
gem "minitest", "~> 5.0"
|
|
21
|
+
end
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Nathan Jaremko
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
# parquet-ruby
|
|
2
|
+
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
|
|
4
|
+
|
|
5
|
+
Read and write [Apache Parquet](https://parquet.apache.org/) files from Ruby. This gem wraps the official Apache [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) Rust crate, providing:
|
|
6
|
+
|
|
7
|
+
- **High performance** columnar data storage and retrieval
|
|
8
|
+
- **Memory-efficient** streaming APIs for large datasets
|
|
9
|
+
- **Full compatibility** with the Apache Parquet specification
|
|
10
|
+
- **Simple, Ruby-native** APIs that feel natural
|
|
11
|
+
|
|
12
|
+
## Why Use This Library?
|
|
13
|
+
|
|
14
|
+
Apache Parquet is the de facto standard for analytical data storage, offering:
|
|
15
|
+
- **Efficient compression** - typically 2-10x smaller than CSV
|
|
16
|
+
- **Fast columnar access** - read only the columns you need
|
|
17
|
+
- **Rich type system** - preserves data types, including nested structures
|
|
18
|
+
- **Wide ecosystem support** - works with Spark, Pandas, DuckDB, and more
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Add this line to your application's Gemfile:
|
|
23
|
+
|
|
24
|
+
```ruby
|
|
25
|
+
gem 'parquet'
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Then execute:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
$ bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or install it directly:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
$ gem install parquet
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### Reading Data
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
require "parquet"
|
|
46
|
+
|
|
47
|
+
# Read Parquet files row by row
|
|
48
|
+
Parquet.each_row("data.parquet") do |row|
|
|
49
|
+
puts row # => {"id" => 1, "name" => "Alice", "score" => 95.5}
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Or column by column for better performance
|
|
53
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
54
|
+
puts batch # => {"id" => [1, 2, ...], "name" => ["Alice", "Bob", ...]}
|
|
55
|
+
end
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Writing Data
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
# Define your schema
|
|
62
|
+
schema = [
|
|
63
|
+
{ "id" => "int64" },
|
|
64
|
+
{ "name" => "string" },
|
|
65
|
+
{ "score" => "double" }
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# Write row by row
|
|
69
|
+
rows = [
|
|
70
|
+
[1, "Alice", 95.5],
|
|
71
|
+
[2, "Bob", 82.3]
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
Parquet.write_rows(rows.each, schema: schema, write_to: "output.parquet")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Reading Parquet Files
|
|
78
|
+
|
|
79
|
+
The library provides two APIs for reading data, each optimized for different use cases:
|
|
80
|
+
|
|
81
|
+
### Row-wise Reading (Sequential Access)
|
|
82
|
+
|
|
83
|
+
Best for: Processing records one at a time, data transformations, ETL pipelines
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
# Basic usage - returns hashes
|
|
87
|
+
Parquet.each_row("data.parquet") do |row|
|
|
88
|
+
puts row # => {"id" => 1, "name" => "Alice"}
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Memory-efficient array format
|
|
92
|
+
Parquet.each_row("data.parquet", result_type: :array) do |row|
|
|
93
|
+
puts row # => [1, "Alice"]
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Read specific columns only
|
|
97
|
+
Parquet.each_row("data.parquet", columns: ["id", "name"]) do |row|
|
|
98
|
+
# Only requested columns are loaded from disk
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Works with IO objects
|
|
102
|
+
File.open("data.parquet", "rb") do |file|
|
|
103
|
+
Parquet.each_row(file) do |row|
|
|
104
|
+
# Process row
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Column-wise Reading (Analytical Access)
|
|
110
|
+
|
|
111
|
+
Best for: Analytics, aggregations, when you need few columns from wide tables
|
|
112
|
+
|
|
113
|
+
```ruby
|
|
114
|
+
# Process data in column batches
|
|
115
|
+
Parquet.each_column("data.parquet", batch_size: 1000) do |batch|
|
|
116
|
+
# batch is a hash of column_name => array_of_values
|
|
117
|
+
ids = batch["id"] # => [1, 2, 3, ..., 1000]
|
|
118
|
+
names = batch["name"] # => ["Alice", "Bob", ...]
|
|
119
|
+
|
|
120
|
+
# Perform columnar operations
|
|
121
|
+
avg_id = ids.sum.to_f / ids.length
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
# Array format for more control
|
|
125
|
+
Parquet.each_column("data.parquet",
|
|
126
|
+
result_type: :array,
|
|
127
|
+
columns: ["id", "name"]) do |batch|
|
|
128
|
+
# batch is an array of arrays
|
|
129
|
+
# [[1, 2, ...], ["Alice", "Bob", ...]]
|
|
130
|
+
end
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### File Metadata
|
|
134
|
+
|
|
135
|
+
Inspect file structure without reading data:
|
|
136
|
+
|
|
137
|
+
```ruby
|
|
138
|
+
metadata = Parquet.metadata("data.parquet")
|
|
139
|
+
|
|
140
|
+
puts metadata["num_rows"] # Total row count
|
|
141
|
+
puts metadata["created_by"] # Writer identification
|
|
142
|
+
puts metadata["schema"]["fields"] # Column definitions
|
|
143
|
+
puts metadata["row_groups"].size # Number of row groups
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Writing Parquet Files
|
|
147
|
+
|
|
148
|
+
### Row-wise Writing
|
|
149
|
+
|
|
150
|
+
Best for: Streaming data, converting from other formats, memory-constrained environments
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
# Basic schema definition
|
|
154
|
+
schema = [
|
|
155
|
+
{ "id" => "int64" },
|
|
156
|
+
{ "name" => "string" },
|
|
157
|
+
{ "active" => "boolean" },
|
|
158
|
+
{ "balance" => "double" }
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
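# Rows can come from any enumerable (note: `.map` here builds the full array in memory;
# pass an Enumerator instead to stream very large inputs)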
|
|
162
|
+
rows = CSV.foreach("input.csv").map do |row|
|
|
163
|
+
[row[0].to_i, row[1], row[2] == "true", row[3].to_f]
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
Parquet.write_rows(rows,
|
|
167
|
+
schema: schema,
|
|
168
|
+
write_to: "output.parquet",
|
|
169
|
+
batch_size: 5000 # Rows per batch; must be positive (default: 1000)
|
|
170
|
+
)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Column-wise Writing
|
|
174
|
+
|
|
175
|
+
Best for: Pre-columnar data, better compression, higher performance
|
|
176
|
+
|
|
177
|
+
```ruby
|
|
178
|
+
# Prepare columnar data
|
|
179
|
+
ids = [1, 2, 3, 4, 5]
|
|
180
|
+
names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]
|
|
181
|
+
scores = [95.5, 82.3, 88.7, 91.2, 79.8]
|
|
182
|
+
|
|
183
|
+
# Create batches
|
|
184
|
+
batches = [[
|
|
185
|
+
ids, # First column
|
|
186
|
+
names, # Second column
|
|
187
|
+
scores # Third column
|
|
188
|
+
]]
|
|
189
|
+
|
|
190
|
+
schema = [
|
|
191
|
+
{ "id" => "int64" },
|
|
192
|
+
{ "name" => "string" },
|
|
193
|
+
{ "score" => "double" }
|
|
194
|
+
]
|
|
195
|
+
|
|
196
|
+
Parquet.write_columns(batches.each,
|
|
197
|
+
schema: schema,
|
|
198
|
+
write_to: "output.parquet",
|
|
199
|
+
compression: "snappy" # Options: none, snappy, gzip, lz4, zstd
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`write_columns` also accepts `logger:` with the same Ruby logger interface as
|
|
204
|
+
row writes.
|
|
205
|
+
|
|
206
|
+
## Data Types
|
|
207
|
+
|
|
208
|
+
### Basic Types
|
|
209
|
+
|
|
210
|
+
```ruby
|
|
211
|
+
schema = [
|
|
212
|
+
# Integers
|
|
213
|
+
{ "tiny" => "int8" }, # -128 to 127
|
|
214
|
+
{ "small" => "int16" }, # -32,768 to 32,767
|
|
215
|
+
{ "medium" => "int32" }, # ±2 billion
|
|
216
|
+
{ "large" => "int64" }, # ±9 quintillion
|
|
217
|
+
|
|
218
|
+
# Unsigned integers
|
|
219
|
+
{ "ubyte" => "uint8" }, # 0 to 255
|
|
220
|
+
{ "ushort" => "uint16" }, # 0 to 65,535
|
|
221
|
+
{ "uint" => "uint32" }, # 0 to 4 billion
|
|
222
|
+
{ "ulong" => "uint64" }, # 0 to 18 quintillion
|
|
223
|
+
|
|
224
|
+
# Floating point
|
|
225
|
+
{ "price" => "float" }, # 32-bit precision
|
|
226
|
+
{ "amount" => "double" }, # 64-bit precision
|
|
227
|
+
|
|
228
|
+
# Other basics
|
|
229
|
+
{ "name" => "string" },
|
|
230
|
+
{ "data" => "binary" },
|
|
231
|
+
{ "active" => "boolean" }
|
|
232
|
+
]
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Date and Time Types
|
|
236
|
+
|
|
237
|
+
```ruby
|
|
238
|
+
schema = [
|
|
239
|
+
# Date (days since Unix epoch)
|
|
240
|
+
{ "date" => "date32" },
|
|
241
|
+
|
|
242
|
+
# Timestamps (with different precisions)
|
|
243
|
+
{ "created_sec" => "timestamp_second" },
|
|
244
|
+
{ "created_ms" => "timestamp_millis" }, # Most common
|
|
245
|
+
{ "created_us" => "timestamp_micros" },
|
|
246
|
+
{ "created_ns" => "timestamp_nanos" },
|
|
247
|
+
|
|
248
|
+
# Time of day (without date)
|
|
249
|
+
{ "time_ms" => "time_millis" }, # Milliseconds since midnight
|
|
250
|
+
{ "time_us" => "time_micros" } # Microseconds since midnight
|
|
251
|
+
]
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Decimal Type (Financial Data)
|
|
255
|
+
|
|
256
|
+
For exact decimal arithmetic (no floating-point errors):
|
|
257
|
+
|
|
258
|
+
```ruby
|
|
259
|
+
require "bigdecimal"
|
|
260
|
+
|
|
261
|
+
schema = [
|
|
262
|
+
# Financial amounts with 2 decimal places
|
|
263
|
+
{ "price" => "decimal", "precision" => 10, "scale" => 2 }, # Up to 99,999,999.99
|
|
264
|
+
{ "balance" => "decimal", "precision" => 15, "scale" => 2 }, # Larger amounts
|
|
265
|
+
|
|
266
|
+
# High-precision calculations
|
|
267
|
+
{ "rate" => "decimal", "precision" => 10, "scale" => 8 } # 8 decimal places
|
|
268
|
+
]
|
|
269
|
+
|
|
270
|
+
# Use BigDecimal for exact values
|
|
271
|
+
data = [[
|
|
272
|
+
BigDecimal("19.99"),
|
|
273
|
+
BigDecimal("1234567.89"),
|
|
274
|
+
BigDecimal("0.00000123")
|
|
275
|
+
]]
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Complex Data Structures
|
|
279
|
+
|
|
280
|
+
The library includes a powerful Schema DSL for defining nested data:
|
|
281
|
+
|
|
282
|
+
### Using the Schema DSL
|
|
283
|
+
|
|
284
|
+
```ruby
|
|
285
|
+
schema = Parquet::Schema.define do
|
|
286
|
+
# Simple fields
|
|
287
|
+
field :id, :int64, nullable: false # Required field
|
|
288
|
+
field :name, :string # Optional by default
|
|
289
|
+
|
|
290
|
+
# Nested structure
|
|
291
|
+
field :address, :struct do
|
|
292
|
+
field :street, :string
|
|
293
|
+
field :city, :string
|
|
294
|
+
field :location, :struct do
|
|
295
|
+
field :lat, :double
|
|
296
|
+
field :lng, :double
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
# Lists
|
|
301
|
+
field :tags, :list, item: :string
|
|
302
|
+
field :scores, :list, item: :int32
|
|
303
|
+
|
|
304
|
+
# Maps (dictionaries)
|
|
305
|
+
field :metadata, :map, key: :string, value: :string
|
|
306
|
+
|
|
307
|
+
# Complex combinations
|
|
308
|
+
field :contacts, :list, item: :struct do
|
|
309
|
+
field :name, :string
|
|
310
|
+
field :email, :string
|
|
311
|
+
field :primary, :boolean
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
### Writing Complex Data
|
|
317
|
+
|
|
318
|
+
```ruby
|
|
319
|
+
data = [[
|
|
320
|
+
1, # id
|
|
321
|
+
"Alice Johnson", # name
|
|
322
|
+
{ # address
|
|
323
|
+
"street" => "123 Main St",
|
|
324
|
+
"city" => "Springfield",
|
|
325
|
+
"location" => {
|
|
326
|
+
"lat" => 40.7128,
|
|
327
|
+
"lng" => -74.0060
|
|
328
|
+
}
|
|
329
|
+
},
|
|
330
|
+
["ruby", "parquet", "data"], # tags
|
|
331
|
+
[85, 92, 88], # scores
|
|
332
|
+
{ "dept" => "Engineering" }, # metadata
|
|
333
|
+
[ # contacts
|
|
334
|
+
{ "name" => "Bob", "email" => "bob@example.com", "primary" => true },
|
|
335
|
+
{ "name" => "Carol", "email" => "carol@example.com", "primary" => false }
|
|
336
|
+
]
|
|
337
|
+
]]
|
|
338
|
+
|
|
339
|
+
Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## ⚠️ Important Limitations
|
|
343
|
+
|
|
344
|
+
### Timezone Handling in Parquet
|
|
345
|
+
|
|
346
|
+
The Parquet specification has a fundamental limitation with timezone storage. A timestamp column is stored in one of only two ways:
|
|
347
|
+
|
|
348
|
+
1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
|
|
349
|
+
2. **Local/unzoned**: Timestamps without timezone info are stored as-is
|
|
350
|
+
|
|
351
|
+
**The original timezone information is permanently lost.** This is not a limitation of this library but of the Parquet format itself.
|
|
352
|
+
|
|
353
|
+
```ruby
|
|
354
|
+
schema = Parquet::Schema.define do
|
|
355
|
+
# These BOTH store in UTC - timezone info is lost!
|
|
356
|
+
field :timestamp_utc, :timestamp_millis, timezone: "UTC"
|
|
357
|
+
field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00"
|
|
358
|
+
|
|
359
|
+
# This stores as local time (no timezone)
|
|
360
|
+
field :timestamp_local, :timestamp_millis
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
# If you need timezone preservation, store it separately:
|
|
364
|
+
schema = Parquet::Schema.define do
|
|
365
|
+
field :timestamp, :timestamp_millis, has_timezone: true # UTC storage
|
|
366
|
+
field :original_tz, :string # "America/New_York"
|
|
367
|
+
end
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
## Performance Tips
|
|
371
|
+
|
|
372
|
+
1. **Use column-wise reading** when you need only a few columns from wide tables
|
|
373
|
+
2. **Specify columns parameter** to avoid reading unnecessary data
|
|
374
|
+
3. **Choose appropriate batch sizes**:
|
|
375
|
+
- Larger batches = better throughput but more memory
|
|
376
|
+
- Smaller batches = less memory but more overhead
|
|
377
|
+
4. **Pre-sort data** by commonly filtered columns for better compression
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
## Memory Management
|
|
381
|
+
|
|
382
|
+
Writes are streamed: an Enumerator (or any Enumerable) passed to `write_rows`
|
|
383
|
+
is consumed in bounded slices rather than materialized up front, and completed
|
|
384
|
+
row groups are flushed to the destination while the input is still being
|
|
385
|
+
enumerated. Peak memory is bounded by `batch_size` and `flush_threshold`, not
|
|
386
|
+
by the total dataset size:
|
|
387
|
+
|
|
388
|
+
```ruby
|
|
389
|
+
Parquet.write_rows(huge_dataset.each,
|
|
390
|
+
schema: schema,
|
|
391
|
+
write_to: "output.parquet",
|
|
392
|
+
batch_size: 1000, # Rows buffered per write batch (also the
|
|
393
|
+
# slice size pulled from an Enumerator)
|
|
394
|
+
flush_threshold: 32 * 1024**2 # Flush a row group to the destination once
|
|
395
|
+
# ~32MB of raw row data is staged (default 100MB)
|
|
396
|
+
)
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
`flush_threshold` bounds both the raw bytes staged since the last flush and
|
|
400
|
+
the writer's encoded in-progress buffer, so row groups reach the destination
|
|
401
|
+
incrementally even when compression shrinks the encoded data dramatically.
|
|
402
|
+
`write_columns` flushes the same way after each batch of columns.
|
|
403
|
+
|
|
404
|
+
When `write_to:` is an IO object instead of a file path, output is staged in a
|
|
405
|
+
temporary file on disk (memory stays bounded) and copied to the IO after the
|
|
406
|
+
write completes, so the IO receives its bytes only at the end.
|
|
407
|
+
|
|
408
|
+
Write batch and sample sizes are bounded before buffer allocation. Very large
|
|
409
|
+
batch sizes are rejected, and wide schemas have a lower effective batch cap so
|
|
410
|
+
the writer cannot reserve unbounded per-column value slots.
|
|
411
|
+
|
|
412
|
+
## Architecture
|
|
413
|
+
|
|
414
|
+
This gem uses a modular architecture:
|
|
415
|
+
|
|
416
|
+
- **parquet-core**: Language-agnostic Rust core for Parquet operations
|
|
417
|
+
- **parquet-ruby-adapter**: Ruby-specific FFI adapter layer
|
|
418
|
+
- **parquet gem**: High-level Ruby API
|
|
419
|
+
|
|
420
|
+
For more detail, take a look at [ARCH.md](./ARCH.md).
|
|
421
|
+
|
|
422
|
+
## Contributing
|
|
423
|
+
|
|
424
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/njaremko/parquet-ruby.
|
|
425
|
+
|
|
426
|
+
## License
|
|
427
|
+
|
|
428
|
+
The gem is available as open source under the terms of the MIT License.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rake/testtask"
|
|
4
|
+
require "rb_sys/extensiontask"
|
|
5
|
+
|
|
6
|
+
task default: :test
|
|
7
|
+
|
|
8
|
+
GEMSPEC = Gem::Specification.load("parquet.gemspec")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
platforms = [
|
|
12
|
+
"x86_64-linux",
|
|
13
|
+
"x86_64-linux-musl",
|
|
14
|
+
"aarch64-linux",
|
|
15
|
+
"aarch64-linux-musl",
|
|
16
|
+
"x86_64-darwin",
|
|
17
|
+
"arm64-darwin"
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
|
|
21
|
+
ext.lib_dir = "lib/parquet"
|
|
22
|
+
ext.ext_dir = "ext/parquet"
|
|
23
|
+
ext.cross_compile = true
|
|
24
|
+
ext.cross_platform = platforms
|
|
25
|
+
ext.cross_compiling do |spec|
|
|
26
|
+
spec.dependencies.reject! { |dep| dep.name == "rb_sys" }
|
|
27
|
+
spec.files.reject! { |file| File.fnmatch?("ext/*", file, File::FNM_EXTGLOB) }
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
Rake::TestTask.new do |t|
|
|
32
|
+
t.deps << :compile
|
|
33
|
+
t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
|
|
34
|
+
t.libs << "lib"
|
|
35
|
+
t.libs << "test"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
task :release do
|
|
39
|
+
sh "bundle exec rake test"
|
|
40
|
+
sh "mkdir -p pkg"
|
|
41
|
+
sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
|
|
42
|
+
sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
|
|
43
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "parquet"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
|
|
6
|
+
[lib]
|
|
7
|
+
crate-type = ["cdylib"]
|
|
8
|
+
|
|
9
|
+
[build-dependencies]
|
|
10
|
+
rb-sys-env = "^0.2"
|
|
11
|
+
|
|
12
|
+
[dependencies]
|
|
13
|
+
ahash = "0.8"
|
|
14
|
+
arrow-array = "58.3.0"
|
|
15
|
+
arrow-buffer = "58.3.0"
|
|
16
|
+
arrow-ipc = { version = "58.3.0", features = ["lz4"] }
|
|
17
|
+
arrow-schema = "58.3.0"
|
|
18
|
+
bytes = "^1.9"
|
|
19
|
+
either = "1.9"
|
|
20
|
+
itertools = "^0.14"
|
|
21
|
+
jiff = "0.2"
|
|
22
|
+
magnus = { version = "0.8", features = ["rb-sys"] }
|
|
23
|
+
parquet = { version = "58.3.0", features = ["json"] }
|
|
24
|
+
parquet-ruby-adapter = { path = "../parquet-ruby-adapter" }
|
|
25
|
+
rand = "0.9"
|
|
26
|
+
rb-sys = "^0.9"
|
|
27
|
+
simdutf8 = "0.1.5"
|
|
28
|
+
tempfile = "^3.15"
|
|
29
|
+
thiserror = "2.0"
|
|
30
|
+
num = "0.4.3"
|
|
31
|
+
uuid = "1.16.0"
|
|
32
|
+
ordered-float = "5.0.0"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
|
36
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
|
37
|
+
|
|
38
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
|
39
|
+
mimalloc = { version = "0.1", default-features = false }
|