parquet 0.5.13-aarch64-linux-musl → 0.6.0-aarch64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/lib/parquet/3.2/parquet.so +0 -0
- data/lib/parquet/3.3/parquet.so +0 -0
- data/lib/parquet/3.4/parquet.so +0 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef860225882188543417167eb81cf2d330d1a13e04c4d0bc139775437bacea2b
|
4
|
+
data.tar.gz: 7040158d3e2541e0d807b082d3fd3f84db7188bc5692c1271e27af5a2cf79c94
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c8f29b9753dcb9d1eb0ccb36974337108237c8a18b33e835476f87071de99777792c74da683adeed7e04818737c62c8df98aea1bea64b39355967bcc0cfb688
|
7
|
+
data.tar.gz: 2ffdcb881ec52f38b0ea104baae9292c1c93a03282dce46fd45e6d2075f8b257ed13e783f5b8b227629b753b0e3929504aff0b4369685f32452262078a692814
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
|
4
4
|
|
5
|
-
This project is a Ruby library wrapping the [parquet
|
5
|
+
This project is a Ruby library wrapping the [`parquet`](https://github.com/apache/arrow-rs/tree/main/parquet) rust crate.
|
6
6
|
|
7
7
|
## Usage
|
8
8
|
|
9
|
-
This library provides high-level bindings to parquet
|
9
|
+
This library provides high-level bindings to `parquet` with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
10
10
|
|
11
11
|
### Metadata
|
12
12
|
|
@@ -264,9 +264,100 @@ The following data types are supported in the schema:
|
|
264
264
|
- `binary`
|
265
265
|
- `boolean`
|
266
266
|
- `date32`
|
267
|
-
- `timestamp_millis`, `timestamp_micros`
|
267
|
+
- `timestamp_millis`, `timestamp_micros`, `timestamp_second`, `timestamp_nanos`
|
268
268
|
- `time_millis`, `time_micros`
|
269
269
|
|
270
|
+
### Timestamp Timezone Handling
|
271
|
+
|
272
|
+
**CRITICAL PARQUET SPECIFICATION LIMITATION**: The Apache Parquet format specification only supports two types of timestamps:
|
273
|
+
1. **UTC-normalized timestamps** (when ANY timezone is specified) - `isAdjustedToUTC = true`
|
274
|
+
2. **Local/unzoned timestamps** (when NO timezone is specified) - `isAdjustedToUTC = false`
|
275
|
+
|
276
|
+
This means that specific timezone offsets like "+09:00" or "America/New_York" CANNOT be preserved in Parquet files. This is not a limitation of this Ruby library, but of the Parquet format itself.
|
277
|
+
|
278
|
+
**When Writing:**
|
279
|
+
- If the schema specifies ANY timezone (whether it's "UTC", "+09:00", "America/New_York", etc.):
|
280
|
+
- Time values are converted to UTC before storing
|
281
|
+
- The file metadata sets `isAdjustedToUTC = true`
|
282
|
+
- The original timezone information is LOST
|
283
|
+
- If the schema doesn't specify a timezone:
|
284
|
+
- Timestamps are stored as local/unzoned time (no conversion)
|
285
|
+
- The file metadata sets `isAdjustedToUTC = false`
|
286
|
+
- These represent "wall clock" times without timezone context
|
287
|
+
|
288
|
+
**When Reading:**
|
289
|
+
- If the Parquet file has `isAdjustedToUTC = true` (ANY timezone was specified during writing):
|
290
|
+
- Time objects are returned in UTC
|
291
|
+
- The original timezone (e.g., "+09:00") is NOT recoverable
|
292
|
+
- If the file has `isAdjustedToUTC = false` (NO timezone was specified):
|
293
|
+
- Time objects are returned as local time in your system's timezone
|
294
|
+
- These are "wall clock" times without timezone information
|
295
|
+
|
296
|
+
```ruby
|
297
|
+
# Preferred approach: use has_timezone to be explicit about UTC vs local storage
|
298
|
+
schema = Parquet::Schema.define do
|
299
|
+
field :timestamp_utc, :timestamp_millis, has_timezone: true # Stored as UTC (default)
|
300
|
+
field :timestamp_local, :timestamp_millis, has_timezone: false # Stored as local/unzoned
|
301
|
+
field :timestamp_default, :timestamp_millis # Default: UTC storage
|
302
|
+
end
|
303
|
+
|
304
|
+
# Legacy approach still supported (any timezone value means UTC storage)
|
305
|
+
schema_legacy = Parquet::Schema.define do
|
306
|
+
field :timestamp_utc, :timestamp_millis, timezone: "UTC" # Stored as UTC
|
307
|
+
field :timestamp_tokyo, :timestamp_millis, timezone: "+09:00" # Also stored as UTC!
|
308
|
+
field :timestamp_local, :timestamp_millis # No timezone given - defaults to UTC storage; pass has_timezone: false for local
|
309
|
+
end
|
310
|
+
|
311
|
+
# Time values will be converted based on schema
|
312
|
+
rows = [
|
313
|
+
[
|
314
|
+
Time.new(2024, 1, 1, 12, 0, 0, "+03:00"), # Converted to UTC if has_timezone: true
|
315
|
+
Time.new(2024, 1, 1, 12, 0, 0, "-05:00"), # Kept as local if has_timezone: false
|
316
|
+
Time.new(2024, 1, 1, 12, 0, 0) # Converted to UTC (default when has_timezone is omitted)
|
317
|
+
]
|
318
|
+
]
|
319
|
+
|
320
|
+
Parquet.write_rows(rows.each, schema: schema, write_to: "timestamps.parquet")
|
321
|
+
|
322
|
+
# Reading back - timezone presence determines UTC vs local
|
323
|
+
Parquet.each_row("timestamps.parquet") do |row|
|
324
|
+
# row["timestamp_utc"] => Time object in UTC
|
325
|
+
# row["timestamp_local"] => Time object in local timezone
|
326
|
+
# row["timestamp_default"] => Time object in UTC (default storage is UTC)
|
327
|
+
end
|
328
|
+
|
329
|
+
# If you need to preserve specific timezone information, store it separately:
|
330
|
+
schema_with_tz = Parquet::Schema.define do
|
331
|
+
field :timestamp, :timestamp_millis, has_timezone: true # Store as UTC
|
332
|
+
field :original_timezone, :string # Store timezone as string
|
333
|
+
end
|
334
|
+
```
|
335
|
+
|
336
|
+
## Architecture
|
337
|
+
|
338
|
+
This library uses a modular, trait-based architecture that separates language-agnostic Parquet operations from Ruby-specific bindings:
|
339
|
+
|
340
|
+
- **parquet-core**: Language-agnostic core functionality for Parquet file operations
|
341
|
+
- Pure Rust implementation without Ruby dependencies
|
342
|
+
- Traits for customizable I/O operations (`ChunkReader`) and value conversion (`ValueConverter`)
|
343
|
+
- Efficient Arrow-based reader and writer implementations
|
344
|
+
|
345
|
+
- **parquet-ruby-adapter**: Ruby-specific adapter layer
|
346
|
+
- Implements core traits for Ruby integration
|
347
|
+
- Handles Ruby value conversion through the `ValueConverter` trait
|
348
|
+
- Manages Ruby I/O objects through the `ChunkReader` trait
|
349
|
+
|
350
|
+
- **parquet gem**: Ruby FFI bindings
|
351
|
+
- Provides high-level Ruby API
|
352
|
+
- Manages memory safety between Ruby and Rust
|
353
|
+
- Supports both file-based and IO-based operations
|
354
|
+
|
355
|
+
This architecture enables:
|
356
|
+
- Clear separation of concerns between core functionality and language bindings
|
357
|
+
- Easy testing of core logic without Ruby dependencies
|
358
|
+
- Potential reuse of core functionality for other language bindings
|
359
|
+
- Type-safe interfaces through Rust's trait system
|
360
|
+
|
270
361
|
### Schema DSL for Complex Data Types
|
271
362
|
|
272
363
|
In addition to the hash-based schema definition shown above, this library provides a more expressive DSL for defining complex schemas with nested structures:
|
data/lib/parquet/3.2/parquet.so
CHANGED
Binary file
|
data/lib/parquet/3.3/parquet.so
CHANGED
Binary file
|
data/lib/parquet/3.4/parquet.so
CHANGED
Binary file
|
data/lib/parquet/schema.rb
CHANGED
@@ -59,12 +59,31 @@ module Parquet
|
|
59
59
|
# - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
|
60
60
|
# - `format:` if you want to store some format string
|
61
61
|
# - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
|
62
|
+
# - `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
|
63
|
+
# - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
|
62
64
|
# - `nullable:` default to true if not specified
|
63
65
|
def field(name, type, nullable: true, **kwargs, &block)
|
64
66
|
field_hash = { name: name.to_s, type: type, nullable: !!nullable }
|
65
67
|
|
66
68
|
# Possibly store a format if provided
|
67
69
|
field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
|
70
|
+
|
71
|
+
# Handle timezone for timestamp types
|
72
|
+
if [:timestamp_second, :timestamp_millis, :timestamp_micros, :timestamp_nanos].include?(type)
|
73
|
+
# Support new has_timezone parameter (preferred)
|
74
|
+
if kwargs.key?(:has_timezone)
|
75
|
+
# If has_timezone is true, store "UTC" to indicate timezone presence
|
76
|
+
# If explicitly false, don't store timezone (indicates local/unzoned)
|
77
|
+
field_hash[:timezone] = "UTC" if kwargs[:has_timezone]
|
78
|
+
elsif kwargs.key?(:timezone)
|
79
|
+
# Legacy support: any timezone value means UTC storage
|
80
|
+
# Store "UTC" regardless of the actual value to make it clear
|
81
|
+
field_hash[:timezone] = "UTC"
|
82
|
+
else
|
83
|
+
# Default behavior when neither parameter is specified: UTC storage
|
84
|
+
field_hash[:timezone] = "UTC"
|
85
|
+
end
|
86
|
+
end
|
68
87
|
|
69
88
|
case type
|
70
89
|
when :struct
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: aarch64-linux-musl
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-07-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake-compiler
|