parquet 0.4.2-x86_64-linux-musl → 0.5.1-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +105 -1
- data/lib/parquet/3.2/parquet.so +0 -0
- data/lib/parquet/3.3/parquet.so +0 -0
- data/lib/parquet/3.4/parquet.so +0 -0
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d6605ede3c285ff55f311a9aabf082324fb0c55bcd251561c8d384116ae6e84
|
4
|
+
data.tar.gz: e788dd632b8cacfb6caa25f4c248d33e9aafdeece5ea44f3f783a77edbbdd9c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5fffd806cb18001d4e5bfa39cb4edd5ec19fac0f87cb01782630bccc4929c9a96d4f85f15598597040930e9facc065b238470db5862518f5cf330944598ac842
|
7
|
+
data.tar.gz: c8031944b16aea4c0a789a9741fc15ee612424350e00b7cedfba2dfa8095e3f0aa1bfd8b995ddf54a46a68ba474ad101decb410c241f5a862e1e3706ee09df42
|
data/README.md
CHANGED
@@ -194,4 +194,108 @@ The following data types are supported in the schema:
|
|
194
194
|
- `date32`
|
195
195
|
- `timestamp_millis`, `timestamp_micros`
|
196
196
|
|
197
|
-
|
197
|
+
### Schema DSL for Complex Data Types
|
198
|
+
|
199
|
+
In addition to the hash-based schema definition shown above, this library provides a more expressive DSL for defining complex schemas with nested structures:
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
require "parquet"
|
203
|
+
|
204
|
+
# Define a complex schema using the Schema DSL
|
205
|
+
schema = Parquet::Schema.define do
|
206
|
+
field :id, :int64, nullable: false # Required field
|
207
|
+
field :name, :string # Optional field (nullable: true is default)
|
208
|
+
|
209
|
+
# Nested struct
|
210
|
+
field :address, :struct do
|
211
|
+
field :street, :string
|
212
|
+
field :city, :string
|
213
|
+
field :zip, :string
|
214
|
+
field :coordinates, :struct do
|
215
|
+
field :latitude, :double
|
216
|
+
field :longitude, :double
|
217
|
+
end
|
218
|
+
end
|
219
|
+
|
220
|
+
# List of primitives
|
221
|
+
field :scores, :list, item: :float
|
222
|
+
|
223
|
+
# List of structs
|
224
|
+
field :contacts, :list, item: :struct do
|
225
|
+
field :name, :string
|
226
|
+
field :phone, :string
|
227
|
+
field :primary, :boolean
|
228
|
+
end
|
229
|
+
|
230
|
+
# Map with string values
|
231
|
+
field :metadata, :map, key: :string, value: :string
|
232
|
+
|
233
|
+
# Map with struct values
|
234
|
+
field :properties, :map, key: :string, value: :struct do
|
235
|
+
field :count, :int32
|
236
|
+
field :description, :string
|
237
|
+
end
|
238
|
+
|
239
|
+
# Nested lists
|
240
|
+
field :nested_lists, :list, item: :list do
|
241
|
+
field :item, :string # For nested lists, inner item must be named 'item'
|
242
|
+
end
|
243
|
+
|
244
|
+
# Map of lists
|
245
|
+
field :map_of_lists, :map, key: :string, value: :list do
|
246
|
+
field :item, :int32 # For list items in maps, item must be named 'item'
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
250
|
+
# Sample data with nested structures
|
251
|
+
data = [
|
252
|
+
[
|
253
|
+
1, # id
|
254
|
+
"John Doe", # name
|
255
|
+
{ # address (struct)
|
256
|
+
"street" => "123 Main St",
|
257
|
+
"city" => "Springfield",
|
258
|
+
"zip" => "12345",
|
259
|
+
"coordinates" => {
|
260
|
+
"latitude" => 37.7749,
|
261
|
+
"longitude" => -122.4194
|
262
|
+
}
|
263
|
+
},
|
264
|
+
[85.5, 92.0, 78.5], # scores (list of floats)
|
265
|
+
[ # contacts (list of structs)
|
266
|
+
{ "name" => "Contact 1", "phone" => "555-1234", "primary" => true },
|
267
|
+
{ "name" => "Contact 2", "phone" => "555-5678", "primary" => false }
|
268
|
+
],
|
269
|
+
{ "created" => "2023-01-01", "status" => "active" }, # metadata (map)
|
270
|
+
{ # properties (map of structs)
|
271
|
+
"feature1" => { "count" => 5, "description" => "Main feature" },
|
272
|
+
"feature2" => { "count" => 3, "description" => "Secondary feature" }
|
273
|
+
},
|
274
|
+
[["a", "b"], ["c", "d", "e"]], # nested_lists
|
275
|
+
{ # map_of_lists
|
276
|
+
"group1" => [1, 2, 3],
|
277
|
+
"group2" => [4, 5, 6]
|
278
|
+
}
|
279
|
+
]
|
280
|
+
]
|
281
|
+
|
282
|
+
# Write to a parquet file using the schema
|
283
|
+
Parquet.write_rows(data.each, schema: schema, write_to: "complex_data.parquet")
|
284
|
+
|
285
|
+
# Read back the data
|
286
|
+
Parquet.each_row("complex_data.parquet") do |row|
|
287
|
+
puts row.inspect
|
288
|
+
end
|
289
|
+
```
|
290
|
+
|
291
|
+
The Schema DSL supports:
|
292
|
+
|
293
|
+
- **Primitive types**: All standard Parquet types (`int32`, `string`, etc.)
|
294
|
+
- **Complex types**: Structs, lists, and maps with arbitrary nesting
|
295
|
+
- **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
|
296
|
+
- **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
|
297
|
+
- **Map key/value nullability**: Control whether map keys or values can be null with `key_nullable: false/true` and `value_nullable: false/true`
|
298
|
+
|
299
|
+
Note: When using List and Map types, you need to provide at least:
|
300
|
+
- For lists: The `item:` parameter specifying the item type
|
301
|
+
- For maps: Both `key:` and `value:` parameters specifying key and value types
|
data/lib/parquet/3.2/parquet.so
CHANGED
Binary file
|
data/lib/parquet/3.3/parquet.so
CHANGED
Binary file
|
data/lib/parquet/3.4/parquet.so
CHANGED
Binary file
|
@@ -0,0 +1,154 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parquet
  # Schema definition for Parquet files.
  #
  # Exposes a block-based DSL (see {Schema.define}) that builds the nested
  # hash describing a schema: every column is a hash with at least `:name`,
  # `:type` and `:nullable`, and complex types carry extra keys
  # (`:fields`, `:item`, `:key`, `:value`).
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with `instance_eval` against a fresh
    # {SchemaBuilder}, so bare `field` calls inside it register top-level
    # columns.
    #
    # @yield DSL block evaluated in the context of a {SchemaBuilder}
    # @return [Hash] schema definition hash of the form
    #   `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string  # Default nullable: true
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #           key: :string,
    #           value: :string,
    #           value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard `instance_eval(&nil)` raises an arity error that
      # says nothing about the missing block.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods
    class SchemaBuilder
      # @return [Array<Hash>] field definitions collected so far, in declaration order
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @yield for :struct fields (and for :list/:map whose item/value type is
      #   :struct or :list) a nested DSL block describing the inner type
      # @return [Array<Hash>] the builder's field list, including the new field
      # @raise [ArgumentError] if a :list lacks `item:`, a :map lacks `key:` or
      #   `value:`, a struct declares no fields, or a nested list does not
      #   declare exactly one field named `item`
      #
      # Additional keyword args:
      #  - `item:` if type == :list
      #  - `item_nullable:` controls nullability of list items (default: true)
      #  - `key:, value:` if type == :map
      #  - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
      #  - `format:` if you want to store some format string
      #  - `nullable:` default to true if not specified
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Possibly store a format if provided
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        case type
        when :struct
          # Sub-fields come from the block; an empty struct is rejected here
          # exactly as it is for struct items and struct map values.
          field_hash[:fields] = struct_fields(&block)
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          field_hash[:item] = wrap_subtype(item_type, nullable: nullable_option(kwargs, :item_nullable), &block)
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?

          # The block, if any, describes the value type only.
          field_hash[:key] = wrap_subtype(key_type, nullable: nullable_option(kwargs, :key_nullable))
          field_hash[:value] = wrap_subtype(value_type, nullable: nullable_option(kwargs, :value_nullable), &block)
        end
        # Any other type is treated as a primitive and needs no extra keys.

        @fields << field_hash
      end

      # Build a standalone map description shaped as a `key_value` struct
      # holding a `key` field and a `value` field.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] whether keys may be null (default: false)
      # @param value_nullable [Boolean] whether values may be null (default: true)
      # @param nullable [Boolean] whether the map itself may be null (default: true)
      # @yield nested DSL block describing the value when it is :struct or :list
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, name: "key_value", ... } }`
      # @raise [ArgumentError] under the same conditions as nested types in {#field}
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        # wrap_subtype labels everything "item"; the two entries of a key_value
        # struct need distinct names, so relabel them.
        key = wrap_subtype(key_type, nullable: key_nullable).merge(name: "key")
        value = wrap_subtype(value_type, nullable: value_nullable, &block).merge(name: "value")

        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Describe a nested type (list item, map key or map value). The result is
      # always named "item"; :struct and :list read their inner definition from
      # the block, every other type is passed through as-is.
      def wrap_subtype(t, nullable: true, &block)
        case t
        when :struct
          { type: :struct, nullable: nullable, name: "item", fields: struct_fields(&block) }
        when :list
          { type: :list, nullable: nullable, name: "item", item: nested_list_item(&block) }
        else
          # e.g. :int32 => { type: :int32, nullable: true, name: "item" }
          { type: t, nullable: nullable, name: "item" }
        end
      end

      # Evaluate the block in a fresh builder and return its fields, rejecting
      # a struct that ends up with no fields (including when no block is given).
      def struct_fields(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block

        if sub_builder.fields.empty?
          raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
        end

        sub_builder.fields
      end

      # Evaluate the block in a fresh builder and return the single field named
      # "item" that describes the inner list's element type. A missing block
      # leaves the builder empty and is rejected by the same check.
      def nested_list_item(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block

        inner = sub_builder.fields
        unless inner.length == 1 && inner.first[:name] == "item"
          raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
        end

        inner.first
      end

      # Read a `*_nullable` option: absent or nil means true, anything else is
      # coerced to a strict boolean.
      def nullable_option(options, key)
        setting = options[key]
        setting.nil? ? true : !!setting
      end
    end
  end
end
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.5.1
|
5
5
|
platform: x86_64-linux-musl
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake-compiler
|
@@ -42,6 +42,7 @@ files:
|
|
42
42
|
- lib/parquet/3.2/parquet.so
|
43
43
|
- lib/parquet/3.3/parquet.so
|
44
44
|
- lib/parquet/3.4/parquet.so
|
45
|
+
- lib/parquet/schema.rb
|
45
46
|
- lib/parquet/version.rb
|
46
47
|
homepage: https://github.com/njaremko/parquet-ruby
|
47
48
|
licenses:
|