parquet 0.4.2-aarch64-linux-musl → 0.5.0-aarch64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 93a7c8eb35d3b40fda00e01e751308d98c97cf39ca7fa675e564acef4d68621e
4
- data.tar.gz: 9f84f828efd20915a234fc2fad226bc6f4a7681a6ce7925093f8977c6dd6269d
3
+ metadata.gz: f05e1f81e9db1603f9218c2603f20591520f1b63e0e4b2bbea0b9c0102bbb6d9
4
+ data.tar.gz: a011d79c0cf1e3b72091636f3a1fc00159c51340d6bad0ae60e4930e570f36f2
5
5
  SHA512:
6
- metadata.gz: c83dfd95360e534d1e46285681c9932338671a393684ccb888176f022cffbbaff440d8c3fc2d82daa6469acfdf7c3de3f0d6193231baa3fe56df64bfdc453cc9
7
- data.tar.gz: 10ec218eebec6f8ed30b50926ef581adc4e67c723ca5dd2d6a6fcfbb84703ce22e3837ce65734c1702b082beb10ca527800abd66141137e4b0ee3782c0c9513a
6
+ metadata.gz: c57dcb269043a0f09e1a3baabbd1ac8ba7df58bf1b5620238a13c108b7ccbb4098a89673920f37fd587ad8d36b0a68cd170c3b594a23081f6227293a5225e0c2
7
+ data.tar.gz: ed8c8d0426ceb74856f10539073e18d379ef07f2cfb9288e5e374833ad109b9381b3f5d254f851c94672ff902083863a8972f47b90e8e127172c7595099b6138
data/README.md CHANGED
@@ -194,4 +194,108 @@ The following data types are supported in the schema:
194
194
  - `date32`
195
195
  - `timestamp_millis`, `timestamp_micros`
196
196
 
197
- Note: Writing of List and Map types is not currently supported.
197
+ ### Schema DSL for Complex Data Types
198
+
199
+ In addition to the hash-based schema definition shown above, this library provides a more expressive DSL for defining complex schemas with nested structures:
200
+
201
+ ```ruby
202
+ require "parquet"
203
+
204
+ # Define a complex schema using the Schema DSL
205
+ schema = Parquet::Schema.define do
206
+ field :id, :int64, nullable: false # Required field
207
+ field :name, :string # Optional field (nullable: true is default)
208
+
209
+ # Nested struct
210
+ field :address, :struct do
211
+ field :street, :string
212
+ field :city, :string
213
+ field :zip, :string
214
+ field :coordinates, :struct do
215
+ field :latitude, :double
216
+ field :longitude, :double
217
+ end
218
+ end
219
+
220
+ # List of primitives
221
+ field :scores, :list, item: :float
222
+
223
+ # List of structs
224
+ field :contacts, :list, item: :struct do
225
+ field :name, :string
226
+ field :phone, :string
227
+ field :primary, :boolean
228
+ end
229
+
230
+ # Map with string values
231
+ field :metadata, :map, key: :string, value: :string
232
+
233
+ # Map with struct values
234
+ field :properties, :map, key: :string, value: :struct do
235
+ field :count, :int32
236
+ field :description, :string
237
+ end
238
+
239
+ # Nested lists
240
+ field :nested_lists, :list, item: :list do
241
+ field :item, :string # For nested lists, inner item must be named 'item'
242
+ end
243
+
244
+ # Map of lists
245
+ field :map_of_lists, :map, key: :string, value: :list do
246
+ field :item, :int32 # For list items in maps, item must be named 'item'
247
+ end
248
+ end
249
+
250
+ # Sample data with nested structures
251
+ data = [
252
+ [
253
+ 1, # id
254
+ "John Doe", # name
255
+ { # address (struct)
256
+ "street" => "123 Main St",
257
+ "city" => "Springfield",
258
+ "zip" => "12345",
259
+ "coordinates" => {
260
+ "latitude" => 37.7749,
261
+ "longitude" => -122.4194
262
+ }
263
+ },
264
+ [85.5, 92.0, 78.5], # scores (list of floats)
265
+ [ # contacts (list of structs)
266
+ { "name" => "Contact 1", "phone" => "555-1234", "primary" => true },
267
+ { "name" => "Contact 2", "phone" => "555-5678", "primary" => false }
268
+ ],
269
+ { "created" => "2023-01-01", "status" => "active" }, # metadata (map)
270
+ { # properties (map of structs)
271
+ "feature1" => { "count" => 5, "description" => "Main feature" },
272
+ "feature2" => { "count" => 3, "description" => "Secondary feature" }
273
+ },
274
+ [["a", "b"], ["c", "d", "e"]], # nested_lists
275
+ { # map_of_lists
276
+ "group1" => [1, 2, 3],
277
+ "group2" => [4, 5, 6]
278
+ }
279
+ ]
280
+ ]
281
+
282
+ # Write to a parquet file using the schema
283
+ Parquet.write_rows(data.each, schema: schema, write_to: "complex_data.parquet")
284
+
285
+ # Read back the data
286
+ Parquet.each_row("complex_data.parquet") do |row|
287
+ puts row.inspect
288
+ end
289
+ ```
290
+
291
+ The Schema DSL supports:
292
+
293
+ - **Primitive types**: All standard Parquet types (`int32`, `string`, etc.)
294
+ - **Complex types**: Structs, lists, and maps with arbitrary nesting
295
+ - **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
296
+ - **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
297
+ - **Map key/value nullability**: Control whether map keys or values can be null with `key_nullable: false/true` and `value_nullable: false/true`
298
+
299
+ Note: When using List and Map types, you need to provide at least:
300
+ - For lists: The `item:` parameter specifying the item type
301
+ - For maps: Both `key:` and `value:` parameters specifying key and value types
Binary file
Binary file
Binary file
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Parquet
4
+ # Schema definition for Parquet files
5
+ class Schema
6
+ # Define a new schema using the DSL
7
+ # @return [Hash] schema definition hash
8
+ #
9
+ # @example Define a schema with nullable and non-nullable fields
10
+ # Parquet::Schema.define do
11
+ # field :id, :int64, nullable: false # ID cannot be null
12
+ # field :name, :string # Default nullable: true
13
+ #
14
+ # # List with non-nullable items
15
+ # field :scores, :list, item: :float, item_nullable: false
16
+ #
17
+ # # Map with nullable values
18
+ # field :metadata, :map,
19
+ # key: :string,
20
+ # value: :string,
21
+ # value_nullable: true
22
+ #
23
+ # # Nested struct with non-nullable fields
24
+ # field :address, :struct, nullable: true do
25
+ # field :street, :string, nullable: false
26
+ # field :city, :string, nullable: false
27
+ # field :zip, :string, nullable: false
28
+ # end
29
+ # end
30
+ def self.define(&block)
31
+ builder = SchemaBuilder.new
32
+ builder.instance_eval(&block)
33
+
34
+ # Return a structured hash representing the schema
35
+ { type: :struct, fields: builder.fields }
36
+ end
37
+
38
+ # Internal builder class that provides the DSL methods
39
+ class SchemaBuilder
40
+ attr_reader :fields
41
+
42
+ def initialize
43
+ @fields = []
44
+ end
45
+
46
+ # Define a field in the schema
47
+ # @param name [String, Symbol] field name
48
+ # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
49
+ # @param nullable [Boolean] whether the field can be null (default: true)
50
+ # @param kwargs [Hash] additional options depending on type
51
+ #
52
+ # Additional keyword args:
53
+ # - `item:` if type == :list
54
+ # - `item_nullable:` controls nullability of list items (default: true)
55
+ # - `key:, value:` if type == :map
56
+ # - `key_nullable:, value_nullable:` control nullability of map keys/values (default: true)
57
+ # - `format:` if you want to store some format string
58
+ # - `nullable:` defaults to true if not specified
59
+ def field(name, type, nullable: true, **kwargs, &block)
60
+ field_hash = { name: name.to_s, type: type, nullable: !!nullable }
61
+
62
+ # Possibly store a format if provided
63
+ field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
64
+
65
+ case type
66
+ when :struct
67
+ # We'll parse subfields from the block
68
+ sub_builder = SchemaBuilder.new
69
+ sub_builder.instance_eval(&block) if block
70
+ field_hash[:fields] = sub_builder.fields
71
+ when :list
72
+ item_type = kwargs[:item]
73
+ raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
74
+ # Pass item_nullable if provided, otherwise use true as default
75
+ item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]
76
+ field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
77
+ when :map
78
+ # user must specify key:, value:
79
+ key_type = kwargs[:key]
80
+ value_type = kwargs[:value]
81
+ raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
82
+ # Pass key_nullable and value_nullable if provided, otherwise use true as default
83
+ key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
84
+ value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
85
+ field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
86
+ field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
87
+ else
88
+ # primitive type: :int32, :int64, :string, etc.
89
+ # do nothing else special
90
+ end
91
+
92
+ @fields << field_hash
93
+ end
94
+
95
+ def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
96
+ # Wrap the key type (maps typically use non-nullable keys)
97
+ key = wrap_subtype(key_type, nullable: key_nullable)
98
+
99
+ # Handle the case where value_type is a complex type (:struct or :list) and a block is provided
100
+ value =
101
+ if (value_type == :struct || value_type == :list) && block
102
+ wrap_subtype(value_type, nullable: value_nullable, &block)
103
+ else
104
+ wrap_subtype(value_type, nullable: value_nullable)
105
+ end
106
+
107
+ # Map is represented as a list of key/value pairs in Parquet
108
+ {
109
+ type: :map,
110
+ nullable: nullable,
111
+ item: {
112
+ type: :struct,
113
+ nullable: false,
114
+ name: "key_value",
115
+ fields: [key, value]
116
+ }
117
+ }
118
+ end
119
+
120
+ private
121
+
122
+ # If the user wrote: field "something", :list, item: :struct do ... end
123
+ # we want to recursively parse that sub-struct from the block.
124
+ # wrap_subtype builds the type hash for such a list item, map key, or map value.
125
+ def wrap_subtype(t, nullable: true, &block)
126
+ if t == :struct
127
+ sub_builder = SchemaBuilder.new
128
+ sub_builder.instance_eval(&block) if block
129
+
130
+ # Validate that the struct has at least one field
131
+ if sub_builder.fields.empty?
132
+ raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
133
+ end
134
+
135
+ { type: :struct, nullable: nullable, name: "item", fields: sub_builder.fields }
136
+ elsif t == :list && block
137
+ # Handle nested lists by processing the block to define the item type
138
+ sub_builder = SchemaBuilder.new
139
+ sub_builder.instance_eval(&block) if block
140
+
141
+ # We expect a single field named "item" that defines the inner list's item type
142
+ if sub_builder.fields.empty? || sub_builder.fields.length > 1 || sub_builder.fields[0][:name] != "item"
143
+ raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
144
+ end
145
+
146
+ { type: :list, nullable: nullable, name: "item", item: sub_builder.fields[0] }
147
+ else
148
+ # e.g. :int32 => { type: :int32, nullable: <nullable>, name: "item" }
149
+ { type: t, nullable: nullable, name: "item" }
150
+ end
151
+ end
152
+ end
153
+ end
154
+ end
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/parquet.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require_relative "parquet/version"
2
+ require_relative "parquet/schema"
2
3
 
3
4
  begin
4
5
  require "parquet/#{RUBY_VERSION.to_f}/parquet"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  platform: aarch64-linux-musl
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-21 00:00:00.000000000 Z
11
+ date: 2025-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -42,6 +42,7 @@ files:
42
42
  - lib/parquet/3.2/parquet.so
43
43
  - lib/parquet/3.3/parquet.so
44
44
  - lib/parquet/3.4/parquet.so
45
+ - lib/parquet/schema.rb
45
46
  - lib/parquet/version.rb
46
47
  homepage: https://github.com/njaremko/parquet-ruby
47
48
  licenses: