parquet 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +7 -3
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +113 -0
- data/ext/parquet/src/reader/mod.rs +27 -13
- data/ext/parquet/src/reader/parquet_column_reader.rs +38 -78
- data/ext/parquet/src/reader/parquet_row_reader.rs +42 -19
- data/ext/parquet/src/types/core_types.rs +57 -1
- data/ext/parquet/src/types/mod.rs +9 -2
- data/ext/parquet/src/types/parquet_value.rs +212 -36
- data/ext/parquet/src/types/record_types.rs +18 -15
- data/ext/parquet/src/types/schema_converter.rs +349 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +18 -8
- data/ext/parquet/src/types/type_conversion.rs +1119 -509
- data/ext/parquet/src/types/writer_types.rs +78 -107
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +837 -264
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +7 -2
@@ -0,0 +1,154 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Parquet
  # Schema definition for Parquet files.
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with +instance_eval+ on a fresh SchemaBuilder, so
    # +field+ can be called inside it without an explicit receiver.
    #
    # @yield DSL block evaluated in the context of a SchemaBuilder
    # @return [Hash] schema definition hash of the form
    #   `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string                # Default nullable: true
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #           key: :string,
    #           value: :string,
    #           value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard, instance_eval(&nil) raises an ArgumentError about
      # argument counts, which says nothing about the missing block.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods.
    class SchemaBuilder
      # @return [Array<Hash>] field definitions collected so far, in call order
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema.
      #
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @return [Array<Hash>] the builder's field list, including the new field
      # @raise [ArgumentError] if a :list lacks `item:`, a :map lacks `key:` or
      #   `value:`, or a nested struct/list item definition is invalid
      #
      # Additional keyword args:
      #   - `item:` if type == :list
      #   - `item_nullable:` controls nullability of list items (default: true)
      #   - `key:, value:` if type == :map
      #   - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
      #   - `format:` if you want to store some format string
      #
      # For :struct the block defines the sub-fields; for :list and :map the
      # block (if any) defines the item / value type when that is :struct or :list.
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Store a format only when the caller supplied one (even if it is nil)
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        case type
        when :struct
          # NOTE(review): unlike wrap_subtype, an empty struct is not rejected
          # here; confirm whether that difference is intentional.
          field_hash[:fields] = evaluate_fields(&block)
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          field_hash[:item] = wrap_subtype(item_type, nullable: nullable_option(kwargs, :item_nullable), &block)
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?

          # Only the value may be described by the block; keys never receive it.
          field_hash[:key] = wrap_subtype(key_type, nullable: nullable_option(kwargs, :key_nullable))
          field_hash[:value] = wrap_subtype(value_type, nullable: nullable_option(kwargs, :value_nullable), &block)
        end
        # Any other type is treated as a primitive and needs no extra keys.

        @fields << field_hash
      end

      # Build a standalone map node whose entries are a "key_value" struct.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] whether keys may be null (default: false)
      # @param value_nullable [Boolean] whether values may be null (default: true)
      # @param nullable [Boolean] whether the map itself may be null (default: true)
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... } }`
      # @raise [ArgumentError] if the key or value definition is invalid
      #   (see wrap_subtype)
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        # wrap_subtype names every node "item"; rename the two entries so the
        # key_value struct does not end up with two identically named fields.
        key = wrap_subtype(key_type, nullable: key_nullable).merge(name: "key")

        # wrap_subtype only consults the block for :struct / :list types, so it
        # can be forwarded unconditionally.
        value = wrap_subtype(value_type, nullable: value_nullable, &block).merge(name: "value")

        # Map is represented as a list of key/value pairs in Parquet
        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Evaluate an optional DSL block in a fresh builder and return its fields.
      #
      # @return [Array<Hash>] the fields defined by the block (empty without a block)
      def evaluate_fields(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block
        sub_builder.fields
      end

      # Read a `*_nullable` option: absent or nil means true, anything else is
      # coerced to a strict boolean.
      #
      # @param options [Hash] keyword options passed to #field
      # @param key [Symbol] option name, e.g. :item_nullable
      # @return [Boolean]
      def nullable_option(options, key)
        value = options[key]
        value.nil? ? true : !!value
      end

      # Turn an item/key/value type into a schema node named "item".
      #
      # If user said: field "something", :list, item: :struct do ... end
      # we recursively parse that sub-struct from the block. A :list item with a
      # block must define exactly one field named "item" describing the inner
      # list's element type. Every other type (including :list without a block)
      # becomes a plain `{ type:, nullable:, name: "item" }` node.
      #
      # @param t [Symbol] the sub-type
      # @param nullable [Boolean] nullability recorded on the node
      # @return [Hash] schema node
      # @raise [ArgumentError] for a struct with zero fields or a malformed
      #   nested list definition
      def wrap_subtype(t, nullable: true, &block)
        if t == :struct
          sub_fields = evaluate_fields(&block)

          # Validate that the struct has at least one field
          if sub_fields.empty?
            raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
          end

          { type: :struct, nullable: nullable, name: "item", fields: sub_fields }
        elsif t == :list && block
          # Handle nested lists by processing the block to define the item type
          sub_fields = evaluate_fields(&block)

          # We expect a single field named "item" that defines the inner list's item type
          unless sub_fields.length == 1 && sub_fields.first[:name] == "item"
            raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
          end

          { type: :list, nullable: nullable, name: "item", item: sub_fields.first }
        else
          # e.g. :int32 => { type: :int32, nullable: true, name: "item" }
          { type: t, nullable: nullable, name: "item" }
        end
      end
    end
  end
end
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-02-
|
11
|
+
date: 2025-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -60,6 +60,8 @@ files:
|
|
60
60
|
- ext/parquet/src/enumerator.rs
|
61
61
|
- ext/parquet/src/header_cache.rs
|
62
62
|
- ext/parquet/src/lib.rs
|
63
|
+
- ext/parquet/src/logger.rs
|
64
|
+
- ext/parquet/src/reader/common.rs
|
63
65
|
- ext/parquet/src/reader/mod.rs
|
64
66
|
- ext/parquet/src/reader/parquet_column_reader.rs
|
65
67
|
- ext/parquet/src/reader/parquet_row_reader.rs
|
@@ -68,6 +70,8 @@ files:
|
|
68
70
|
- ext/parquet/src/types/mod.rs
|
69
71
|
- ext/parquet/src/types/parquet_value.rs
|
70
72
|
- ext/parquet/src/types/record_types.rs
|
73
|
+
- ext/parquet/src/types/schema_converter.rs
|
74
|
+
- ext/parquet/src/types/schema_node.rs
|
71
75
|
- ext/parquet/src/types/timestamp.rs
|
72
76
|
- ext/parquet/src/types/type_conversion.rs
|
73
77
|
- ext/parquet/src/types/writer_types.rs
|
@@ -75,6 +79,7 @@ files:
|
|
75
79
|
- ext/parquet/src/writer/mod.rs
|
76
80
|
- lib/parquet.rb
|
77
81
|
- lib/parquet.rbi
|
82
|
+
- lib/parquet/schema.rb
|
78
83
|
- lib/parquet/version.rb
|
79
84
|
homepage: https://github.com/njaremko/parquet-ruby
|
80
85
|
licenses:
|