parquet 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
module Parquet
  # Schema definition for Parquet files.
  #
  # Provides a small block-based DSL (see {Schema.define}) that produces a
  # plain nested Hash describing the schema.
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with `instance_eval` on a fresh SchemaBuilder, so
    # bare `field` calls inside the block are sent to that builder.
    #
    # @yield DSL block containing `field` declarations
    # @return [Hash] schema definition hash of the form
    #   `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given, or if any nested
    #   declaration is invalid (see {SchemaBuilder#field})
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string                # Default nullable: true
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #           key: :string,
    #           value: :string,
    #           value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard, instance_eval(&nil) fails with an opaque
      # "wrong number of arguments" ArgumentError.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods
    class SchemaBuilder
      # @return [Array<Hash>] field definitions collected so far, in declaration order
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema and append it to {#fields}.
      #
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @yield for :struct fields, a block declaring the sub-fields; for :list
      #   and :map fields, a block describing a :struct or nested :list
      #   item/value type
      # @return [Array<Hash>] the builder's field list after appending
      # @raise [ArgumentError] if a :list lacks `item:`, a :map lacks `key:` or
      #   `value:`, a struct ends up with zero fields, or a nested list block
      #   does not declare exactly one field named "item"
      #
      # Additional keyword args:
      #   - `item:` if type == :list
      #   - `item_nullable:` controls nullability of list items (default: true)
      #   - `key:, value:` if type == :map
      #   - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
      #   - `format:` if you want to store some format string
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Store a format only when the caller supplied one (even if it is nil).
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        case type
        when :struct
          # Sub-fields come from the block; an empty struct is rejected here
          # exactly as it is for struct items/values in wrap_subtype.
          field_hash[:fields] = struct_fields_from(&block)
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          field_hash[:item] =
            wrap_subtype(item_type, nullable: nullable_option(kwargs[:item_nullable]), &block)
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          if key_type.nil? || value_type.nil?
            raise ArgumentError, "map field `#{name}` requires `key:` and `value:`"
          end

          field_hash[:key] = wrap_subtype(key_type, nullable: nullable_option(kwargs[:key_nullable]))
          # Only the value side receives the block, so only values can be
          # block-defined structs or nested lists.
          field_hash[:value] =
            wrap_subtype(value_type, nullable: nullable_option(kwargs[:value_nullable]), &block)
        end
        # Any other type is treated as a primitive (:int32, :int64, :string, ...)
        # and needs nothing beyond name/type/nullable/format.

        @fields << field_hash
      end

      # Build a standalone map description as a list-like wrapper around a
      # non-nullable "key_value" struct holding the key and value entries.
      #
      # Note that, unlike `field ..., :map`, keys default to non-nullable here.
      #
      # NOTE(review): both entries come from wrap_subtype and are therefore
      # named "item", so the "key_value" struct has two fields with the same
      # name. Confirm with the consumer of this hash whether they should be
      # named "key" and "value" instead.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] whether keys may be null (default: false)
      # @param value_nullable [Boolean] whether values may be null (default: true)
      # @param nullable [Boolean] whether the map itself may be null (default: true)
      # @yield block describing a :struct or nested :list value type
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... } }`
      # @raise [ArgumentError] under the same conditions as wrap_subtype
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        key = wrap_subtype(key_type, nullable: key_nullable)

        # wrap_subtype only evaluates the block for :struct and :list types, so
        # forwarding it unconditionally is equivalent to branching on the type.
        value = wrap_subtype(value_type, nullable: value_nullable, &block)

        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Interpret an optional `*_nullable:` option: nil (not given) means
      # nullable, anything else is coerced to a strict boolean.
      def nullable_option(value)
        value.nil? || !!value
      end

      # Evaluate a struct-defining block on a fresh builder and return its
      # fields, rejecting structs that end up with no fields.
      def struct_fields_from(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block

        if sub_builder.fields.empty?
          raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
        end

        sub_builder.fields
      end

      # Describe the item/key/value type nested inside a list or map.
      #
      # If user said: field "something", :list, item: :struct do ... end
      # the sub-struct is parsed recursively from the block. A nested list
      # (`item: :list` plus a block) must declare exactly one field named
      # "item", which becomes the inner list's item definition.
      #
      # @param t [Symbol] nested type
      # @param nullable [Boolean] whether the nested value may be null
      # @return [Hash] nested type description, always named "item"
      # @raise [ArgumentError] for an empty struct or a malformed nested list block
      def wrap_subtype(t, nullable: true, &block)
        if t == :struct
          { type: :struct, nullable: nullable, name: "item", fields: struct_fields_from(&block) }
        elsif t == :list && block
          sub_builder = SchemaBuilder.new
          sub_builder.instance_eval(&block)
          inner = sub_builder.fields

          unless inner.length == 1 && inner.first[:name] == "item"
            raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
          end

          { type: :list, nullable: nullable, name: "item", item: inner.first }
        else
          # e.g. :int32 => { type: :int32, nullable: true, name: "item" }
          { type: t, nullable: nullable, name: "item" }
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/parquet.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require_relative "parquet/version"
2
+ require_relative "parquet/schema"
2
3
 
3
4
  begin
4
5
  require "parquet/#{RUBY_VERSION.to_f}/parquet"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-02-21 00:00:00.000000000 Z
11
+ date: 2025-02-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -60,6 +60,8 @@ files:
60
60
  - ext/parquet/src/enumerator.rs
61
61
  - ext/parquet/src/header_cache.rs
62
62
  - ext/parquet/src/lib.rs
63
+ - ext/parquet/src/logger.rs
64
+ - ext/parquet/src/reader/common.rs
63
65
  - ext/parquet/src/reader/mod.rs
64
66
  - ext/parquet/src/reader/parquet_column_reader.rs
65
67
  - ext/parquet/src/reader/parquet_row_reader.rs
@@ -68,6 +70,8 @@ files:
68
70
  - ext/parquet/src/types/mod.rs
69
71
  - ext/parquet/src/types/parquet_value.rs
70
72
  - ext/parquet/src/types/record_types.rs
73
+ - ext/parquet/src/types/schema_converter.rs
74
+ - ext/parquet/src/types/schema_node.rs
71
75
  - ext/parquet/src/types/timestamp.rs
72
76
  - ext/parquet/src/types/type_conversion.rs
73
77
  - ext/parquet/src/types/writer_types.rs
@@ -75,6 +79,7 @@ files:
75
79
  - ext/parquet/src/writer/mod.rs
76
80
  - lib/parquet.rb
77
81
  - lib/parquet.rbi
82
+ - lib/parquet/schema.rb
78
83
  - lib/parquet/version.rb
79
84
  homepage: https://github.com/njaremko/parquet-ruby
80
85
  licenses: