parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Parquet
  # Schema definition for Parquet files.
  #
  # Exposes a small DSL (see Schema.define) that builds the nested Hash
  # description of a schema out of `field` declarations.
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with `instance_eval` on a fresh SchemaBuilder, so
    # `field` is called without a receiver inside it.
    #
    # @yield block evaluated in the context of a SchemaBuilder
    # @return [Hash] schema definition hash of the form
    #   `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given or a field declaration is invalid
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string                # Default nullable: true
    #
    #     # Decimal field with precision and scale
    #     field :price, :decimal, precision: 10, scale: 2
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #           key: :string,
    #           value: :string,
    #           value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard `instance_eval` raises an ArgumentError about its own
      # arity, which says nothing useful to the caller.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods
    class SchemaBuilder
      # Field types that receive the `:timezone` marker.
      TIMESTAMP_TYPES = %i[timestamp_second timestamp_millis timestamp_micros timestamp_nanos].freeze

      # Defaults applied when a decimal is declared without precision and/or scale.
      DEFAULT_DECIMAL_PRECISION = 38
      DEFAULT_DECIMAL_SCALE = 0

      # @return [Array<Hash>] field definitions collected so far, in declaration order
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @return [Array<Hash>] the builder's field list, including the new field
      # @raise [ArgumentError] when a list has no `item:`, a map has no `key:`/`value:`,
      #   a map is declared with `key_nullable: true`, or a struct has no sub-fields
      #
      # Additional keyword args:
      #   - `item:` if type == :list
      #   - `item_nullable:` controls nullability of list items (default: true)
      #   - `key:, value:` if type == :map
      #   - `value_nullable:` controls nullability of map values (default: true)
      #   - `key_nullable:` map keys are always non-nullable; passing a truthy value raises
      #   - `format:` if you want to store some format string
      #   - `precision:, scale:` if type == :decimal, or if the list item / map value
      #     is :decimal (precision defaults to 38, scale to 0)
      #   - `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
      #   - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
      #   - `nullable:` default to true if not specified
      #
      # A block is used for :struct fields (their sub-fields), and is forwarded to
      # the item of a :list field or the value of a :map field.
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Possibly store a format if provided
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        # Timestamps are marked "UTC" unless `has_timezone:` is given and falsy.
        # The deprecated `timezone:` option and the no-option default both mean UTC.
        if TIMESTAMP_TYPES.include?(type)
          unzoned = kwargs.key?(:has_timezone) && !kwargs[:has_timezone]
          field_hash[:timezone] = "UTC" unless unzoned
        end

        case type
        when :struct
          # Sub-fields come from the block; an empty struct is rejected here just
          # as it is for struct items nested in lists and maps.
          field_hash[:fields] = struct_fields(&block)
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          # precision/scale are only consulted by wrap_subtype for decimal items.
          field_hash[:item] = wrap_subtype(
            item_type,
            nullable: nullable_option(kwargs[:item_nullable]),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?

          # Map keys are required by the Parquet spec. Reject an explicit nullable
          # key at this boundary rather than letting it fail deep in the writer.
          if kwargs[:key_nullable]
            raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
          end

          field_hash[:key] = wrap_subtype(key_type, nullable: false)

          # precision/scale are only consulted by wrap_subtype for decimal values.
          field_hash[:value] = wrap_subtype(
            value_type,
            nullable: nullable_option(kwargs[:value_nullable]),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :decimal
          field_hash.merge!(decimal_options(kwargs[:precision], kwargs[:scale]))
        end
        # Any other type is a primitive (:int32, :int64, :string, ...) and needs
        # nothing beyond the base hash.

        @fields << field_hash
      end

      # Build a standalone map description as a list of key/value structs.
      #
      # Unlike `field`, this does not append anything to `fields`; it only
      # returns the Hash.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] nullability recorded for the key (default: false)
      # @param value_nullable [Boolean] nullability recorded for the value (default: true)
      # @param nullable [Boolean] nullability recorded for the map itself (default: true)
      # @yield optional block describing a :struct or :list value type
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... fields: [key, value] } }`
      # @raise [ArgumentError] if the key or value is a :struct without sub-fields
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        # Wrap the key type (maps typically use non-nullable keys)
        key = wrap_subtype(key_type, nullable: key_nullable)

        # wrap_subtype only uses the block for :struct and :list, so forwarding
        # it unconditionally is the same as forwarding it for those types only.
        value = wrap_subtype(value_type, nullable: value_nullable, &block)

        # Map is represented as a list of key/value pairs in Parquet
        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Describe a nested type (list item, map key or map value) as a Hash named "item".
      #
      # If user said: field "something", :list, item: :struct do ... end
      # the sub-struct is parsed recursively from the block. A :list subtype with
      # a block is treated as a nested list whose block declares the inner item.
      #
      # @param t [Symbol] the nested type
      # @param nullable [Boolean] nullability recorded on the nested type
      # @param precision [Integer, nil] decimal precision (used only when t == :decimal)
      # @param scale [Integer, nil] decimal scale (used only when t == :decimal)
      # @return [Hash] e.g. :int32 => `{ type: :int32, nullable: true, name: "item" }`
      # @raise [ArgumentError] for a struct without fields or a malformed nested list block
      def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
        subtype = { type: t, nullable: nullable, name: "item" }

        if t == :struct
          subtype[:fields] = struct_fields(&block)
        elsif t == :list && block
          subtype[:item] = nested_list_item(&block)
        elsif t == :decimal
          subtype.merge!(decimal_options(precision, scale))
        end

        subtype
      end

      # Evaluate a struct block in a fresh builder and return its fields.
      #
      # @return [Array<Hash>] the declared sub-fields
      # @raise [ArgumentError] if the block is missing or declares no fields
      def struct_fields(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block

        # Validate that the struct has at least one field
        if sub_builder.fields.empty?
          raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
        end

        sub_builder.fields
      end

      # Evaluate a nested-list block, which must declare exactly one field named
      # "item" that defines the inner list's item type.
      #
      # @return [Hash] the single "item" field definition
      # @raise [ArgumentError] if the block declares anything else
      def nested_list_item(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block)

        inner = sub_builder.fields
        unless inner.length == 1 && inner.first[:name] == "item"
          raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
        end

        inner.first
      end

      # Resolve decimal precision and scale:
      # 1. When neither precision nor scale is provided, use maximum precision (38) and scale 0
      # 2. When only precision is provided, scale defaults to 0
      # 3. When only scale is provided, use maximum precision (38)
      # 4. When both are provided, use the provided values
      #
      # @return [Hash] `{ precision: ..., scale: ... }`
      def decimal_options(precision, scale)
        {
          precision: precision.nil? ? DEFAULT_DECIMAL_PRECISION : precision,
          scale: scale.nil? ? DEFAULT_DECIMAL_SCALE : scale
        }
      end

      # Normalize an optional nullability flag: nil (not given) means true,
      # anything else is coerced to a strict boolean.
      def nullable_option(value)
        value.nil? ? true : !!value
      end
    end
  end
end
|
data/lib/parquet.rb
ADDED
data/lib/parquet.rbi
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# typed: true
|
|
2
|
+
|
|
3
|
+
module Parquet
  # Returns metadata information about a Parquet file
  #
  # The returned hash contains information about:
  # - Basic file metadata (num_rows, created_by)
  # - Schema information (fields, types, etc.)
  # - Row group details
  # - Column chunk information (compression, encodings, statistics)
  sig { params(path: String).returns(T::Hash[String, T.untyped]) }
  def self.metadata(path)
  end

  # Iterates over the rows of a Parquet input. Returns an Enumerator when no
  # block is given, otherwise yields each row and returns nil.
  #
  # Options:
  #   - `input`: String, File, or IO object containing parquet data
  #   - `result_type`: String or Symbol specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  #   - `columns`: When present, only the specified columns will be included in the output.
  #                This is useful for reducing how much data is read and improving performance.
  #   - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
  #                       (struct field names and top-level column names) are always interned and
  #                       reused regardless of this setting.
  #                       - `:copy` allocates a fresh mutable String per value.
  #                       - `:intern` deduplicates low-cardinality equal values into frozen interned
  #                         Strings up to a bounded per-read cache, then falls back to frozen copies.
  #                         A transient copy still happens per value, so it is not a per-value speedup.
  #                       - `:shared` returns frozen, zero-copy strings backed by Rust memory for
  #                         short, repeated, low-cardinality values. Each read returns at most the
  #                         configured number of shared values and only values up to the configured
  #                         byte size; values past those bounds become frozen copies. New process-wide
  #                         leaks are also capped by the requested budget and hard process ceilings.
  #                         All `:shared` results are frozen. Not recommended for high-cardinality or
  #                         large-blob string columns.
  #                       Pass a hash to set the `:shared` leak budget, e.g.
  #                       `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(NilClass)
  end
  def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
  end

  # Iterates over column batches of a Parquet input. Returns an Enumerator when
  # no block is given, otherwise yields each batch and returns nil.
  #
  # Options:
  #   - `input`: String, File, or IO object containing parquet data
  #   - `result_type`: String or Symbol specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  #   - `columns`: When present, only the specified columns will be included in the output.
  #   - `batch_size`: When present, specifies the number of rows per batch
  #   - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
  #                       or `:shared`). See `each_row` for the semantics of each mode.
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
      blk:
        T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
    ).returns(NilClass)
  end
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
  end

  # Options:
  #   - `read_from`: An Enumerator yielding arrays of values representing each row
  #   - `schema`: Either an Array of hashes specifying column names and types, or a schema
  #               Hash built with `Parquet::Schema.define`. Supported types:
  #     - `int8`, `int16`, `int32`, `int64`
  #     - `uint8`, `uint16`, `uint32`, `uint64`
  #     - `float`, `double`
  #     - `string`
  #     - `binary`
  #     - `boolean`
  #     - `date32`
  #     - `timestamp_millis`, `timestamp_micros`
  #     - The Array form looks like
  #       [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  #   - `write_to`: String path or IO object to write the parquet file to
  #   - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
  #                   for one-column schemas; wide schemas may have a lower safety cap). Enumerator
  #                   inputs are consumed in slices of this many rows, never materialized in full.
  #   - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
  #                        destination; bounds both the raw bytes staged since the last flush
  #                        and the writer's encoded in-progress buffer (defaults to 100MB)
  #   - `compression`: Optional compression type to use (defaults to "zstd")
  #                    Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  #   - `sample_size`: Optional positive number of rows to sample for size estimation
  #                    (defaults to 100, at most 10_000)
  #   - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
  #                     disables it, `true` enables it with a default capacity, and an Integer
  #                     enables it with that many retained distinct strings (at most 65_536).
  #                     Retention also skips values larger than 4KB and stops after 16MB of
  #                     cached string content.
  sig do
    params(
      read_from: T::Enumerator[T::Array[T.untyped]],
      # Values may be a type name or a nested options hash, so they are left
      # untyped; the Hash alternative covers `Parquet::Schema.define` output.
      schema: T.any(T::Array[T::Hash[String, T.untyped]], T::Hash[Symbol, T.untyped]),
      write_to: T.any(String, IO),
      batch_size: T.nilable(Integer),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String),
      sample_size: T.nilable(Integer),
      string_cache: T.nilable(T.any(T::Boolean, Integer))
    ).void
  end
  def self.write_rows(
    read_from,
    schema:,
    write_to:,
    batch_size: nil,
    flush_threshold: nil,
    compression: nil,
    sample_size: nil,
    string_cache: nil
  )
  end

  # Options:
  #   - `read_from`: An Enumerator yielding arrays of column batches
  #   - `schema`: Either an Array of hashes specifying column names and types, or a schema
  #               Hash built with `Parquet::Schema.define`. Supported types:
  #     - `int8`, `int16`, `int32`, `int64`
  #     - `uint8`, `uint16`, `uint32`, `uint64`
  #     - `float`, `double`
  #     - `string`
  #     - `binary`
  #     - `boolean`
  #     - `date32`
  #     - `timestamp_millis`, `timestamp_micros`
  #     - The Array form looks like
  #       [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  #   - `write_to`: String path or IO object to write the parquet file to
  #   - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
  #                        destination; bounds both the raw bytes staged since the last flush
  #                        and the writer's encoded in-progress buffer (defaults to 100MB)
  #   - `compression`: Optional compression type to use (defaults to "zstd")
  #                    Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  #   - `logger`: Optional Ruby logger for column-write progress messages
  sig do
    params(
      read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
      # Values may be a type name or a nested options hash, so they are left
      # untyped; the Hash alternative covers `Parquet::Schema.define` output.
      schema: T.any(T::Array[T::Hash[String, T.untyped]], T::Hash[Symbol, T.untyped]),
      write_to: T.any(String, IO),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String),
      logger: T.nilable(T.untyped)
    ).void
  end
  def self.write_columns(
    read_from,
    schema:,
    write_to:,
    flush_threshold: nil,
    compression: nil,
    logger: nil
  )
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: parquet-tyfoom
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.8.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Nathan Jaremko
|
|
8
|
+
- Cameron McCord
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2026-07-02 00:00:00.000000000 Z
|
|
13
|
+
dependencies:
|
|
14
|
+
- !ruby/object:Gem::Dependency
|
|
15
|
+
name: rb_sys
|
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
|
17
|
+
requirements:
|
|
18
|
+
- - "~>"
|
|
19
|
+
- !ruby/object:Gem::Version
|
|
20
|
+
version: 0.9.39
|
|
21
|
+
type: :runtime
|
|
22
|
+
prerelease: false
|
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - "~>"
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
version: 0.9.39
|
|
28
|
+
- !ruby/object:Gem::Dependency
|
|
29
|
+
name: bigdecimal
|
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
|
31
|
+
requirements:
|
|
32
|
+
- - ">="
|
|
33
|
+
- !ruby/object:Gem::Version
|
|
34
|
+
version: '0'
|
|
35
|
+
type: :runtime
|
|
36
|
+
prerelease: false
|
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
38
|
+
requirements:
|
|
39
|
+
- - ">="
|
|
40
|
+
- !ruby/object:Gem::Version
|
|
41
|
+
version: '0'
|
|
42
|
+
- !ruby/object:Gem::Dependency
|
|
43
|
+
name: rake-compiler
|
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
|
45
|
+
requirements:
|
|
46
|
+
- - "~>"
|
|
47
|
+
- !ruby/object:Gem::Version
|
|
48
|
+
version: 1.2.0
|
|
49
|
+
type: :development
|
|
50
|
+
prerelease: false
|
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - "~>"
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: 1.2.0
|
|
56
|
+
description: |2
|
|
57
|
+
Tyfoom's fork of the `parquet` gem (github.com/njaremko/parquet-ruby), published while the
|
|
58
|
+
incremental streaming-write fix is pending upstream. It wraps the official Apache Rust
|
|
59
|
+
implementation and bounds write memory by streaming row groups to disk instead of buffering the
|
|
60
|
+
whole file. Drop-in compatible with the upstream gem: the library is still required as
|
|
61
|
+
`require "parquet"` and exposes the same `Parquet` API.
|
|
62
|
+
email:
|
|
63
|
+
- nathan@jaremko.ca
|
|
64
|
+
- cameron.mccord@tyfoom.com
|
|
65
|
+
executables: []
|
|
66
|
+
extensions:
|
|
67
|
+
- ext/parquet/extconf.rb
|
|
68
|
+
extra_rdoc_files: []
|
|
69
|
+
files:
|
|
70
|
+
- Cargo.lock
|
|
71
|
+
- Cargo.toml
|
|
72
|
+
- Gemfile
|
|
73
|
+
- LICENSE
|
|
74
|
+
- README.md
|
|
75
|
+
- Rakefile
|
|
76
|
+
- ext/parquet-core/Cargo.toml
|
|
77
|
+
- ext/parquet-core/src/arrow_conversion.rs
|
|
78
|
+
- ext/parquet-core/src/error.rs
|
|
79
|
+
- ext/parquet-core/src/lib.rs
|
|
80
|
+
- ext/parquet-core/src/reader.rs
|
|
81
|
+
- ext/parquet-core/src/schema.rs
|
|
82
|
+
- ext/parquet-core/src/test_utils.rs
|
|
83
|
+
- ext/parquet-core/src/traits/mod.rs
|
|
84
|
+
- ext/parquet-core/src/traits/schema.rs
|
|
85
|
+
- ext/parquet-core/src/value.rs
|
|
86
|
+
- ext/parquet-core/src/writer.rs
|
|
87
|
+
- ext/parquet-core/tests/arrow_conversion_tests.rs
|
|
88
|
+
- ext/parquet-core/tests/binary_data.rs
|
|
89
|
+
- ext/parquet-core/tests/column_projection.rs
|
|
90
|
+
- ext/parquet-core/tests/complex_types.rs
|
|
91
|
+
- ext/parquet-core/tests/compression_tests.rs
|
|
92
|
+
- ext/parquet-core/tests/concurrent_access.rs
|
|
93
|
+
- ext/parquet-core/tests/decimal_tests.rs
|
|
94
|
+
- ext/parquet-core/tests/edge_cases_corner_cases.rs
|
|
95
|
+
- ext/parquet-core/tests/error_handling_comprehensive_tests.rs
|
|
96
|
+
- ext/parquet-core/tests/null_handling_tests.rs
|
|
97
|
+
- ext/parquet-core/tests/performance_memory.rs
|
|
98
|
+
- ext/parquet-core/tests/primitive_types.rs
|
|
99
|
+
- ext/parquet-core/tests/real_world_patterns.rs
|
|
100
|
+
- ext/parquet-core/tests/review_regressions.rs
|
|
101
|
+
- ext/parquet-core/tests/roundtrip_correctness.rs
|
|
102
|
+
- ext/parquet-core/tests/schema_comprehensive_tests.rs
|
|
103
|
+
- ext/parquet-core/tests/temporal_tests.rs
|
|
104
|
+
- ext/parquet-core/tests/test_helpers.rs
|
|
105
|
+
- ext/parquet-core/tests/writer_tests.rs
|
|
106
|
+
- ext/parquet-ruby-adapter/Cargo.toml
|
|
107
|
+
- ext/parquet-ruby-adapter/build.rs
|
|
108
|
+
- ext/parquet-ruby-adapter/examples/try_into_value_demo.rs
|
|
109
|
+
- ext/parquet-ruby-adapter/src/chunk_reader.rs
|
|
110
|
+
- ext/parquet-ruby-adapter/src/converter.rs
|
|
111
|
+
- ext/parquet-ruby-adapter/src/error.rs
|
|
112
|
+
- ext/parquet-ruby-adapter/src/io.rs
|
|
113
|
+
- ext/parquet-ruby-adapter/src/lib.rs
|
|
114
|
+
- ext/parquet-ruby-adapter/src/logger.rs
|
|
115
|
+
- ext/parquet-ruby-adapter/src/metadata.rs
|
|
116
|
+
- ext/parquet-ruby-adapter/src/reader.rs
|
|
117
|
+
- ext/parquet-ruby-adapter/src/schema.rs
|
|
118
|
+
- ext/parquet-ruby-adapter/src/string_cache.rs
|
|
119
|
+
- ext/parquet-ruby-adapter/src/string_cache_test.rs
|
|
120
|
+
- ext/parquet-ruby-adapter/src/string_storage.rs
|
|
121
|
+
- ext/parquet-ruby-adapter/src/try_into_value.rs
|
|
122
|
+
- ext/parquet-ruby-adapter/src/types.rs
|
|
123
|
+
- ext/parquet-ruby-adapter/src/utils.rs
|
|
124
|
+
- ext/parquet-ruby-adapter/src/writer.rs
|
|
125
|
+
- ext/parquet/Cargo.toml
|
|
126
|
+
- ext/parquet/build.rs
|
|
127
|
+
- ext/parquet/extconf.rb
|
|
128
|
+
- ext/parquet/src/adapter_ffi.rs
|
|
129
|
+
- ext/parquet/src/allocator.rs
|
|
130
|
+
- ext/parquet/src/lib.rs
|
|
131
|
+
- lib/parquet.rb
|
|
132
|
+
- lib/parquet.rbi
|
|
133
|
+
- lib/parquet/schema.rb
|
|
134
|
+
- lib/parquet/version.rb
|
|
135
|
+
homepage: https://github.com/cameronmccord2/parquet-ruby
|
|
136
|
+
licenses:
|
|
137
|
+
- MIT
|
|
138
|
+
metadata:
|
|
139
|
+
homepage_uri: https://github.com/cameronmccord2/parquet-ruby
|
|
140
|
+
source_code_uri: https://github.com/cameronmccord2/parquet-ruby
|
|
141
|
+
readme_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/README.md
|
|
142
|
+
changelog_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/CHANGELOG.md
|
|
143
|
+
documentation_uri: https://www.rubydoc.info/gems/parquet-tyfoom
|
|
144
|
+
funding_uri: https://github.com/sponsors/njaremko
|
|
145
|
+
allowed_push_host: https://rubygems.org
|
|
146
|
+
post_install_message:
|
|
147
|
+
rdoc_options: []
|
|
148
|
+
require_paths:
|
|
149
|
+
- lib
|
|
150
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
151
|
+
requirements:
|
|
152
|
+
- - ">="
|
|
153
|
+
- !ruby/object:Gem::Version
|
|
154
|
+
version: 3.1.0
|
|
155
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
|
+
requirements:
|
|
157
|
+
- - ">="
|
|
158
|
+
- !ruby/object:Gem::Version
|
|
159
|
+
version: '0'
|
|
160
|
+
requirements: []
|
|
161
|
+
rubygems_version: 3.4.19
|
|
162
|
+
signing_key:
|
|
163
|
+
specification_version: 4
|
|
164
|
+
summary: Tyfoom fork of the parquet gem (Rust), with incremental streaming writes
|
|
165
|
+
test_files: []
|