parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,262 @@
1
+ # frozen_string_literal: true
2
+
3
module Parquet
  # Schema definition for Parquet files
  class Schema
    # Define a new schema using the DSL.
    #
    # The block is evaluated with `instance_eval` against a fresh SchemaBuilder,
    # so `field` can be called directly inside it.
    #
    # @return [Hash] schema definition hash of the form
    #   `{ type: :struct, fields: [...] }`
    # @raise [ArgumentError] if no block is given
    #
    # @example Define a schema with nullable and non-nullable fields
    #   Parquet::Schema.define do
    #     field :id, :int64, nullable: false  # ID cannot be null
    #     field :name, :string                # Default nullable: true
    #
    #     # Decimal field with precision and scale
    #     field :price, :decimal, precision: 10, scale: 2
    #
    #     # List with non-nullable items
    #     field :scores, :list, item: :float, item_nullable: false
    #
    #     # Map with nullable values
    #     field :metadata, :map,
    #           key: :string,
    #           value: :string,
    #           value_nullable: true
    #
    #     # Nested struct with non-nullable fields
    #     field :address, :struct, nullable: true do
    #       field :street, :string, nullable: false
    #       field :city, :string, nullable: false
    #       field :zip, :string, nullable: false
    #     end
    #   end
    def self.define(&block)
      # Without this guard `instance_eval(&nil)` raises an ArgumentError about
      # argument counts, which says nothing about the missing DSL block.
      raise ArgumentError, "Parquet::Schema.define requires a block" unless block

      builder = SchemaBuilder.new
      builder.instance_eval(&block)

      # Return a structured hash representing the schema
      { type: :struct, fields: builder.fields }
    end

    # Internal builder class that provides the DSL methods
    class SchemaBuilder
      # Types for which `field` records a timezone marker.
      TIMESTAMP_TYPES = %i[timestamp_second timestamp_millis timestamp_micros timestamp_nanos].freeze

      # Defaults applied when a decimal omits precision and/or scale.
      DEFAULT_DECIMAL_PRECISION = 38
      DEFAULT_DECIMAL_SCALE = 0

      # @return [Array<Hash>] field definitions collected so far, in declaration order
      attr_reader :fields

      def initialize
        @fields = []
      end

      # Define a field in the schema
      # @param name [String, Symbol] field name (stored as a String)
      # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)
      # @param nullable [Boolean] whether the field can be null (default: true)
      # @param kwargs [Hash] additional options depending on type
      # @return [Array<Hash>] the builder's accumulated field list
      # @raise [ArgumentError] when a list lacks `item:`, a map lacks `key:`/`value:`,
      #   a map asks for nullable keys, or a nested struct/list item is malformed
      #
      # Additional keyword args:
      #  - `item:` if type == :list
      #  - `item_nullable:` controls nullability of list items (default: true)
      #  - `key:, value:` if type == :map
      #  - `value_nullable:` controls nullability of map values (default: true)
      #  - `key_nullable:` must be omitted or falsy; map keys are always required
      #  - `format:` if you want to store some format string
      #  - `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0);
      #    also applied to a :decimal list item or map value
      #  - `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
      #  - `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
      #  - `nullable:` default to true if not specified
      def field(name, type, nullable: true, **kwargs, &block)
        field_hash = { name: name.to_s, type: type, nullable: !!nullable }

        # Possibly store a format if provided
        field_hash[:format] = kwargs[:format] if kwargs.key?(:format)

        # Timestamps are marked "UTC" unless `has_timezone:` is explicitly falsy.
        # The legacy `timezone:` option and the no-option default both land on
        # "UTC", so only `has_timezone` needs to be consulted.
        field_hash[:timezone] = "UTC" if TIMESTAMP_TYPES.include?(type) && kwargs.fetch(:has_timezone, true)

        case type
        when :struct
          # Subfields come from the block; a missing block leaves the list empty.
          field_hash[:fields] = nested_fields(&block)
        when :list
          item_type = kwargs[:item]
          raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type

          # precision/scale are only consulted by wrap_subtype for :decimal items
          field_hash[:item] = wrap_subtype(
            item_type,
            nullable: nullable_option(kwargs, :item_nullable),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :map
          # user must specify key:, value:
          key_type = kwargs[:key]
          value_type = kwargs[:value]
          raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?

          # Map keys are required by the Parquet spec. Reject an explicit nullable
          # key at this boundary rather than letting it fail deep in the writer.
          if kwargs[:key_nullable]
            raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
          end

          field_hash[:key] = wrap_subtype(key_type, nullable: false)

          # precision/scale are only consulted by wrap_subtype for :decimal values
          field_hash[:value] = wrap_subtype(
            value_type,
            nullable: nullable_option(kwargs, :value_nullable),
            precision: kwargs[:precision],
            scale: kwargs[:scale],
            &block
          )
        when :decimal
          field_hash.merge!(decimal_params(kwargs[:precision], kwargs[:scale]))
        end
        # Any other type is a primitive (:int32, :int64, :string, ...) and needs
        # nothing beyond name/type/nullable.

        @fields << field_hash
      end

      # Build a standalone map node whose entries are a non-nullable "key_value"
      # struct holding the wrapped key and value types.
      #
      # @param key_type [Symbol] type of the map keys
      # @param value_type [Symbol] type of the map values
      # @param key_nullable [Boolean] nullability recorded on the key node (default: false)
      # @param value_nullable [Boolean] nullability recorded on the value node (default: true)
      # @param nullable [Boolean] nullability of the map itself (default: true)
      # @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... } }`
      # @raise [ArgumentError] if a :struct or :list value type is malformed
      def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
        # Wrap the key type (maps typically use non-nullable keys)
        key = wrap_subtype(key_type, nullable: key_nullable)

        # The block is only evaluated by wrap_subtype for :struct / :list values.
        value = wrap_subtype(value_type, nullable: value_nullable, &block)

        # Map is represented as a list of key/value pairs in Parquet
        {
          type: :map,
          nullable: nullable,
          item: {
            type: :struct,
            nullable: false,
            name: "key_value",
            fields: [key, value]
          }
        }
      end

      private

      # Wrap a list item / map key / map value type into a schema node named "item".
      #
      # For `:struct` the block defines the struct's fields; for `:list` the block
      # must define exactly one field named "item" describing the inner list's
      # element. For `:decimal`, precision/scale are filled in with defaults.
      #
      # @param t [Symbol] the subtype
      # @param nullable [Boolean] nullability recorded on the node
      # @param precision [Integer, nil] decimal precision (only used for :decimal)
      # @param scale [Integer, nil] decimal scale (only used for :decimal)
      # @return [Hash] the schema node
      # @raise [ArgumentError] for an empty struct or a nested list without a
      #   single "item" field (including a nested list given no block at all)
      def wrap_subtype(t, nullable: true, precision: nil, scale: nil, &block)
        node = { type: t, nullable: nullable, name: "item" }

        case t
        when :struct
          sub_fields = nested_fields(&block)

          # Validate that the struct has at least one field
          if sub_fields.empty?
            raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
          end

          node[:fields] = sub_fields
        when :list
          # A nested list with no block has no element type, so it is rejected
          # here instead of emitting a list node that lacks `:item`.
          sub_fields = nested_fields(&block)

          # We expect a single field named "item" that defines the inner list's item type
          unless sub_fields.length == 1 && sub_fields.first[:name] == "item"
            raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
          end

          node[:item] = sub_fields.first
        when :decimal
          node.merge!(decimal_params(precision, scale))
        end
        # e.g. :int32 => { type: :int32, nullable: true, name: "item" }

        node
      end

      # Evaluate an optional DSL block in a fresh builder and return its fields.
      def nested_fields(&block)
        sub_builder = SchemaBuilder.new
        sub_builder.instance_eval(&block) if block
        sub_builder.fields
      end

      # Resolve decimal precision/scale:
      #  1. neither given      -> precision 38, scale 0
      #  2. only precision     -> scale 0
      #  3. only scale         -> precision 38
      #  4. both given         -> used as provided
      def decimal_params(precision, scale)
        {
          precision: precision || DEFAULT_DECIMAL_PRECISION,
          scale: scale || DEFAULT_DECIMAL_SCALE
        }
      end

      # Read a nullability option: absent or nil means true, anything else is
      # coerced to a boolean.
      def nullable_option(kwargs, key)
        value = kwargs[key]
        value.nil? || !!value
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Parquet
  # Gem version string. Frozen explicitly because this file carries no
  # `frozen_string_literal` magic comment, which would otherwise leave the
  # constant's value mutable.
  VERSION = "0.8.0".freeze
end
data/lib/parquet.rb ADDED
@@ -0,0 +1,11 @@
1
require_relative "parquet/version"
require_relative "parquet/schema"

# Load the native extension. A copy under a "<major>.<minor>" directory is tried
# first (presumably a precompiled build per Ruby minor version — confirm against
# the build setup), falling back to the unversioned path.
#
# The version is taken textually from RUBY_VERSION instead of via `to_f`:
# a float collapses a two-digit minor ("3.10.0".to_f == 3.1) and would point at
# the wrong directory.
begin
  require "parquet/#{RUBY_VERSION[/\d+\.\d+/]}/parquet"
rescue LoadError
  require "parquet/parquet"
end

module Parquet
end
data/lib/parquet.rbi ADDED
@@ -0,0 +1,181 @@
1
+ # typed: true
2
+
3
module Parquet
  # Returns metadata information about a Parquet file
  #
  # The returned hash contains information about:
  # - Basic file metadata (num_rows, created_by)
  # - Schema information (fields, types, etc.)
  # - Row group details
  # - Column chunk information (compression, encodings, statistics)
  sig { params(path: String).returns(T::Hash[String, T.untyped]) }
  def self.metadata(path)
  end

  # Options:
  #   - `input`: String, File, or IO object containing parquet data
  #   - `result_type`: String specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  #   - `columns`: When present, only the specified columns will be included in the output.
  #                This is useful for reducing how much data is read and improving performance.
  #   - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
  #                (struct field names and top-level column names) are always interned and
  #                reused regardless of this setting.
  #                - `:copy` allocates a fresh mutable String per value.
  #                - `:intern` deduplicates low-cardinality equal values into frozen interned
  #                  Strings up to a bounded per-read cache, then falls back to frozen copies.
  #                  A transient copy still happens per value, so it is not a per-value speedup.
  #                - `:shared` returns frozen, zero-copy strings backed by Rust memory for
  #                  short, repeated, low-cardinality values. Each read returns at most the
  #                  configured number of shared values and only values up to the configured
  #                  byte size; values past those bounds become frozen copies. New process-wide
  #                  leaks are also capped by the requested budget and hard process ceilings.
  #                  All `:shared` results are frozen. Not recommended for high-cardinality or
  #                  large-blob string columns.
  #                Pass a hash to set the `:shared` leak budget, e.g.
  #                `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
  #
  # Without a block an Enumerator over the rows is returned; with a block each
  # row is yielded and nil is returned (see the two signatures below).
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(NilClass)
  end
  def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
  end

  # Options:
  #   - `input`: String, File, or IO object containing parquet data
  #   - `result_type`: String specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  #   - `columns`: When present, only the specified columns will be included in the output.
  #   - `batch_size`: When present, specifies the number of rows per batch
  #   - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
  #                or `:shared`). See `each_row` for the semantics of each mode.
  #
  # Without a block an Enumerator over the batches is returned; with a block
  # each batch is yielded and nil is returned (see the two signatures below).
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer),
      strict: T.nilable(T::Boolean),
      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
      blk:
        T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
    ).returns(NilClass)
  end
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
  end

  # Options:
  #   - `read_from`: An Enumerator yielding arrays of values representing each row
  #   - `schema`: Array of hashes specifying column names and types. Supported types:
  #     - `int8`, `int16`, `int32`, `int64`
  #     - `uint8`, `uint16`, `uint32`, `uint64`
  #     - `float`, `double`
  #     - `string`
  #     - `binary`
  #     - `boolean`
  #     - `date32`
  #     - `timestamp_millis`, `timestamp_micros`
  #     - NOTE(review): entries presumably take the same forms as documented for
  #       `write_columns` (a type string or an options hash per column), so the
  #       hash values are left untyped in the signature — confirm.
  #   - `write_to`: String path or IO object to write the parquet file to
  #   - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
  #                   for one-column schemas; wide schemas may have a lower safety cap). Enumerator
  #                   inputs are consumed in slices of this many rows, never materialized in full.
  #   - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
  #                        destination; bounds both the raw bytes staged since the last flush
  #                        and the writer's encoded in-progress buffer (defaults to 100MB)
  #   - `compression`: Optional compression type to use (defaults to "zstd")
  #                    Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  #   - `sample_size`: Optional positive number of rows to sample for size estimation
  #                    (defaults to 100, at most 10_000)
  #   - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
  #                     disables it, `true` enables it with a default capacity, and an Integer
  #                     enables it with that many retained distinct strings (at most 65_536).
  #                     Retention also skips values larger than 4KB and stops after 16MB of
  #                     cached string content.
  sig do
    params(
      read_from: T::Enumerator[T::Array[T.untyped]],
      schema: T::Array[T::Hash[String, T.untyped]],
      write_to: T.any(String, IO),
      batch_size: T.nilable(Integer),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String),
      sample_size: T.nilable(Integer),
      string_cache: T.nilable(T.any(T::Boolean, Integer))
    ).void
  end
  def self.write_rows(
    read_from,
    schema:,
    write_to:,
    batch_size: nil,
    flush_threshold: nil,
    compression: nil,
    sample_size: nil,
    string_cache: nil
  )
  end

  # Options:
  #   - `read_from`: An Enumerator yielding arrays of column batches
  #   - `schema`: Array of hashes specifying column names and types. Supported types:
  #     - `int8`, `int16`, `int32`, `int64`
  #     - `uint8`, `uint16`, `uint32`, `uint64`
  #     - `float`, `double`
  #     - `string`
  #     - `binary`
  #     - `boolean`
  #     - `date32`
  #     - `timestamp_millis`, `timestamp_micros`
  #     - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  #       (a column's value is either a type string or an options hash, which is why
  #       the hash values are untyped in the signature)
  #   - `write_to`: String path or IO object to write the parquet file to
  #   - `flush_threshold`: Optional threshold in bytes before a row group is flushed to the
  #                        destination; bounds both the raw bytes staged since the last flush
  #                        and the writer's encoded in-progress buffer (defaults to 100MB)
  #   - `compression`: Optional compression type to use (defaults to "zstd")
  #                    Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  #   - `logger`: Optional Ruby logger for column-write progress messages
  sig do
    params(
      read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
      schema: T::Array[T::Hash[String, T.untyped]],
      write_to: T.any(String, IO),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String),
      logger: T.nilable(T.untyped)
    ).void
  end
  def self.write_columns(
    read_from,
    schema:,
    write_to:,
    flush_threshold: nil,
    compression: nil,
    logger: nil
  )
  end
end
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parquet-tyfoom
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.8.0
5
+ platform: ruby
6
+ authors:
7
+ - Nathan Jaremko
8
+ - Cameron McCord
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2026-07-02 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rb_sys
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: 0.9.39
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.9.39
28
+ - !ruby/object:Gem::Dependency
29
+ name: bigdecimal
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rake-compiler
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: 1.2.0
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: 1.2.0
56
+ description: |2
57
+ Tyfoom's fork of the `parquet` gem (github.com/njaremko/parquet-ruby), published while the
58
+ incremental streaming-write fix is pending upstream. It wraps the official Apache Rust
59
+ implementation and bounds write memory by streaming row groups to disk instead of buffering the
60
+ whole file. Drop-in compatible with the upstream gem: the library is still required as
61
+ `require "parquet"` and exposes the same `Parquet` API.
62
+ email:
63
+ - nathan@jaremko.ca
64
+ - cameron.mccord@tyfoom.com
65
+ executables: []
66
+ extensions:
67
+ - ext/parquet/extconf.rb
68
+ extra_rdoc_files: []
69
+ files:
70
+ - Cargo.lock
71
+ - Cargo.toml
72
+ - Gemfile
73
+ - LICENSE
74
+ - README.md
75
+ - Rakefile
76
+ - ext/parquet-core/Cargo.toml
77
+ - ext/parquet-core/src/arrow_conversion.rs
78
+ - ext/parquet-core/src/error.rs
79
+ - ext/parquet-core/src/lib.rs
80
+ - ext/parquet-core/src/reader.rs
81
+ - ext/parquet-core/src/schema.rs
82
+ - ext/parquet-core/src/test_utils.rs
83
+ - ext/parquet-core/src/traits/mod.rs
84
+ - ext/parquet-core/src/traits/schema.rs
85
+ - ext/parquet-core/src/value.rs
86
+ - ext/parquet-core/src/writer.rs
87
+ - ext/parquet-core/tests/arrow_conversion_tests.rs
88
+ - ext/parquet-core/tests/binary_data.rs
89
+ - ext/parquet-core/tests/column_projection.rs
90
+ - ext/parquet-core/tests/complex_types.rs
91
+ - ext/parquet-core/tests/compression_tests.rs
92
+ - ext/parquet-core/tests/concurrent_access.rs
93
+ - ext/parquet-core/tests/decimal_tests.rs
94
+ - ext/parquet-core/tests/edge_cases_corner_cases.rs
95
+ - ext/parquet-core/tests/error_handling_comprehensive_tests.rs
96
+ - ext/parquet-core/tests/null_handling_tests.rs
97
+ - ext/parquet-core/tests/performance_memory.rs
98
+ - ext/parquet-core/tests/primitive_types.rs
99
+ - ext/parquet-core/tests/real_world_patterns.rs
100
+ - ext/parquet-core/tests/review_regressions.rs
101
+ - ext/parquet-core/tests/roundtrip_correctness.rs
102
+ - ext/parquet-core/tests/schema_comprehensive_tests.rs
103
+ - ext/parquet-core/tests/temporal_tests.rs
104
+ - ext/parquet-core/tests/test_helpers.rs
105
+ - ext/parquet-core/tests/writer_tests.rs
106
+ - ext/parquet-ruby-adapter/Cargo.toml
107
+ - ext/parquet-ruby-adapter/build.rs
108
+ - ext/parquet-ruby-adapter/examples/try_into_value_demo.rs
109
+ - ext/parquet-ruby-adapter/src/chunk_reader.rs
110
+ - ext/parquet-ruby-adapter/src/converter.rs
111
+ - ext/parquet-ruby-adapter/src/error.rs
112
+ - ext/parquet-ruby-adapter/src/io.rs
113
+ - ext/parquet-ruby-adapter/src/lib.rs
114
+ - ext/parquet-ruby-adapter/src/logger.rs
115
+ - ext/parquet-ruby-adapter/src/metadata.rs
116
+ - ext/parquet-ruby-adapter/src/reader.rs
117
+ - ext/parquet-ruby-adapter/src/schema.rs
118
+ - ext/parquet-ruby-adapter/src/string_cache.rs
119
+ - ext/parquet-ruby-adapter/src/string_cache_test.rs
120
+ - ext/parquet-ruby-adapter/src/string_storage.rs
121
+ - ext/parquet-ruby-adapter/src/try_into_value.rs
122
+ - ext/parquet-ruby-adapter/src/types.rs
123
+ - ext/parquet-ruby-adapter/src/utils.rs
124
+ - ext/parquet-ruby-adapter/src/writer.rs
125
+ - ext/parquet/Cargo.toml
126
+ - ext/parquet/build.rs
127
+ - ext/parquet/extconf.rb
128
+ - ext/parquet/src/adapter_ffi.rs
129
+ - ext/parquet/src/allocator.rs
130
+ - ext/parquet/src/lib.rs
131
+ - lib/parquet.rb
132
+ - lib/parquet.rbi
133
+ - lib/parquet/schema.rb
134
+ - lib/parquet/version.rb
135
+ homepage: https://github.com/cameronmccord2/parquet-ruby
136
+ licenses:
137
+ - MIT
138
+ metadata:
139
+ homepage_uri: https://github.com/cameronmccord2/parquet-ruby
140
+ source_code_uri: https://github.com/cameronmccord2/parquet-ruby
141
+ readme_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/README.md
142
+ changelog_uri: https://github.com/cameronmccord2/parquet-ruby/blob/stream-writes-incrementally/CHANGELOG.md
143
+ documentation_uri: https://www.rubydoc.info/gems/parquet-tyfoom
144
+ funding_uri: https://github.com/sponsors/njaremko
145
+ allowed_push_host: https://rubygems.org
146
+ post_install_message:
147
+ rdoc_options: []
148
+ require_paths:
149
+ - lib
150
+ required_ruby_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: 3.1.0
155
+ required_rubygems_version: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ requirements: []
161
+ rubygems_version: 3.4.19
162
+ signing_key:
163
+ specification_version: 4
164
+ summary: Tyfoom fork of the parquet gem (Rust), with incremental streaming writes
165
+ test_files: []