parquet 0.7.3-aarch64-linux → 0.8.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6f9f906c15b3bb0dd6ffebdf8805583e3d70bc4744126a6e5b5b8a9141f7aac3
4
- data.tar.gz: 9e5dccc0659a2a0bebc5585cd99bf6ccf9a688d685ad122006a6c6bc426e2450
3
+ metadata.gz: 89db55543839853aef62e3f11511ba4ff54ee1896c7085b83d9e6e44d2b10335
4
+ data.tar.gz: 07db9c35d4c4777d9339f92b22112760c9ff0e96091b7dc26940c54ed4df03b8
5
5
  SHA512:
6
- metadata.gz: 5b35a9d549a84fdfe670ede85172336ef3fecffb0be714962984830183cec4ace99abd2189647d3bc6f2fbf8488a401d2cb0470aa4e4f0edd633de551c1366db
7
- data.tar.gz: 351a0d0964446a2a2016d7e088ae435e3f11e6cd1974ce171376fb3789649da58b8d7c281be7478ffb50084b2b07df1f11176c49d447a009391af7267c2840c0
6
+ metadata.gz: 3f78682dc2b7b4d5aa3fa186b7cf743b1808c93644ee2097d76051c21e900da23578493a3d0e637b4bc61dd9ce06a85865fdf46e33a24acf46ff059519b5199f
7
+ data.tar.gz: ca2f9c79cfd0faf50567c7df2d73cbbea508e5ade7234ac8d2791288d0f174d1b34f707e63ef29765cdd6573ca34ac8ee0694f21d3ba46d7a8fd0ee49eb2f3cc
data/Gemfile CHANGED
@@ -16,5 +16,6 @@ end
16
16
 
17
17
  group :test do
18
18
  gem "csv"
19
+ gem "logger"
19
20
  gem "minitest", "~> 5.0"
20
21
  end
data/README.md CHANGED
@@ -166,7 +166,7 @@ end
166
166
  Parquet.write_rows(rows,
167
167
  schema: schema,
168
168
  write_to: "output.parquet",
169
- batch_size: 5000 # Rows per batch (default: 1000)
169
+ batch_size: 5000 # Positive rows per batch (default: 1000)
170
170
  )
171
171
  ```
172
172
 
@@ -200,6 +200,9 @@ Parquet.write_columns(batches.each,
200
200
  )
201
201
  ```
202
202
 
203
+ `write_columns` also accepts `logger:` with the same Ruby logger interface as
204
+ row writes.
205
+
203
206
  ## Data Types
204
207
 
205
208
  ### Basic Types
@@ -340,7 +343,7 @@ Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
340
343
 
341
344
  ### Timezone Handling in Parquet
342
345
 
343
- **Critical**: The Parquet specification has a fundamental limitation with timezone storage:
346
+ The Parquet specification has a fundamental limitation with timezone storage:
344
347
 
345
348
  1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
346
349
  2. **Local/unzoned**: Timestamps without timezone info are stored as-is
@@ -382,11 +385,15 @@ Control memory usage with flush thresholds:
382
385
  Parquet.write_rows(huge_dataset.each,
383
386
  schema: schema,
384
387
  write_to: "output.parquet",
385
- batch_size: 1000, # Rows before considering flush
388
+ batch_size: 1000, # Positive rows before considering flush
386
389
  flush_threshold: 32 * 1024**2 # Flush if batch exceeds 32MB
387
390
  )
388
391
  ```
389
392
 
393
+ Write batch and sample sizes are bounded before buffer allocation. Very large
394
+ batch sizes are rejected, and wide schemas have a lower effective batch cap so
395
+ the writer cannot reserve unbounded per-column value slots.
396
+
390
397
  ## Architecture
391
398
 
392
399
  This gem uses a modular architecture:
Binary file
Binary file
Binary file
Binary file
@@ -116,8 +116,12 @@ module Parquet
116
116
  key_type = kwargs[:key]
117
117
  value_type = kwargs[:value]
118
118
  raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
119
- # Pass key_nullable and value_nullable if provided, otherwise use true as default
120
- key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
119
+ # Map keys are required by the Parquet spec. Reject an explicit nullable
120
+ # key at this boundary rather than letting it fail deep in the writer.
121
+ if kwargs[:key_nullable]
122
+ raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
123
+ end
124
+ key_nullable = false
121
125
  value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
122
126
 
123
127
  field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.7.3"
2
+ VERSION = "0.8.0"
3
3
  end
data/lib/parquet.rbi CHANGED
@@ -18,12 +18,29 @@ module Parquet
18
18
  # ("hash" or "array" or :hash or :array)
19
19
  # - `columns`: When present, only the specified columns will be included in the output.
20
20
  # This is useful for reducing how much data is read and improving performance.
21
+ # - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
22
+ # (struct field names and top-level column names) are always interned and
23
+ # reused regardless of this setting.
24
+ # - `:copy` allocates a fresh mutable String per value.
25
+ # - `:intern` deduplicates low-cardinality equal values into frozen interned
26
+ # Strings up to a bounded per-read cache, then falls back to frozen copies.
27
+ # A transient copy still happens per value, so it is not a per-value speedup.
28
+ # - `:shared` returns frozen, zero-copy strings backed by Rust memory for
29
+ # short, repeated, low-cardinality values. Each read returns at most the
30
+ # configured number of shared values and only values up to the configured
31
+ # byte size; values past those bounds become frozen copies. New process-wide
32
+ # leaks are also capped by the requested budget and hard process ceilings.
33
+ # All `:shared` results are frozen. Not recommended for high-cardinality or
34
+ # large-blob string columns.
35
+ # Pass a hash to set the `:shared` leak budget, e.g.
36
+ # `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
21
37
  sig do
22
38
  params(
23
39
  input: T.any(String, File, StringIO, IO),
24
40
  result_type: T.nilable(T.any(String, Symbol)),
25
41
  columns: T.nilable(T::Array[String]),
26
- strict: T.nilable(T::Boolean)
42
+ strict: T.nilable(T::Boolean),
43
+ string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
27
44
  ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
28
45
  end
29
46
  sig do
@@ -32,10 +49,11 @@ module Parquet
32
49
  result_type: T.nilable(T.any(String, Symbol)),
33
50
  columns: T.nilable(T::Array[String]),
34
51
  strict: T.nilable(T::Boolean),
52
+ string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
35
53
  blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
36
54
  ).returns(NilClass)
37
55
  end
38
- def self.each_row(input, result_type: nil, columns: nil, strict: nil, &blk)
56
+ def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
39
57
  end
40
58
 
41
59
  # Options:
@@ -44,13 +62,16 @@ module Parquet
44
62
  # ("hash" or "array" or :hash or :array)
45
63
  # - `columns`: When present, only the specified columns will be included in the output.
46
64
  # - `batch_size`: When present, specifies the number of rows per batch
65
+ # - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
66
+ # or `:shared`). See `each_row` for the semantics of each mode.
47
67
  sig do
48
68
  params(
49
69
  input: T.any(String, File, StringIO, IO),
50
70
  result_type: T.nilable(T.any(String, Symbol)),
51
71
  columns: T.nilable(T::Array[String]),
52
72
  batch_size: T.nilable(Integer),
53
- strict: T.nilable(T::Boolean)
73
+ strict: T.nilable(T::Boolean),
74
+ string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
54
75
  ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
55
76
  end
56
77
  sig do
@@ -60,11 +81,12 @@ module Parquet
60
81
  columns: T.nilable(T::Array[String]),
61
82
  batch_size: T.nilable(Integer),
62
83
  strict: T.nilable(T::Boolean),
84
+ string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
63
85
  blk:
64
86
  T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
65
87
  ).returns(NilClass)
66
88
  end
67
- def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, &blk)
89
+ def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
68
90
  end
69
91
 
70
92
  # Options:
@@ -79,11 +101,19 @@ module Parquet
79
101
  # - `date32`
80
102
  # - `timestamp_millis`, `timestamp_micros`
81
103
  # - `write_to`: String path or IO object to write the parquet file to
82
- # - `batch_size`: Optional batch size for writing (defaults to 1000)
83
- # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
104
+ # - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
105
+ # for one-column schemas; wide schemas may have a lower safety cap)
106
+ # - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
107
+ # buffer before a row group is flushed (defaults to 100MB)
84
108
  # - `compression`: Optional compression type to use (defaults to "zstd")
85
109
  # Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
86
- # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
110
+ # - `sample_size`: Optional positive number of rows to sample for size estimation
111
+ # (defaults to 100, at most 10_000)
112
+ # - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
113
+ # disables it, `true` enables it with a default capacity, and an Integer
114
+ # enables it with that many retained distinct strings (at most 65_536).
115
+ # Retention also skips values larger than 4KB and stops after 16MB of
116
+ # cached string content.
87
117
  sig do
88
118
  params(
89
119
  read_from: T::Enumerator[T::Array[T.untyped]],
@@ -92,7 +122,8 @@ module Parquet
92
122
  batch_size: T.nilable(Integer),
93
123
  flush_threshold: T.nilable(Integer),
94
124
  compression: T.nilable(String),
95
- sample_size: T.nilable(Integer)
125
+ sample_size: T.nilable(Integer),
126
+ string_cache: T.nilable(T.any(T::Boolean, Integer))
96
127
  ).void
97
128
  end
98
129
  def self.write_rows(
@@ -102,7 +133,8 @@ module Parquet
102
133
  batch_size: nil,
103
134
  flush_threshold: nil,
104
135
  compression: nil,
105
- sample_size: nil
136
+ sample_size: nil,
137
+ string_cache: nil
106
138
  )
107
139
  end
108
140
 
@@ -119,18 +151,28 @@ module Parquet
119
151
  # - `timestamp_millis`, `timestamp_micros`
120
152
  # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
121
153
  # - `write_to`: String path or IO object to write the parquet file to
122
- # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
154
+ # - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
155
+ # buffer before a row group is flushed (defaults to 100MB)
123
156
  # - `compression`: Optional compression type to use (defaults to "zstd")
124
157
  # Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
158
+ # - `logger`: Optional Ruby logger for column-write progress messages
125
159
  sig do
126
160
  params(
127
161
  read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
128
162
  schema: T::Array[T::Hash[String, String]],
129
163
  write_to: T.any(String, IO),
130
164
  flush_threshold: T.nilable(Integer),
131
- compression: T.nilable(String)
165
+ compression: T.nilable(String),
166
+ logger: T.nilable(T.untyped)
132
167
  ).void
133
168
  end
134
- def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
169
+ def self.write_columns(
170
+ read_from,
171
+ schema:,
172
+ write_to:,
173
+ flush_threshold: nil,
174
+ compression: nil,
175
+ logger: nil
176
+ )
135
177
  end
136
178
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.0
5
5
  platform: aarch64-linux
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-07-05 00:00:00.000000000 Z
11
+ date: 2026-06-25 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bigdecimal
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: rake-compiler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -42,6 +56,7 @@ files:
42
56
  - lib/parquet/3.2/parquet.so
43
57
  - lib/parquet/3.3/parquet.so
44
58
  - lib/parquet/3.4/parquet.so
59
+ - lib/parquet/4.0/parquet.so
45
60
  - lib/parquet/schema.rb
46
61
  - lib/parquet/version.rb
47
62
  homepage: https://github.com/njaremko/parquet-ruby
@@ -65,7 +80,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
65
80
  version: '3.2'
66
81
  - - "<"
67
82
  - !ruby/object:Gem::Version
68
- version: 3.5.dev
83
+ version: 4.1.dev
69
84
  required_rubygems_version: !ruby/object:Gem::Requirement
70
85
  requirements:
71
86
  - - ">="