RubyGems - parquet - Versions diffs - 0.7.3-aarch64-linux → 0.8.0-aarch64-linux - Mend

parquet 0.7.3-aarch64-linux → 0.8.0-aarch64-linux

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6f9f906c15b3bb0dd6ffebdf8805583e3d70bc4744126a6e5b5b8a9141f7aac3
-  data.tar.gz: 9e5dccc0659a2a0bebc5585cd99bf6ccf9a688d685ad122006a6c6bc426e2450
+  metadata.gz: 89db55543839853aef62e3f11511ba4ff54ee1896c7085b83d9e6e44d2b10335
+  data.tar.gz: 07db9c35d4c4777d9339f92b22112760c9ff0e96091b7dc26940c54ed4df03b8
 SHA512:
-  metadata.gz: 5b35a9d549a84fdfe670ede85172336ef3fecffb0be714962984830183cec4ace99abd2189647d3bc6f2fbf8488a401d2cb0470aa4e4f0edd633de551c1366db
-  data.tar.gz: 351a0d0964446a2a2016d7e088ae435e3f11e6cd1974ce171376fb3789649da58b8d7c281be7478ffb50084b2b07df1f11176c49d447a009391af7267c2840c0
+  metadata.gz: 3f78682dc2b7b4d5aa3fa186b7cf743b1808c93644ee2097d76051c21e900da23578493a3d0e637b4bc61dd9ce06a85865fdf46e33a24acf46ff059519b5199f
+  data.tar.gz: ca2f9c79cfd0faf50567c7df2d73cbbea508e5ade7234ac8d2791288d0f174d1b34f707e63ef29765cdd6573ca34ac8ee0694f21d3ba46d7a8fd0ee49eb2f3cc

data/Gemfile CHANGED Viewed

@@ -16,5 +16,6 @@ end
 group :test do
   gem "csv"
+  gem "logger"
   gem "minitest", "~> 5.0"
 end

data/README.md CHANGED Viewed

@@ -166,7 +166,7 @@ end
 Parquet.write_rows(rows,
   schema: schema,
   write_to: "output.parquet",
-  batch_size: 5000  # Rows per batch (default: 1000)
+  batch_size: 5000  # Positive rows per batch (default: 1000)
 )
 ```
@@ -200,6 +200,9 @@ Parquet.write_columns(batches.each,
 )
 ```
+`write_columns` also accepts `logger:` with the same Ruby logger interface as
+row writes.
 ## Data Types
 ### Basic Types
@@ -340,7 +343,7 @@ Parquet.write_rows(data.each, schema: schema, write_to: "complex.parquet")
 ### Timezone Handling in Parquet
-**Critical**: The Parquet specification has a fundamental limitation with timezone storage:
+The Parquet specification has a fundamental limitation with timezone storage:
 1. **UTC-normalized**: Any timestamp with timezone info (including "+09:00" or "America/New_York") is converted to UTC
 2. **Local/unzoned**: Timestamps without timezone info are stored as-is
@@ -382,11 +385,15 @@ Control memory usage with flush thresholds:
 Parquet.write_rows(huge_dataset.each,
   schema: schema,
   write_to: "output.parquet",
-  batch_size: 1000,              # Rows before considering flush
+  batch_size: 1000,              # Positive rows before considering flush
   flush_threshold: 32 * 1024**2  # Flush if batch exceeds 32MB
 )
 ```
+Write batch and sample sizes are bounded before buffer allocation. Very large
+batch sizes are rejected, and wide schemas have a lower effective batch cap so
+the writer cannot reserve unbounded per-column value slots.
 ## Architecture
 This gem uses a modular architecture:

data/lib/parquet/3.2/parquet.so CHANGED Viewed

Binary file

data/lib/parquet/3.3/parquet.so CHANGED Viewed

Binary file

data/lib/parquet/3.4/parquet.so CHANGED Viewed

Binary file

data/lib/parquet/4.0/parquet.so ADDED Viewed

Binary file

data/lib/parquet/schema.rb CHANGED Viewed

@@ -116,8 +116,12 @@ module Parquet
           key_type = kwargs[:key]
           value_type = kwargs[:value]
           raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
-          # Pass key_nullable and value_nullable if provided, otherwise use true as default
-          key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
+          # Map keys are required by the Parquet spec. Reject an explicit nullable
+          # key at this boundary rather than letting it fail deep in the writer.
+          if kwargs[:key_nullable]
+            raise ArgumentError, "map field `#{name}` keys are always required; remove `key_nullable: true`"
+          end
+          key_nullable = false
           value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
           field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)

data/lib/parquet/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.7.3"
+  VERSION = "0.8.0"
 end

data/lib/parquet.rbi CHANGED Viewed

@@ -18,12 +18,29 @@ module Parquet
   #                    ("hash" or "array" or :hash or :array)
   #   - `columns`: When present, only the specified columns will be included in the output.
   #                This is useful for reducing how much data is read and improving performance.
+  #   - `string_storage`: How string *values* become Ruby strings (default `:copy`). Hash keys
+  #                       (struct field names and top-level column names) are always interned and
+  #                       reused regardless of this setting.
+  #                       - `:copy` allocates a fresh mutable String per value.
+  #                       - `:intern` deduplicates low-cardinality equal values into frozen interned
+  #                         Strings up to a bounded per-read cache, then falls back to frozen copies.
+  #                         A transient copy still happens per value, so it is not a per-value speedup.
+  #                       - `:shared` returns frozen, zero-copy strings backed by Rust memory for
+  #                         short, repeated, low-cardinality values. Each read returns at most the
+  #                         configured number of shared values and only values up to the configured
+  #                         byte size; values past those bounds become frozen copies. New process-wide
+  #                         leaks are also capped by the requested budget and hard process ceilings.
+  #                         All `:shared` results are frozen. Not recommended for high-cardinality or
+  #                         large-blob string columns.
+  #                       Pass a hash to set the `:shared` leak budget, e.g.
+  #                       `{ mode: :shared, max_entries: 16_384, max_value_bytes: 1024 }`.
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
-      strict: T.nilable(T::Boolean)
+      strict: T.nilable(T::Boolean),
+      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
   end
   sig do
@@ -32,10 +49,11 @@ module Parquet
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       strict: T.nilable(T::Boolean),
+      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
       blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
     ).returns(NilClass)
   end
-  def self.each_row(input, result_type: nil, columns: nil, strict: nil, &blk)
+  def self.each_row(input, result_type: nil, columns: nil, strict: nil, string_storage: nil, &blk)
   end
   # Options:
@@ -44,13 +62,16 @@ module Parquet
   #                    ("hash" or "array" or :hash or :array)
   #   - `columns`: When present, only the specified columns will be included in the output.
   #   - `batch_size`: When present, specifies the number of rows per batch
+  #   - `string_storage`: How string values become Ruby strings (`:copy` (default), `:intern`,
+  #                       or `:shared`). See `each_row` for the semantics of each mode.
   sig do
     params(
       input: T.any(String, File, StringIO, IO),
       result_type: T.nilable(T.any(String, Symbol)),
       columns: T.nilable(T::Array[String]),
       batch_size: T.nilable(Integer),
-      strict: T.nilable(T::Boolean)
+      strict: T.nilable(T::Boolean),
+      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped]))
     ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
   end
   sig do
@@ -60,11 +81,12 @@ module Parquet
       columns: T.nilable(T::Array[String]),
       batch_size: T.nilable(Integer),
       strict: T.nilable(T::Boolean),
+      string_storage: T.nilable(T.any(String, Symbol, T::Hash[Symbol, T.untyped])),
       blk:
         T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
     ).returns(NilClass)
   end
-  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, &blk)
+  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, strict: nil, string_storage: nil, &blk)
   end
   # Options:
@@ -79,11 +101,19 @@ module Parquet
   #     - `date32`
   #     - `timestamp_millis`, `timestamp_micros`
   #   - `write_to`: String path or IO object to write the parquet file to
-  #   - `batch_size`: Optional batch size for writing (defaults to 1000)
-  #   - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  #   - `batch_size`: Optional positive batch size for writing (defaults to 1000, at most 1_000_000
+  #                   for one-column schemas; wide schemas may have a lower safety cap)
+  #   - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
+  #                        buffer before a row group is flushed (defaults to 100MB)
   #   - `compression`: Optional compression type to use (defaults to "zstd")
   #                   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
-  #   - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
+  #   - `sample_size`: Optional positive number of rows to sample for size estimation
+  #                    (defaults to 100, at most 10_000)
+  #   - `string_cache`: Deduplicate repeated string values while writing. `false` (default)
+  #                     disables it, `true` enables it with a default capacity, and an Integer
+  #                     enables it with that many retained distinct strings (at most 65_536).
+  #                     Retention also skips values larger than 4KB and stops after 16MB of
+  #                     cached string content.
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
@@ -92,7 +122,8 @@ module Parquet
       batch_size: T.nilable(Integer),
       flush_threshold: T.nilable(Integer),
       compression: T.nilable(String),
-      sample_size: T.nilable(Integer)
+      sample_size: T.nilable(Integer),
+      string_cache: T.nilable(T.any(T::Boolean, Integer))
     ).void
   end
   def self.write_rows(
@@ -102,7 +133,8 @@ module Parquet
     batch_size: nil,
     flush_threshold: nil,
     compression: nil,
-    sample_size: nil
+    sample_size: nil,
+    string_cache: nil
   )
   end
@@ -119,18 +151,28 @@ module Parquet
   #     - `timestamp_millis`, `timestamp_micros`
   #     - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   #   - `write_to`: String path or IO object to write the parquet file to
-  #   - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  #   - `flush_threshold`: Optional threshold in bytes for the writer's in-progress (encoded)
+  #                        buffer before a row group is flushed (defaults to 100MB)
   #   - `compression`: Optional compression type to use (defaults to "zstd")
   #                   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  #   - `logger`: Optional Ruby logger for column-write progress messages
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
       flush_threshold: T.nilable(Integer),
-      compression: T.nilable(String)
+      compression: T.nilable(String),
+      logger: T.nilable(T.untyped)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
+  def self.write_columns(
+    read_from,
+    schema:,
+    write_to:,
+    flush_threshold: nil,
+    compression: nil,
+    logger: nil
+  )
   end
 end

metadata CHANGED Viewed

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.7.3
+  version: 0.8.0
 platform: aarch64-linux
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-05 00:00:00.000000000 Z
+date: 2026-06-25 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: bigdecimal
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
@@ -42,6 +56,7 @@ files:
 - lib/parquet/3.2/parquet.so
 - lib/parquet/3.3/parquet.so
 - lib/parquet/3.4/parquet.so
+- lib/parquet/4.0/parquet.so
 - lib/parquet/schema.rb
 - lib/parquet/version.rb
 homepage: https://github.com/njaremko/parquet-ruby
@@ -65,7 +80,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '3.2'
   - - "<"
     - !ruby/object:Gem::Version
-      version: 3.5.dev
+      version: 4.1.dev
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="