polars-df 0.21.0-x86_64-linux-musl → 0.22.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/LICENSE-THIRD-PARTY.txt +23 -49
- data/README.md +12 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +794 -27
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +26 -5
- data/lib/polars/date_time_expr.rb +252 -1
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1248 -206
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +14 -1
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +32 -7
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +1028 -28
- data/lib/polars/list_expr.rb +217 -17
- data/lib/polars/list_name_space.rb +231 -22
- data/lib/polars/meta_expr.rb +89 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1053 -54
- data/lib/polars/string_expr.rb +436 -32
- data/lib/polars/string_name_space.rb +736 -50
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +22 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +8 -2
data/lib/polars/functions/business.rb
ADDED
@@ -0,0 +1,95 @@
+module Polars
+  module Functions
+    # Count the number of business days between `start` and `end` (not including `end`).
+    #
+    # @note
+    #   This functionality is considered **unstable**. It may be changed
+    #   at any point without it being considered a breaking change.
+    #
+    # @param start [Object]
+    #   Start dates.
+    # @param stop [Object]
+    #   End dates.
+    # @param week_mask [Array]
+    #   Which days of the week to count. The default is Monday to Friday.
+    #   If you wanted to count only Monday to Thursday, you would pass
+    #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Array]
+    #   Holidays to exclude from the count.
+    #
+    # @return [Expr]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)],
+    #       "end" => [Date.new(2020, 1, 2), Date.new(2020, 1, 10)]
+    #     }
+    #   )
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count("start", "end")
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 6                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    #
+    # @example You can pass a custom weekend - for example, if you only take Sunday off:
+    #   week_mask = [true, true, true, true, true, true, false]
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count(
+    #       "start", "end", week_mask: week_mask
+    #     )
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 7                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    #
+    # @example You can also pass a list of holidays to exclude from the count:
+    #   holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count("start", "end", holidays: holidays)
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 0                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 5                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    def business_day_count(
+      start,
+      stop,
+      week_mask: [true, true, true, true, true, false, false],
+      holidays: []
+    )
+      start_rbexpr = Utils.parse_into_expression(start)
+      end_rbexpr = Utils.parse_into_expression(stop)
+      unix_epoch = ::Date.new(1970, 1, 1)
+      Utils.wrap_expr(
+        Plr.business_day_count(
+          start_rbexpr,
+          end_rbexpr,
+          week_mask,
+          holidays.map { |holiday| holiday - unix_epoch }
+        )
+      )
+    end
+  end
+end
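The `holidays:` values are converted before they cross into native code: Ruby `Date` subtraction returns a `Rational` number of days, so `holiday - unix_epoch` yields the days-since-epoch offsets that `Plr.business_day_count` consumes. A minimal sketch of that conversion in plain Ruby (no Polars required):

    require "date"

    unix_epoch = Date.new(1970, 1, 1)
    holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]

    # Date#- returns a Rational count of days between two dates.
    holidays.map { |h| (h - unix_epoch).to_i }
    # => [18262, 18263]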
data/lib/polars/functions/datatype.rb
ADDED
@@ -0,0 +1,21 @@
+module Polars
+  module Functions
+    # Get a lazily evaluated :class:`DataType` of a column or expression.
+    #
+    # @note
+    #   This functionality is considered **unstable**. It may be changed
+    #   at any point without it being considered a breaking change.
+    #
+    # @return [DataTypeExpr]
+    def dtype_of(col_or_expr)
+      e = nil
+      if col_or_expr.is_a?(::String)
+        e = F.col(col_or_expr)
+      else
+        e = col_or_expr
+      end
+
+      DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.of_expr(e._rbexpr))
+    end
+  end
+end
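A hedged usage sketch for `dtype_of`: in Python Polars, the analogous `pl.dtype_of` can be passed to `cast` so one column adopts another's dtype at plan time. Assuming the Ruby port mirrors that (the `cast` interplay is an assumption; only `Polars.dtype_of` itself appears in this diff):

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1.5, 2.5, 3.5]})

    # Assumed: cast "a" to whatever dtype "b" resolves to at plan time.
    df.with_columns(Polars.col("a").cast(Polars.dtype_of("b")))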
data/lib/polars/functions/lazy.rb
CHANGED
@@ -1,5 +1,18 @@
 module Polars
   module Functions
+    # Select a field in the current `struct.with_fields` scope.
+    #
+    # @param name [Object]
+    #   Name of the field(s) to select.
+    #
+    # @return [Expr]
+    def field(name)
+      if name.is_a?(::String)
+        name = [name]
+      end
+      Utils.wrap_expr(Plr.field(name))
+    end
+
     # Alias for an element in evaluated in an `eval` expression.
     #
     # @return [Expr]
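`Polars.field` is only meaningful inside a `struct.with_fields` scope; `struct_expr.rb` gains 103 lines in this release (see the file list above), which is where `with_fields` is expected to live. A hedged sketch modeled on the Python API:

    df = Polars::DataFrame.new({"coords" => [{"x" => 1, "y" => 4}, {"x" => 2, "y" => 5}]})

    # Inside with_fields, Polars.field("x") refers to the struct's own "x" field.
    df.with_columns(
      Polars.col("coords").struct.with_fields(Polars.field("x") * 2)
    )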
@@ -810,7 +823,7 @@ module Polars
     #   # ┌─────┐
     #   # │ sum │
     #   # │ --- │
-    #   # │
+    #   # │ i32 │
     #   # ╞═════╡
     #   # │ 10  │
     #   # │ 13  │
data/lib/polars/io/csv.rb
CHANGED
@@ -499,7 +499,7 @@ module Polars
     #   for instance `#`.
     # @param quote_char [String]
     #   Single byte character used for csv quoting.
-    #   Set to
+    #   Set to nil to turn off special handling and escaping of quotes.
     # @param skip_rows [Integer]
     #   Start reading after `skip_rows` lines. The header will be parsed at this
     #   offset.
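The restored sentence documents behavior that can be exercised directly: passing `quote_char: nil` disables quote handling entirely, so quote characters are read as ordinary bytes. A hedged one-liner (file name hypothetical):

    df = Polars.read_csv("messy.csv", quote_char: nil)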
data/lib/polars/io/iceberg.rb
ADDED
@@ -0,0 +1,27 @@
+module Polars
+  module IO
+    # Lazily read from an Apache Iceberg table.
+    #
+    # @param source [Object]
+    #   A Iceberg Ruby table, or a direct path to the metadata.
+    # @param snapshot_id [Integer]
+    #   The snapshot ID to scan from.
+    # @param storage_options [Hash]
+    #   Extra options for the storage backends.
+    #
+    # @return [LazyFrame]
+    def scan_iceberg(
+      source,
+      snapshot_id: nil,
+      storage_options: nil
+    )
+      require "iceberg"
+
+      unless source.is_a?(Iceberg::Table)
+        raise Todo
+      end
+
+      source.to_polars(snapshot_id:, storage_options:)
+    end
+  end
+end
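Since `scan_iceberg` raises `Todo` for anything that is not an `Iceberg::Table` and otherwise delegates to `Iceberg::Table#to_polars`, usage amounts to loading a table with the `iceberg` gem first. A hedged sketch; table construction is elided because the iceberg gem's catalog API is outside this diff:

    require "iceberg"

    # `table` is an Iceberg::Table obtained via the iceberg gem (not shown).
    lf = Polars.scan_iceberg(table, snapshot_id: nil)
    lf.select("id", "name").collect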
data/lib/polars/io/json.rb
CHANGED
@@ -7,11 +7,11 @@ module Polars
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a
-    #   * As
-    #   * As
+    #   * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
     #
-    #   If you supply
+    #   If you supply an array of column names that does not match the names in the
     #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
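The restored bullets correspond to call shapes like the following hedged sketch (file name and column names hypothetical):

    # Hash form: explicit name => type pairs; a nil type is auto-inferred.
    Polars.read_json("events.json", schema: {"id" => Polars::Int64, "name" => Polars::String})

    # Array-of-names form: dtypes are inferred from the underlying data.
    Polars.read_json("events.json", schema: ["id", "name"])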
data/lib/polars/io/ndjson.rb
CHANGED
@@ -7,11 +7,11 @@ module Polars
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a
-    #   * As
-    #   * As
+    #   * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
     #
-    #   If you supply
+    #   If you supply an array of column names that does not match the names in the
    #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
data/lib/polars/io/parquet.rb
CHANGED
@@ -43,7 +43,7 @@ module Polars
     #   Extra options that make sense for a particular storage connection.
     # @param credential_provider [Object]
     #   Provide a function that can be called to provide cloud storage
-    #   credentials. The function is expected to return a
+    #   credentials. The function is expected to return a hash of
     #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
@@ -117,13 +117,31 @@ module Polars
     # @param source [Object]
     #   Path to a file or a file-like object.
     #
-    # @return [
+    # @return [Schema]
     def read_parquet_schema(source)
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
       end
 
-
+      scan_parquet(source).collect_schema
+    end
+
+    # Get file-level custom metadata of a Parquet file without reading data.
+    #
+    # @note
+    #   This functionality is considered **experimental**. It may be removed or
+    #   changed at any point without it being considered a breaking change.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
+    def read_parquet_metadata(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      end
+
+      Plr.read_parquet_metadata(source)
     end
 
     # Lazily read from a parquet file or multiple files via glob patterns.
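A hedged sketch of the two readers touched above: `read_parquet_schema` now collects its result from a lazy scan, and the new `read_parquet_metadata` returns file-level key/value metadata without reading data (file name and metadata keys hypothetical):

    Polars.read_parquet_schema("data.parquet")   # => Schema of column names and dtypes
    Polars.read_parquet_metadata("data.parquet") # => e.g. {"creator" => "etl-job"}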
@@ -171,7 +189,7 @@ module Polars
     #   Extra options that make sense for a particular storage connection.
     # @param credential_provider [Object]
     #   Provide a function that can be called to provide cloud storage
-    #   credentials. The function is expected to return a
+    #   credentials. The function is expected to return a hash of
     #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
@@ -188,6 +206,9 @@ module Polars
     #   defined schema are encountered in the data:
     #   * `ignore`: Silently ignores.
     #   * `raise`: Raises an error.
+    # @param cast_options [Object]
+    #   Configuration for column type-casting during scans. Useful for datasets
+    #   containing files that have differing schemas.
     #
     # @return [LazyFrame]
     def scan_parquet(
@@ -210,7 +231,10 @@ module Polars
       retries: 2,
       include_file_paths: nil,
       allow_missing_columns: false,
-      extra_columns: "raise"
+      extra_columns: "raise",
+      cast_options: nil,
+      _column_mapping: nil,
+      _deletion_files: nil
     )
       missing_columns = allow_missing_columns ? "insert" : "raise"
 
@@ -247,7 +271,7 @@ module Polars
       ScanOptions.new(
         row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
         pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
-
+        cast_options: cast_options,
         extra_columns: extra_columns,
         missing_columns: missing_columns,
         include_file_paths: include_file_paths,
@@ -260,7 +284,8 @@ module Polars
         storage_options: storage_options,
         # credential_provider: credential_provider_builder,
         retries: retries,
-
+        deletion_files: _deletion_files,
+        column_mapping: _column_mapping
       ),
       parallel,
       low_memory,
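The new `cast_options:` keyword is threaded straight into `ScanOptions` above, and `scan_cast_options.rb` changes in step (see the file list). A hedged sketch; the `ScanCastOptions.new` parameter name is assumed from the Python API, not from this diff:

    # Allow lossless integer upcasts when files in the glob disagree on schema.
    lf = Polars.scan_parquet(
      "data/**/*.parquet",
      cast_options: Polars::ScanCastOptions.new(integer_cast: "upcast")
    )
    lf.collect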
data/lib/polars/io/scan_options.rb
CHANGED
@@ -3,7 +3,8 @@ module Polars
   class ScanOptions
     attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
       :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
-      :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
+      :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
+      :default_values, :deletion_files
 
     def initialize(
       row_index: nil,
@@ -22,6 +23,7 @@ module Polars
       credential_provider: nil,
       retries: 2,
       column_mapping: nil,
+      default_values: nil,
       deletion_files: nil
     )
       @row_index = row_index
@@ -40,6 +42,7 @@ module Polars
       @credential_provider = credential_provider
       @retries = retries
       @column_mapping = column_mapping
+      @default_values = default_values
       @deletion_files = deletion_files
     end
   end