polars-df 0.19.0-x64-mingw-ucrt → 0.21.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/Cargo.lock +211 -320
  4. data/LICENSE-THIRD-PARTY.txt +1376 -2634
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.so +0 -0
  7. data/lib/polars/3.3/polars.so +0 -0
  8. data/lib/polars/3.4/polars.so +0 -0
  9. data/lib/polars/cat_name_space.rb +3 -43
  10. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  11. data/lib/polars/catalog/unity/column_info.rb +31 -0
  12. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  13. data/lib/polars/catalog/unity/table_info.rb +50 -0
  14. data/lib/polars/catalog.rb +448 -0
  15. data/lib/polars/convert.rb +10 -0
  16. data/lib/polars/data_frame.rb +151 -30
  17. data/lib/polars/data_types.rb +47 -3
  18. data/lib/polars/exceptions.rb +7 -2
  19. data/lib/polars/expr.rb +48 -39
  20. data/lib/polars/functions/col.rb +6 -5
  21. data/lib/polars/functions/eager.rb +1 -1
  22. data/lib/polars/functions/lazy.rb +114 -15
  23. data/lib/polars/functions/repeat.rb +4 -0
  24. data/lib/polars/io/csv.rb +18 -0
  25. data/lib/polars/io/json.rb +16 -0
  26. data/lib/polars/io/ndjson.rb +13 -0
  27. data/lib/polars/io/parquet.rb +45 -63
  28. data/lib/polars/io/scan_options.rb +47 -0
  29. data/lib/polars/lazy_frame.rb +163 -75
  30. data/lib/polars/list_expr.rb +213 -17
  31. data/lib/polars/list_name_space.rb +121 -8
  32. data/lib/polars/meta_expr.rb +14 -29
  33. data/lib/polars/scan_cast_options.rb +64 -0
  34. data/lib/polars/schema.rb +6 -1
  35. data/lib/polars/selector.rb +138 -0
  36. data/lib/polars/selectors.rb +931 -202
  37. data/lib/polars/series.rb +46 -19
  38. data/lib/polars/string_expr.rb +24 -3
  39. data/lib/polars/string_name_space.rb +12 -1
  40. data/lib/polars/utils/parse.rb +40 -0
  41. data/lib/polars/utils.rb +5 -1
  42. data/lib/polars/version.rb +1 -1
  43. data/lib/polars.rb +8 -0
  44. metadata +10 -2
@@ -458,7 +458,7 @@ module Polars
458
458
  # # └─────┴─────┘
459
459
  def first(*columns)
460
460
  if columns.empty?
461
- return Utils.wrap_expr(Plr.first)
461
+ return cs.first.as_expr
462
462
  end
463
463
 
464
464
  col(*columns).first
@@ -518,7 +518,7 @@ module Polars
518
518
  # # └─────┴─────┘
519
519
  def last(*columns)
520
520
  if columns.empty?
521
- return Utils.wrap_expr(Plr.last)
521
+ return cs.last.as_expr
522
522
  end
523
523
 
524
524
  col(*columns).last
@@ -565,12 +565,8 @@ module Polars
565
565
  # # │ bar ┆ 8 │
566
566
  # # │ baz ┆ 3 │
567
567
  # # └─────┴─────┘
568
- def nth(*indices)
569
- if indices.length == 1 && indices[0].is_a?(Array)
570
- indices = indices[0]
571
- end
572
-
573
- Utils.wrap_expr(Plr.index_cols(indices))
568
+ def nth(*indices, strict: true)
569
+ cs.by_index(*indices, require_all: strict).as_expr
574
570
  end
575
571
 
576
572
  # Get the first `n` rows.
@@ -675,12 +671,12 @@ module Polars
675
671
  # Column name or Expression.
676
672
  # @param b [Object]
677
673
  # Column name or Expression.
674
+ # @param method ["pearson", "spearman"]
675
+ # Correlation method.
678
676
  # @param ddof [Integer]
679
677
  # "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
680
678
  # where N represents the number of elements.
681
679
  # By default ddof is 1.
682
- # @param method ["pearson", "spearman"]
683
- # Correlation method.
684
680
  # @param propagate_nans [Boolean]
685
681
  # If `true` any `NaN` encountered will lead to `NaN` in the output.
686
682
  # Defaults to `False` where `NaN` are regarded as larger than any finite number
@@ -795,14 +791,82 @@ module Polars
795
791
  # Accumulate over multiple columns horizontally/row wise with a left fold.
796
792
  #
797
793
  # @return [Expr]
798
- def fold(acc, f, exprs)
794
+ #
795
+ # @example Horizontally sum over all columns and add 1.
796
+ # df = Polars::DataFrame.new(
797
+ # {
798
+ # "a" => [1, 2, 3],
799
+ # "b" => [3, 4, 5],
800
+ # "c" => [5, 6, 7]
801
+ # }
802
+ # )
803
+ # df.select(
804
+ # Polars.fold(
805
+ # Polars.lit(1), ->(acc, x) { acc + x }, Polars.col("*")
806
+ # ).alias("sum")
807
+ # )
808
+ # # =>
809
+ # # shape: (3, 1)
810
+ # # ┌─────┐
811
+ # # │ sum │
812
+ # # │ --- │
813
+ # # │ i64 │
814
+ # # ╞═════╡
815
+ # # │ 10 │
816
+ # # │ 13 │
817
+ # # │ 16 │
818
+ # # └─────┘
819
+ #
820
+ # @example You can also apply a condition/predicate on all columns:
821
+ # df = Polars::DataFrame.new(
822
+ # {
823
+ # "a" => [1, 2, 3],
824
+ # "b" => [0, 1, 2]
825
+ # }
826
+ # )
827
+ # df.filter(
828
+ # Polars.fold(
829
+ # Polars.lit(true),
830
+ # ->(acc, x) { acc & x },
831
+ # Polars.col("*") > 1
832
+ # )
833
+ # )
834
+ # # =>
835
+ # # shape: (1, 2)
836
+ # # ┌─────┬─────┐
837
+ # # │ a ┆ b │
838
+ # # │ --- ┆ --- │
839
+ # # │ i64 ┆ i64 │
840
+ # # ╞═════╪═════╡
841
+ # # │ 3 ┆ 2 │
842
+ # # └─────┴─────┘
843
+ def fold(
844
+ acc,
845
+ function,
846
+ exprs,
847
+ returns_scalar: false,
848
+ return_dtype: nil
849
+ )
799
850
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
800
851
  if exprs.is_a?(Expr)
801
852
  exprs = [exprs]
802
853
  end
803
854
 
855
+ rt = nil
856
+ if !return_dtype.nil?
857
+ rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
858
+ end
859
+
804
860
  exprs = Utils.parse_into_list_of_expressions(exprs)
805
- Utils.wrap_expr(Plr.fold(acc, f, exprs))
861
+ Utils.wrap_expr(
862
+ Plr.fold(
863
+ acc,
864
+ function,
865
+ exprs,
866
+ returns_scalar,
867
+ rt
868
+ )
869
+ )
806
870
  end
807
871
 
808
872
  # def reduce
@@ -815,11 +879,17 @@ module Polars
815
879
  # @param acc [Object]
816
880
  # Accumulator Expression. This is the value that will be initialized when the fold
817
881
  # starts. For a sum this could for instance be lit(0).
818
- # @param f [Object]
882
+ # @param function [Object]
819
883
  # Function to apply over the accumulator and the value.
820
884
  # Fn(acc, value) -> new_value
821
885
  # @param exprs [Object]
822
886
  # Expressions to aggregate over. May also be a wildcard expression.
887
+ # @param returns_scalar [Boolean]
888
+ # Whether or not `function` applied returns a scalar. This must be set correctly
889
+ # by the user.
890
+ # @param return_dtype [Object]
891
+ # Output datatype.
892
+ # If not set, the dtype will be inferred based on the dtype of the accumulator.
823
893
  # @param include_init [Boolean]
824
894
  # Include the initial accumulator state as struct field.
825
895
  #
@@ -851,14 +921,35 @@ module Polars
851
921
  # # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
852
922
  # # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
853
923
  # # └─────┴─────┴─────┴───────────┘
854
- def cum_fold(acc, f, exprs, include_init: false)
924
+ def cum_fold(
925
+ acc,
926
+ function,
927
+ exprs,
928
+ returns_scalar: false,
929
+ return_dtype: nil,
930
+ include_init: false
931
+ )
855
932
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
856
933
  if exprs.is_a?(Expr)
857
934
  exprs = [exprs]
858
935
  end
859
936
 
937
+ rt = nil
938
+ if !return_dtype.nil?
939
+ rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
940
+ end
941
+
860
942
  exprs = Utils.parse_into_list_of_expressions(exprs)
861
- Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
943
+ Utils.wrap_expr(
944
+ Plr.cum_fold(
945
+ acc,
946
+ function,
947
+ exprs,
948
+ returns_scalar,
949
+ rt,
950
+ include_init
951
+ )._alias("cum_fold")
952
+ )
862
953
  end
863
954
  alias_method :cumfold, :cum_fold
864
955
 
@@ -1047,8 +1138,16 @@ module Polars
1047
1138
  #
1048
1139
  # @param exprs [Object]
1049
1140
  # Columns use to determine the ordering.
1141
+ # @param more_exprs [Array]
1142
+ # Additional columns to arg sort by, specified as positional arguments.
1050
1143
  # @param reverse [Boolean]
1051
1144
  # Default is ascending.
1145
+ # @param nulls_last [Boolean]
1146
+ # Place null values last.
1147
+ # @param multithreaded [Boolean]
1148
+ # Sort using multiple threads.
1149
+ # @param maintain_order [Boolean]
1150
+ # Whether the order should be maintained if elements are equal.
1052
1151
  #
1053
1152
  # @return [Expr]
1054
1153
  #
@@ -6,6 +6,10 @@ module Polars
6
6
  # Value to repeat.
7
7
  # @param n [Integer]
8
8
  # Repeat `n` times.
9
+ # @param dtype [Object]
10
+ # Data type of the resulting column. If set to `nil` (default), data type is
11
+ # inferred from the given value. Defaults to Int32 for integer values, unless
12
+ # Int64 is required to fit the given value. Defaults to Float64 for float values.
9
13
  # @param eager [Boolean]
10
14
  # Run eagerly and collect into a `Series`.
11
15
  # @param name [String]
data/lib/polars/io/csv.rb CHANGED
@@ -347,6 +347,9 @@ module Polars
347
347
  # - `String`: All values equal to this string will be null.
348
348
  # - `Array`: All values equal to any string in this array will be null.
349
349
  # - `Hash`: A hash that maps column name to a null value string.
350
+ # @param missing_utf8_is_empty_string [Boolean]
351
+ # By default a missing value is considered to be null; if you would prefer missing
352
+ # utf8 values to be treated as the empty string you can set this param true.
350
353
  # @param ignore_errors [Boolean]
351
354
  # Try to keep reading lines if some lines yield errors.
352
355
  # First try `infer_schema_length: 0` to read all columns as
@@ -387,8 +390,13 @@ module Polars
387
390
  # Offset to start the row_count column (only used if the name is set).
388
391
  # @param eol_char [String]
389
392
  # Single byte end of line character.
393
+ # @param raise_if_empty [Boolean]
394
+ # When there is no data in the source, `NoDataError` is raised. If this parameter
395
+ # is set to false, `nil` will be returned from `next_batches(n)` instead.
390
396
  # @param truncate_ragged_lines [Boolean]
391
397
  # Truncate lines that are longer than the schema.
398
+ # @param decimal_comma [Boolean]
399
+ # Parse floats using a comma as the decimal separator instead of a period.
392
400
  #
393
401
  # @return [BatchedCsvReader]
394
402
  #
@@ -503,6 +511,9 @@ module Polars
503
511
  # - `String`: All values equal to this string will be null.
504
512
  # - `Array`: All values equal to any string in this array will be null.
505
513
  # - `Hash`: A hash that maps column name to a null value string.
514
+ # @param missing_utf8_is_empty_string [Boolean]
515
+ # By default a missing value is considered to be null; if you would prefer missing
516
+ # utf8 values to be treated as the empty string you can set this param true.
506
517
  # @param ignore_errors [Boolean]
507
518
  # Try to keep reading lines if some lines yield errors.
508
519
  # First try `infer_schema_length: 0` to read all columns as
@@ -538,8 +549,15 @@ module Polars
538
549
  # the column remains of data type `:str`.
539
550
  # @param eol_char [String]
540
551
  # Single byte end of line character.
552
+ # @param raise_if_empty [Boolean]
553
+ # When there is no data in the source, `NoDataError` is raised. If this parameter
554
+ # is set to false, an empty LazyFrame (with no columns) is returned instead.
541
555
  # @param truncate_ragged_lines [Boolean]
542
556
  # Truncate lines that are longer than the schema.
557
+ # @param decimal_comma [Boolean]
558
+ # Parse floats using a comma as the decimal separator instead of a period.
559
+ # @param glob [Boolean]
560
+ # Expand path given via globbing rules.
543
561
  #
544
562
  # @return [LazyFrame]
545
563
  def scan_csv(
@@ -4,6 +4,22 @@ module Polars
4
4
  #
5
5
  # @param source [Object]
6
6
  # Path to a file or a file-like object.
7
+ # @param schema [Object]
8
+ # The DataFrame schema may be declared in several ways:
9
+ #
10
+ # * As a dict of {name:type} pairs; if type is nil, it will be auto-inferred.
11
+ # * As a list of column names; in this case types are automatically inferred.
12
+ # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
13
+ #
14
+ # If you supply a list of column names that does not match the names in the
15
+ # underlying data, the names given here will overwrite them. The number
16
+ # of names given in the schema should match the underlying data dimensions.
17
+ # @param schema_overrides [Hash]
18
+ # Support type specification or override of one or more columns; note that
19
+ # any dtypes inferred from the schema param will be overridden.
20
+ # @param infer_schema_length [Integer]
21
+ # The maximum number of rows to scan for schema inference.
22
+ # If set to `nil`, the full data may be scanned *(this is slow)*.
7
23
  #
8
24
  # @return [DataFrame]
9
25
  def read_json(
@@ -4,6 +4,19 @@ module Polars
4
4
  #
5
5
  # @param source [Object]
6
6
  # Path to a file or a file-like object.
7
+ # @param schema [Object]
8
+ # The DataFrame schema may be declared in several ways:
9
+ #
10
+ # * As a dict of {name:type} pairs; if type is nil, it will be auto-inferred.
11
+ # * As a list of column names; in this case types are automatically inferred.
12
+ # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
13
+ #
14
+ # If you supply a list of column names that does not match the names in the
15
+ # underlying data, the names given here will overwrite them. The number
16
+ # of names given in the schema should match the underlying data dimensions.
17
+ # @param schema_overrides [Hash]
18
+ # Support type specification or override of one or more columns; note that
19
+ # any dtypes inferred from the schema param will be overridden.
7
20
  #
8
21
  # @return [DataFrame]
9
22
  def read_ndjson(
@@ -49,6 +49,12 @@ module Polars
49
49
  # Number of retries if accessing a cloud instance fails.
50
50
  # @param include_file_paths [String]
51
51
  # Include the path of the source file(s) as a column with this name.
52
+ # @param allow_missing_columns [Boolean]
53
+ # When reading a list of parquet files, if a column existing in the first
54
+ # file cannot be found in subsequent files, the default behavior is to
55
+ # raise an error. However, if `allow_missing_columns` is set to
56
+ # `true`, a full-NULL column is returned instead of erroring for the files
57
+ # that do not contain the column.
52
58
  #
53
59
  # @return [DataFrame]
54
60
  def read_parquet(
@@ -171,6 +177,17 @@ module Polars
171
177
  # Number of retries if accessing a cloud instance fails.
172
178
  # @param include_file_paths [String]
173
179
  # Include the path of the source file(s) as a column with this name.
180
+ # @param allow_missing_columns [Boolean]
181
+ # When reading a list of parquet files, if a column existing in the first
182
+ # file cannot be found in subsequent files, the default behavior is to
183
+ # raise an error. However, if `allow_missing_columns` is set to
184
+ # `true`, a full-NULL column is returned instead of erroring for the files
185
+ # that do not contain the column.
186
+ # @param extra_columns ['ignore', 'raise']
187
+ # Configuration for behavior when extra columns outside of the
188
+ # defined schema are encountered in the data:
189
+ # * `ignore`: Silently ignores.
190
+ # * `raise`: Raises an error.
174
191
  #
175
192
  # @return [LazyFrame]
176
193
  def scan_parquet(
@@ -192,8 +209,11 @@ module Polars
192
209
  credential_provider: nil,
193
210
  retries: 2,
194
211
  include_file_paths: nil,
195
- allow_missing_columns: false
212
+ allow_missing_columns: false,
213
+ extra_columns: "raise"
196
214
  )
215
+ missing_columns = allow_missing_columns ? "insert" : "raise"
216
+
197
217
  if Utils.pathlike?(source)
198
218
  source = Utils.normalize_filepath(source, check_not_directory: false)
199
219
  elsif Utils.is_path_or_str_sequence(source)
@@ -204,56 +224,11 @@ module Polars
204
224
  raise Todo
205
225
  end
206
226
 
207
- _scan_parquet_impl(
208
- source,
209
- n_rows: n_rows,
210
- cache: cache,
211
- parallel: parallel,
212
- rechunk: rechunk,
213
- row_index_name: row_count_name,
214
- row_index_offset: row_count_offset,
215
- storage_options: storage_options,
216
- credential_provider: credential_provider,
217
- low_memory: low_memory,
218
- use_statistics: use_statistics,
219
- hive_partitioning: hive_partitioning,
220
- schema: schema,
221
- hive_schema: hive_schema,
222
- try_parse_hive_dates: try_parse_hive_dates,
223
- retries: retries,
224
- glob: glob,
225
- include_file_paths: include_file_paths,
226
- allow_missing_columns: allow_missing_columns
227
- )
228
- end
229
-
230
- # @private
231
- def _scan_parquet_impl(
232
- source,
233
- n_rows: nil,
234
- cache: true,
235
- parallel: "auto",
236
- rechunk: true,
237
- row_index_name: nil,
238
- row_index_offset: 0,
239
- storage_options: nil,
240
- credential_provider: nil,
241
- low_memory: false,
242
- use_statistics: true,
243
- hive_partitioning: nil,
244
- glob: true,
245
- schema: nil,
246
- hive_schema: nil,
247
- try_parse_hive_dates: true,
248
- retries: 2,
249
- include_file_paths: nil,
250
- allow_missing_columns: false
251
- )
252
227
  if source.is_a?(::Array)
253
228
  sources = source
254
229
  source = nil
255
230
  else
256
- sources = []
231
+ sources = [source]
257
232
  end
258
233
 
259
234
  if storage_options
@@ -262,27 +237,34 @@ module Polars
262
237
  storage_options = nil
263
238
  end
264
239
 
240
+ row_index_name = row_count_name
241
+ row_index_offset = row_count_offset
242
+
265
243
  rblf =
266
244
  RbLazyFrame.new_from_parquet(
267
- source,
268
245
  sources,
269
- n_rows,
270
- cache,
246
+ schema,
247
+ ScanOptions.new(
248
+ row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
249
+ pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
250
+ # cast_options: cast_options,
251
+ extra_columns: extra_columns,
252
+ missing_columns: missing_columns,
253
+ include_file_paths: include_file_paths,
254
+ glob: glob,
255
+ hive_partitioning: hive_partitioning,
256
+ hive_schema: hive_schema,
257
+ try_parse_hive_dates: try_parse_hive_dates,
258
+ rechunk: rechunk,
259
+ cache: cache,
260
+ storage_options: storage_options,
261
+ # credential_provider: credential_provider_builder,
262
+ retries: retries,
263
+ # deletion_files: _deletion_files
264
+ ),
271
265
  parallel,
272
- rechunk,
273
- Utils.parse_row_index_args(row_index_name, row_index_offset),
274
266
  low_memory,
275
- storage_options,
276
- credential_provider,
277
- use_statistics,
278
- hive_partitioning,
279
- schema,
280
- hive_schema,
281
- try_parse_hive_dates,
282
- retries,
283
- glob,
284
- include_file_paths,
285
- allow_missing_columns
267
+ use_statistics
286
268
  )
287
269
  Utils.wrap_ldf(rblf)
288
270
  end
@@ -0,0 +1,47 @@
1
+ module Polars
2
+ module IO
3
+ class ScanOptions
4
+ attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
5
+ :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
6
+ :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping, :deletion_files
7
+
8
+ def initialize(
9
+ row_index: nil,
10
+ pre_slice: nil,
11
+ cast_options: nil,
12
+ extra_columns: "raise",
13
+ missing_columns: "raise",
14
+ include_file_paths: nil,
15
+ glob: true,
16
+ hive_partitioning: nil,
17
+ hive_schema: nil,
18
+ try_parse_hive_dates: true,
19
+ rechunk: false,
20
+ cache: true,
21
+ storage_options: nil,
22
+ credential_provider: nil,
23
+ retries: 2,
24
+ column_mapping: nil,
25
+ deletion_files: nil
26
+ )
27
+ @row_index = row_index
28
+ @pre_slice = pre_slice
29
+ @cast_options = cast_options
30
+ @extra_columns = extra_columns
31
+ @missing_columns = missing_columns
32
+ @include_file_paths = include_file_paths
33
+ @glob = glob
34
+ @hive_partitioning = hive_partitioning
35
+ @hive_schema = hive_schema
36
+ @try_parse_hive_dates = try_parse_hive_dates
37
+ @rechunk = rechunk
38
+ @cache = cache
39
+ @storage_options = storage_options
40
+ @credential_provider = credential_provider
41
+ @retries = retries
42
+ @column_mapping = column_mapping
43
+ @deletion_files = deletion_files
44
+ end
45
+ end
46
+ end
47
+ end