RubyGems - polars-df - Versions diffs - 0.25.0 → 0.26.0 - Mend

polars-df 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +33 -0
data/Cargo.lock +270 -97
data/LICENSE.txt +1 -1
data/README.md +1 -3
data/ext/polars/Cargo.toml +19 -18
data/ext/polars/src/catalog/unity.rs +15 -20
data/ext/polars/src/conversion/any_value.rs +53 -29
data/ext/polars/src/conversion/chunked_array.rs +58 -56
data/ext/polars/src/conversion/datetime.rs +58 -7
data/ext/polars/src/conversion/mod.rs +200 -150
data/ext/polars/src/dataframe/export.rs +15 -12
data/ext/polars/src/dataframe/general.rs +25 -7
data/ext/polars/src/dataframe/map.rs +6 -4
data/ext/polars/src/error.rs +1 -1
data/ext/polars/src/expr/array.rs +0 -24
data/ext/polars/src/expr/datatype.rs +13 -3
data/ext/polars/src/expr/datetime.rs +4 -4
data/ext/polars/src/expr/general.rs +35 -15
data/ext/polars/src/expr/list.rs +0 -26
data/ext/polars/src/expr/rolling.rs +24 -0
data/ext/polars/src/functions/business.rs +2 -2
data/ext/polars/src/functions/io.rs +4 -3
data/ext/polars/src/functions/lazy.rs +65 -46
data/ext/polars/src/functions/meta.rs +6 -5
data/ext/polars/src/functions/mod.rs +0 -1
data/ext/polars/src/functions/range.rs +13 -0
data/ext/polars/src/functions/utils.rs +4 -2
data/ext/polars/src/interop/arrow/mod.rs +4 -2
data/ext/polars/src/interop/arrow/to_rb.rs +1 -1
data/ext/polars/src/interop/numo/to_numo_series.rs +26 -25
data/ext/polars/src/io/scan_options.rs +6 -3
data/ext/polars/src/io/sink_options.rs +2 -0
data/ext/polars/src/lazyframe/general.rs +243 -17
data/ext/polars/src/lazyframe/optflags.rs +2 -1
data/ext/polars/src/lib.rs +39 -35
data/ext/polars/src/map/lazy.rs +5 -2
data/ext/polars/src/map/series.rs +19 -18
data/ext/polars/src/on_startup.rs +25 -6
data/ext/polars/src/ruby/numo.rs +3 -4
data/ext/polars/src/ruby/plan_callback.rs +1 -4
data/ext/polars/src/ruby/rb_modules.rs +2 -4
data/ext/polars/src/ruby/ruby_udf.rs +7 -9
data/ext/polars/src/ruby/utils.rs +12 -1
data/ext/polars/src/series/aggregation.rs +13 -1
data/ext/polars/src/series/construction.rs +31 -50
data/ext/polars/src/series/export.rs +33 -38
data/ext/polars/src/series/general.rs +6 -6
data/ext/polars/src/series/map.rs +3 -2
data/ext/polars/src/series/scatter.rs +4 -4
data/ext/polars/src/utils.rs +31 -7
data/lib/polars/array_expr.rb +23 -7
data/lib/polars/array_name_space.rb +16 -2
data/lib/polars/binary_name_space.rb +32 -0
data/lib/polars/collect_batches.rb +4 -0
data/lib/polars/data_frame.rb +144 -11
data/lib/polars/data_type_group.rb +5 -0
data/lib/polars/date_time_expr.rb +91 -3
data/lib/polars/date_time_name_space.rb +7 -1
data/lib/polars/expr.rb +247 -44
data/lib/polars/functions/business.rb +2 -2
data/lib/polars/functions/datatype.rb +30 -0
data/lib/polars/functions/eager.rb +80 -7
data/lib/polars/functions/lazy.rb +97 -2
data/lib/polars/functions/range/linear_space.rb +118 -0
data/lib/polars/io/csv.rb +27 -5
data/lib/polars/io/database.rb +2 -3
data/lib/polars/io/ipc.rb +2 -2
data/lib/polars/io/lines.rb +172 -0
data/lib/polars/io/parquet.rb +1 -1
data/lib/polars/io/sink_options.rb +5 -2
data/lib/polars/lazy_frame.rb +517 -14
data/lib/polars/list_expr.rb +21 -7
data/lib/polars/list_name_space.rb +16 -2
data/lib/polars/query_opt_flags.rb +23 -5
data/lib/polars/selectors.rb +2 -2
data/lib/polars/series.rb +176 -19
data/lib/polars/sql_context.rb +2 -2
data/lib/polars/string_cache.rb +19 -72
data/lib/polars/string_expr.rb +1 -7
data/lib/polars/string_name_space.rb +1 -7
data/lib/polars/utils/construction/series.rb +24 -39
data/lib/polars/utils/convert.rb +16 -6
data/lib/polars/utils/parse.rb +7 -0
data/lib/polars/utils/reduce_balanced.rb +43 -0
data/lib/polars/utils/various.rb +5 -0
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +2 -1
metadata +4 -17
data/ext/polars/src/functions/string_cache.rs +0 -24

data/lib/polars/data_frame.rb CHANGED Viewed

@@ -3808,6 +3808,57 @@ module Polars
       .collect(optimizations: QueryOptFlags._eager)
     end
+    # Selects rows from this DataFrame at the given indices.
+    #
+    # @note
+    #   This functionality is experimental. It may be
+    #   changed at any point without it being considered a breaking change.
+    #
+    # @param indices [Object]
+    #   The indices of the rows to select.
+    # @param null_on_oob [Boolean]
+    #   If true, a null row is generated when an index is out-of-bounds
+    #   instead of raising an error.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"x" => [2, 1, 0], "s" => ["foo", "bar", "baz"]})
+    #   df.gather([2, 0, 0])
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌─────┬─────┐
+    #   # │ x   ┆ s   │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ str │
+    #   # ╞═════╪═════╡
+    #   # │ 0   ┆ baz │
+    #   # │ 2   ┆ foo │
+    #   # │ 2   ┆ foo │
+    #   # └─────┴─────┘
+    #
+    # @example
+    #   df.gather([0, 10, 1], null_on_oob: true)
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌──────┬──────┐
+    #   # │ x    ┆ s    │
+    #   # │ ---  ┆ ---  │
+    #   # │ i64  ┆ str  │
+    #   # ╞══════╪══════╡
+    #   # │ 2    ┆ foo  │
+    #   # │ null ┆ null │
+    #   # │ 1    ┆ bar  │
+    #   # └──────┴──────┘
+    def gather(
+      indices,
+      null_on_oob: false
+    )
+      lazy
+      .gather(indices, null_on_oob: null_on_oob)
+      .collect(optimizations: QueryOptFlags._eager)
+    end
     # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
     #
     # The UDF will receive each row as a tuple of values: `udf(row)`.
@@ -4527,6 +4578,8 @@ module Polars
     # @param separator [String]
     #   Used as separator/delimiter in generated column names in case of multiple
     #   `values` columns.
+    # @param column_naming ['auto', 'combine']
+    #   How resulting column names will be constructed.
     #
     # @return [DataFrame]
     #
@@ -4557,7 +4610,8 @@ module Polars
       aggregate_function: nil,
       maintain_order: true,
       sort_columns: false,
-      separator: "_"
+      separator: "_",
+      column_naming: "auto"
     )
       if on_columns.nil?
         cols = select(on).unique(maintain_order: true)
@@ -4577,7 +4631,8 @@ module Polars
         values: values,
         aggregate_function: aggregate_function,
         maintain_order: maintain_order,
-        separator: separator
+        separator: separator,
+        column_naming: column_naming
       )
       .collect(optimizations: QueryOptFlags._eager)
     end
@@ -4593,7 +4648,8 @@ module Polars
     #
     # @param on [Object]
     #   Column(s) or selector(s) to use as values variables; if `on`
-    #   is empty all columns that are not in `index` will be used.
+    #   is empty no columns will be used. If set to `nil` (default)
+    #   all columns that are not in `index` will be used.
     # @param index [Object]
     #   Column(s) or selector(s) to use as identifier variables.
     # @param variable_name [Object]
@@ -4627,7 +4683,7 @@ module Polars
     #   # │ z   ┆ c        ┆ 6     │
     #   # └─────┴──────────┴───────┘
     def unpivot(on = nil, index: nil, variable_name: nil, value_name: nil)
-      on = on.nil? ? [] : Utils._expand_selectors(self, on)
+      on = on.nil? ? nil : Utils._expand_selectors(self, on)
       index = index.nil? ? [] : Utils._expand_selectors(self, index)
       _from_rbdf(_df.unpivot(on, index, value_name, variable_name))
@@ -6448,7 +6504,7 @@ module Polars
     #   #         {5,"five"}
     #   # ]
     def to_struct(name = "")
-      Utils.wrap_s(_df.to_struct(name))
+      Utils.wrap_s(_df.to_struct(name, []))
     end
     # Decompose a struct into its fields.
@@ -6488,7 +6544,7 @@ module Polars
     #   # │ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
     #   # │ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
     #   # └────────┴─────┴─────┴──────┴───────────┴───────┘
-    def unnest(columns, *more_columns, separator: nil)
+    def unnest(columns = nil, *more_columns, separator: nil)
       lazy.unnest(columns, *more_columns, separator: separator).collect(optimizations: QueryOptFlags._eager)
     end
@@ -6504,6 +6560,10 @@ module Polars
     #   Other DataFrame that must be merged
     # @param key [String]
     #   Key that is sorted.
+    # @param maintain_order [Boolean]
+    #   If `true`, the output is guaranteed to have left-biased ordering
+    #   for equal keys: rows from the left frame appear before rows from
+    #   the right frame when their keys are equal.
     #
     # @return [DataFrame]
     #
@@ -6530,8 +6590,8 @@ module Polars
     #   # │ steve  ┆ 42  │
     #   # │ elise  ┆ 44  │
     #   # └────────┴─────┘
-    def merge_sorted(other, key)
-      lazy.merge_sorted(other.lazy, key).collect(optimizations: QueryOptFlags._eager)
+    def merge_sorted(other, key, maintain_order: false)
+      lazy.merge_sorted(other.lazy, key, maintain_order: maintain_order).collect(optimizations: QueryOptFlags._eager)
     end
     # Flag a column as sorted.
@@ -6545,14 +6605,17 @@ module Polars
     #   Column that is sorted.
     # @param descending [Boolean]
     #   Whether the column is sorted in descending order.
+    # @param nulls_last [Boolean]
+    #   Whether the nulls are at the end.
     #
     # @return [DataFrame]
     def set_sorted(
       column,
-      descending: false
+      descending: false,
+      nulls_last: false
     )
       lazy
-        .set_sorted(column, descending: descending)
+        .set_sorted(column, descending: descending, nulls_last: nulls_last)
         .collect(optimizations: QueryOptFlags._eager)
     end
@@ -6687,6 +6750,76 @@ module Polars
       .collect(optimizations: QueryOptFlags._eager)
     end
+    # Match or evolve the schema of a DataFrame into a specific schema.
+    #
+    # By default, match_to_schema returns an error if the input schema does not
+    # exactly match the target schema. It also allows columns to be freely reordered,
+    # with additional coercion rules available through optional parameters.
+    #
+    # @note
+    #   This functionality is considered **unstable**. It may be changed
+    #   at any point without it being considered a breaking change.
+    #
+    # @param schema [Object]
+    #   Target schema to match or evolve to.
+    # @param missing_columns [Object]
+    #   Raise or insert missing columns from the input with respect to the `schema`.
+    #
+    #   This can also be an expression per column with what to insert if it is
+    #   missing.
+    # @param missing_struct_fields [Object]
+    #   Raise or insert missing struct fields from the input with respect to the
+    #   `schema`.
+    # @param extra_columns [Object]
+    #   Raise or ignore extra columns from the input with respect to the `schema`.
+    # @param extra_struct_fields [Object]
+    #   Raise or ignore extra struct fields from the input with respect to the
+    #   `schema`.
+    # @param integer_cast [Object]
+    #   Forbid or upcast for integer columns from the input to the respective column
+    #   in `schema`.
+    # @param float_cast [Object]
+    #   Forbid or upcast for float columns from the input to the respective column
+    #   in `schema`.
+    #
+    # @return [DataFrame]
+    #
+    # @example Ensuring the schema matches
+    #   df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["A", "B", "C"]})
+    #   df.match_to_schema({"a" => Polars::Int64, "b" => Polars::String})
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌─────┬─────┐
+    #   # │ a   ┆ b   │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ str │
+    #   # ╞═════╪═════╡
+    #   # │ 1   ┆ A   │
+    #   # │ 2   ┆ B   │
+    #   # │ 3   ┆ C   │
+    #   # └─────┴─────┘
+    def match_to_schema(
+      schema,
+      missing_columns: "raise",
+      missing_struct_fields: "raise",
+      extra_columns: "raise",
+      extra_struct_fields: "raise",
+      integer_cast: "forbid",
+      float_cast: "forbid"
+    )
+      lazy
+      .match_to_schema(
+        schema,
+        missing_columns: missing_columns,
+        missing_struct_fields: missing_struct_fields,
+        extra_columns: extra_columns,
+        extra_struct_fields: extra_struct_fields,
+        integer_cast: integer_cast,
+        float_cast: float_cast
+      )
+      .collect(optimizations: QueryOptFlags._eager)
+    end
     private
     def initialize_copy(other)
@@ -7041,7 +7174,7 @@ module Polars
     end
     def _select_rows_by_slice(df, key)
-      return Slice.new(df).apply(key)
+      Slice.new(df).apply(key)
     end
     def _select_rows_by_index(df, key)

data/lib/polars/data_type_group.rb CHANGED Viewed

@@ -3,6 +3,7 @@ module Polars
   class DataTypeGroup < Set
   end
+  # @private
   SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
     [
       Int8,
@@ -11,6 +12,7 @@ module Polars
       Int64
     ]
   )
+  # @private
   UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
     [
       UInt8,
@@ -19,10 +21,13 @@ module Polars
       UInt64
     ]
   )
+  # @private
   INTEGER_DTYPES = (
     SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
   )
+  # @private
   FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
+  # @private
   NUMERIC_DTYPES = DataTypeGroup.new(
     FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
   )

data/lib/polars/date_time_expr.rb CHANGED Viewed

@@ -22,6 +22,8 @@ module Polars
     #   Which days of the week to count. The default is Monday to Friday.
     #   If you wanted to count only Monday to Thursday, you would pass
     #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Object]
+    #   Holidays to exclude from the count.
     # @param roll
     #   What to do when the start date lands on a non-business day. Options are:
     #
@@ -44,17 +46,67 @@ module Polars
     #   # │ 2020-01-01 ┆ 2020-01-08 │
     #   # │ 2020-01-02 ┆ 2020-01-09 │
     #   # └────────────┴────────────┘
+    #
+    # @example You can pass a custom weekend - for example, if you only take Sunday off:
+    #   week_mask = [true, true, true, true, true, true, false]
+    #   df.with_columns(
+    #     result: Polars.col("start").dt.add_business_days(5, week_mask: week_mask)
+    #   )
+    #   # =>
+    #   # shape: (2, 2)
+    #   # ┌────────────┬────────────┐
+    #   # │ start      ┆ result     │
+    #   # │ ---        ┆ ---        │
+    #   # │ date       ┆ date       │
+    #   # ╞════════════╪════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-07 │
+    #   # │ 2020-01-02 ┆ 2020-01-08 │
+    #   # └────────────┴────────────┘
+    #
+    # @example You can also pass a list of holidays:
+    #   holidays = [Date.new(2020, 1, 3), Date.new(2020, 1, 6)]
+    #   df.with_columns(
+    #     result: Polars.col("start").dt.add_business_days(5, holidays: holidays)
+    #   )
+    #   # =>
+    #   # shape: (2, 2)
+    #   # ┌────────────┬────────────┐
+    #   # │ start      ┆ result     │
+    #   # │ ---        ┆ ---        │
+    #   # │ date       ┆ date       │
+    #   # ╞════════════╪════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-10 │
+    #   # │ 2020-01-02 ┆ 2020-01-13 │
+    #   # └────────────┴────────────┘
+    #
+    # @example Roll all dates forwards to the next business day:
+    #   df = Polars::DataFrame.new({"start" => [Date.new(2020, 1, 5), Date.new(2020, 1, 6)]})
+    #   df.with_columns(
+    #     rolled_forwards: Polars.col("start").dt.add_business_days(0, roll: "forward")
+    #   )
+    #   # =>
+    #   # shape: (2, 2)
+    #   # ┌────────────┬─────────────────┐
+    #   # │ start      ┆ rolled_forwards │
+    #   # │ ---        ┆ ---             │
+    #   # │ date       ┆ date            │
+    #   # ╞════════════╪═════════════════╡
+    #   # │ 2020-01-05 ┆ 2020-01-06      │
+    #   # │ 2020-01-06 ┆ 2020-01-06      │
+    #   # └────────────┴─────────────────┘
     def add_business_days(
       n,
       week_mask: [true, true, true, true, true, false, false],
+      holidays: [],
       roll: "raise"
     )
       n_rbexpr = Utils.parse_into_expression(n)
+      holidays_rbexpr = Utils._holidays_to_expr(holidays)
       Utils.wrap_expr(
         _rbexpr.dt_add_business_days(
           n_rbexpr,
           week_mask,
-          [],
+          holidays_rbexpr,
           roll
         )
       )
@@ -580,6 +632,8 @@ module Polars
     #   Which days of the week to count. The default is Monday to Friday.
     #   If you wanted to count only Monday to Thursday, you would pass
     #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Object]
+    #   Holidays to exclude from the count.
     #
     # @return [Expr]
     #
@@ -596,13 +650,47 @@ module Polars
     #   # │ 2020-01-03 ┆ true            │
     #   # │ 2020-01-05 ┆ false           │
     #   # └────────────┴─────────────────┘
+    #
+    # @example You can pass a custom weekend - for example, if you only take Sunday off:
+    #   week_mask = [true, true, true, true, true, true, false]
+    #   df.with_columns(
+    #     is_business_day: Polars.col("start").dt.is_business_day(week_mask: week_mask)
+    #   )
+    #   # =>
+    #   # shape: (2, 2)
+    #   # ┌────────────┬─────────────────┐
+    #   # │ start      ┆ is_business_day │
+    #   # │ ---        ┆ ---             │
+    #   # │ date       ┆ bool            │
+    #   # ╞════════════╪═════════════════╡
+    #   # │ 2020-01-03 ┆ true            │
+    #   # │ 2020-01-05 ┆ false           │
+    #   # └────────────┴─────────────────┘
+    #
+    # @example You can also pass a list of holidays:
+    #   holidays = [Date.new(2020, 1, 3), Date.new(2020, 1, 6)]
+    #   df.with_columns(
+    #     is_business_day: Polars.col("start").dt.is_business_day(holidays: holidays)
+    #   )
+    #   # =>
+    #   # shape: (2, 2)
+    #   # ┌────────────┬─────────────────┐
+    #   # │ start      ┆ is_business_day │
+    #   # │ ---        ┆ ---             │
+    #   # │ date       ┆ bool            │
+    #   # ╞════════════╪═════════════════╡
+    #   # │ 2020-01-03 ┆ false           │
+    #   # │ 2020-01-05 ┆ false           │
+    #   # └────────────┴─────────────────┘
     def is_business_day(
-      week_mask: [true, true, true, true, true, false, false]
+      week_mask: [true, true, true, true, true, false, false],
+      holidays: []
     )
+      holidays_rbexpr = Utils._holidays_to_expr(holidays)
       Utils.wrap_expr(
         _rbexpr.dt_is_business_day(
           week_mask,
-          []
+          holidays_rbexpr
         )
       )
     end

data/lib/polars/date_time_name_space.rb CHANGED Viewed

@@ -31,6 +31,8 @@ module Polars
     #   Which days of the week to count. The default is Monday to Friday.
     #   If you wanted to count only Monday to Thursday, you would pass
     #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Object]
+    #   Holidays to exclude from the count.
     # @param roll
     #   What to do when the start date lands on a non-business day. Options are:
     #
@@ -75,6 +77,7 @@ module Polars
     def add_business_days(
       n,
       week_mask: [true, true, true, true, true, false, false],
+      holidays: [],
       roll: "raise"
     )
       super
@@ -263,6 +266,8 @@ module Polars
     #   Which days of the week to count. The default is Monday to Friday.
     #   If you wanted to count only Monday to Thursday, you would pass
     #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Object]
+    #   Holidays to exclude from the count.
     #
     # @return [Series]
     #
@@ -288,7 +293,8 @@ module Polars
     #   #         false
     #   # ]
     def is_business_day(
-      week_mask: [true, true, true, true, true, false, false]
+      week_mask: [true, true, true, true, true, false, false],
+      holidays: []
     )
       super
     end