RubyGems - polars-df - Versions diffs - 0.11.0 → 0.12.0 - Mend

polars-df 0.11.0 → 0.12.0

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (79)

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/Cargo.lock +360 -361
data/ext/polars/Cargo.toml +10 -7
data/ext/polars/src/batched_csv.rs +1 -1
data/ext/polars/src/conversion/any_value.rs +261 -0
data/ext/polars/src/conversion/chunked_array.rs +4 -4
data/ext/polars/src/conversion/mod.rs +51 -10
data/ext/polars/src/dataframe/construction.rs +6 -8
data/ext/polars/src/dataframe/general.rs +19 -29
data/ext/polars/src/dataframe/io.rs +43 -33
data/ext/polars/src/error.rs +26 -4
data/ext/polars/src/expr/categorical.rs +0 -10
data/ext/polars/src/expr/datetime.rs +4 -12
data/ext/polars/src/expr/general.rs +123 -110
data/ext/polars/src/expr/mod.rs +2 -2
data/ext/polars/src/expr/rolling.rs +17 -9
data/ext/polars/src/expr/string.rs +2 -6
data/ext/polars/src/functions/eager.rs +10 -10
data/ext/polars/src/functions/lazy.rs +21 -21
data/ext/polars/src/functions/range.rs +6 -12
data/ext/polars/src/interop/numo/to_numo_series.rs +2 -1
data/ext/polars/src/lazyframe/mod.rs +81 -98
data/ext/polars/src/lib.rs +55 -45
data/ext/polars/src/map/dataframe.rs +2 -2
data/ext/polars/src/rb_modules.rs +25 -1
data/ext/polars/src/series/aggregation.rs +4 -2
data/ext/polars/src/series/arithmetic.rs +21 -11
data/ext/polars/src/series/construction.rs +56 -38
data/ext/polars/src/series/export.rs +1 -1
data/ext/polars/src/series/mod.rs +31 -10
data/ext/polars/src/sql.rs +3 -1
data/lib/polars/array_expr.rb +4 -4
data/lib/polars/batched_csv_reader.rb +2 -2
data/lib/polars/cat_expr.rb +0 -36
data/lib/polars/cat_name_space.rb +0 -37
data/lib/polars/data_frame.rb +93 -101
data/lib/polars/data_types.rb +1 -1
data/lib/polars/date_time_expr.rb +525 -573
data/lib/polars/date_time_name_space.rb +263 -464
data/lib/polars/dynamic_group_by.rb +3 -3
data/lib/polars/exceptions.rb +3 -0
data/lib/polars/expr.rb +367 -330
data/lib/polars/expr_dispatch.rb +1 -1
data/lib/polars/functions/aggregation/horizontal.rb +8 -8
data/lib/polars/functions/as_datatype.rb +63 -40
data/lib/polars/functions/lazy.rb +63 -14
data/lib/polars/functions/lit.rb +1 -1
data/lib/polars/functions/range/date_range.rb +18 -77
data/lib/polars/functions/range/datetime_range.rb +4 -4
data/lib/polars/functions/range/int_range.rb +2 -2
data/lib/polars/functions/range/time_range.rb +4 -4
data/lib/polars/functions/repeat.rb +1 -1
data/lib/polars/functions/whenthen.rb +1 -1
data/lib/polars/io/csv.rb +8 -8
data/lib/polars/io/ipc.rb +3 -3
data/lib/polars/io/json.rb +13 -2
data/lib/polars/io/ndjson.rb +15 -4
data/lib/polars/io/parquet.rb +5 -4
data/lib/polars/lazy_frame.rb +120 -106
data/lib/polars/lazy_group_by.rb +1 -1
data/lib/polars/list_expr.rb +11 -11
data/lib/polars/list_name_space.rb +5 -1
data/lib/polars/rolling_group_by.rb +5 -7
data/lib/polars/series.rb +105 -189
data/lib/polars/string_expr.rb +42 -67
data/lib/polars/string_name_space.rb +5 -4
data/lib/polars/testing.rb +2 -2
data/lib/polars/utils/constants.rb +9 -0
data/lib/polars/utils/convert.rb +97 -0
data/lib/polars/utils/parse.rb +89 -0
data/lib/polars/utils/various.rb +76 -0
data/lib/polars/utils/wrap.rb +19 -0
data/lib/polars/utils.rb +4 -330
data/lib/polars/version.rb +1 -1
data/lib/polars/whenthen.rb +6 -6
data/lib/polars.rb +11 -0
metadata +9 -4
data/ext/polars/src/conversion/anyvalue.rs +0 -186

data/lib/polars/batched_csv_reader.rb CHANGED Viewed

@@ -42,7 +42,7 @@ module Polars
       if !dtypes.nil?
         if dtypes.is_a?(Hash)
           dtype_list = []
-          dtypes.each do|k, v|
+          dtypes.each do |k, v|
             dtype_list << [k, Utils.rb_type_to_dtype(v)]
           end
         elsif dtypes.is_a?(::Array)
@@ -78,7 +78,7 @@ module Polars
         missing_utf8_is_empty_string,
         parse_dates,
         skip_rows_after_header,
-        Utils._prepare_row_count_args(row_count_name, row_count_offset),
+        Utils.parse_row_index_args(row_count_name, row_count_offset),
         sample_size,
         eol_char,
         raise_if_empty,

data/lib/polars/cat_expr.rb CHANGED Viewed

@@ -9,42 +9,6 @@ module Polars
       self._rbexpr = expr._rbexpr
     end
-    # Determine how this categorical series should be sorted.
-    #
-    # @param ordering ["physical", "lexical"]
-    #   Ordering type:
-    #
-    #   - 'physical' -> Use the physical representation of the categories to determine the order (default).
-    #   - 'lexical' -> Use the string values to determine the ordering.
-    #
-    # @return [Expr]
-    #
-    # @example
-    #   df = Polars::DataFrame.new(
-    #     {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
-    #   ).with_columns(
-    #     [
-    #       Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
-    #     ]
-    #   )
-    #   df.sort(["cats", "vals"])
-    #   # =>
-    #   # shape: (5, 2)
-    #   # ┌──────┬──────┐
-    #   # │ cats ┆ vals │
-    #   # │ ---  ┆ ---  │
-    #   # │ cat  ┆ i64  │
-    #   # ╞══════╪══════╡
-    #   # │ a    ┆ 2    │
-    #   # │ b    ┆ 3    │
-    #   # │ k    ┆ 2    │
-    #   # │ z    ┆ 1    │
-    #   # │ z    ┆ 3    │
-    #   # └──────┴──────┘
-    def set_ordering(ordering)
-      Utils.wrap_expr(_rbexpr.cat_set_ordering(ordering))
-    end
     # Get the categories stored in this data type.
     #
     # @return [Expr]

data/lib/polars/cat_name_space.rb CHANGED Viewed

@@ -10,43 +10,6 @@ module Polars
       self._s = series._s
     end
-    # Determine how this categorical series should be sorted.
-    #
-    # @param ordering ["physical", "lexical"]
-    #   Ordering type:
-    #
-    #   - 'physical' -> Use the physical representation of the categories to
-    #       determine the order (default).
-    #   - 'lexical' -> Use the string values to determine the ordering.
-    #
-    # @return [Series]
-    #
-    # @example
-    #   df = Polars::DataFrame.new(
-    #     {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
-    #   ).with_columns(
-    #     [
-    #       Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
-    #     ]
-    #   )
-    #   df.sort(["cats", "vals"])
-    #   # =>
-    #   # shape: (5, 2)
-    #   # ┌──────┬──────┐
-    #   # │ cats ┆ vals │
-    #   # │ ---  ┆ ---  │
-    #   # │ cat  ┆ i64  │
-    #   # ╞══════╪══════╡
-    #   # │ a    ┆ 2    │
-    #   # │ b    ┆ 3    │
-    #   # │ k    ┆ 2    │
-    #   # │ z    ┆ 1    │
-    #   # │ z    ┆ 3    │
-    #   # └──────┴──────┘
-    def set_ordering(ordering)
-      super
-    end
     # Get the categories stored in this data type.
     #
     # @return [Series]

data/lib/polars/data_frame.rb CHANGED Viewed

@@ -622,7 +622,7 @@ module Polars
     #       "bar" => [6, 7, 8]
     #     }
     #   )
-    #   df.write_ndjson()
+    #   df.write_ndjson
     #   # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
     def write_ndjson(file = nil)
       if Utils.pathlike?(file)
@@ -883,6 +883,24 @@ module Polars
         file = Utils.normalize_filepath(file)
       end
+      if statistics == true
+        statistics = {
+          min: true,
+          max: true,
+          distinct_count: false,
+          null_count: true
+        }
+      elsif statistics == false
+        statistics = {}
+      elsif statistics == "full"
+        statistics = {
+          min: true,
+          max: true,
+          distinct_count: true,
+          null_count: true
+        }
+      end
       _df.write_parquet(
         file, compression, compression_level, statistics, row_group_size, data_page_size
       )
@@ -1724,12 +1742,6 @@ module Polars
     #   Define whether the temporal window interval is closed or not.
     # @param by [Object]
     #   Also group by this column/these columns.
-    # @param check_sorted [Boolean]
-    #   When the `by` argument is given, polars can not check sortedness
-    #   by the metadata and has to do a full scan on the index column to
-    #   verify data is sorted. This is expensive. If you are sure the
-    #   data within the by groups is sorted, you can set this to `false`.
-    #   Doing so incorrectly will lead to incorrect output
     #
     # @return [RollingGroupBy]
     #
@@ -1745,7 +1757,7 @@ module Polars
     #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
     #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
-    #   df.group_by_rolling(index_column: "dt", period: "2d").agg(
+    #   df.rolling(index_column: "dt", period: "2d").agg(
     #     [
     #       Polars.sum("a").alias("sum_a"),
     #       Polars.min("a").alias("min_a"),
@@ -1766,17 +1778,17 @@ module Polars
     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
     #   # └─────────────────────┴───────┴───────┴───────┘
-    def group_by_rolling(
+    def rolling(
       index_column:,
       period:,
       offset: nil,
       closed: "right",
-      by: nil,
-      check_sorted: true
+      by: nil
     )
-      RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
+      RollingGroupBy.new(self, index_column, period, offset, closed, by)
     end
-    alias_method :groupby_rolling, :group_by_rolling
+    alias_method :groupby_rolling, :rolling
+    alias_method :group_by_rolling, :rolling
     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
@@ -1846,10 +1858,12 @@ module Polars
     # @example
     #   df = Polars::DataFrame.new(
     #     {
-    #       "time" => Polars.date_range(
+    #       "time" => Polars.datetime_range(
     #         DateTime.new(2021, 12, 16),
     #         DateTime.new(2021, 12, 16, 3),
-    #         "30m"
+    #         "30m",
+    #         time_unit: "us",
+    #         eager: true
     #       ),
     #       "n" => 0..6
     #     }
@@ -1948,10 +1962,12 @@ module Polars
     # @example Dynamic group bys can also be combined with grouping on normal keys.
     #   df = Polars::DataFrame.new(
     #     {
-    #       "time" => Polars.date_range(
+    #       "time" => Polars.datetime_range(
     #         DateTime.new(2021, 12, 16),
     #         DateTime.new(2021, 12, 16, 3),
-    #         "30m"
+    #         "30m",
+    #         time_unit: "us",
+    #         eager: true
     #       ),
     #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
     #     }
@@ -2038,8 +2054,6 @@ module Polars
     #   Note that this column has to be sorted for the output to make sense.
     # @param every [String]
     #   interval will start 'every' duration
-    # @param offset [String]
-    #   change the start of the date_range by this offset.
     # @param by [Object]
     #   First group by these columns and then upsample for every group
     # @param maintain_order [Boolean]
@@ -2099,7 +2113,6 @@ module Polars
     def upsample(
       time_column:,
       every:,
-      offset: nil,
       by: nil,
       maintain_order: false
     )
@@ -2109,15 +2122,11 @@ module Polars
       if by.is_a?(::String)
         by = [by]
       end
-      if offset.nil?
-        offset = "0ns"
-      end
-      every = Utils._timedelta_to_pl_duration(every)
-      offset = Utils._timedelta_to_pl_duration(offset)
+      every = Utils.parse_as_duration_string(every)
       _from_rbdf(
-        _df.upsample(by, time_column, every, offset, maintain_order)
+        _df.upsample(by, time_column, every, maintain_order)
       )
     end
@@ -2264,7 +2273,7 @@ module Polars
     #   Name(s) of the right join column(s).
     # @param on [Object]
     #   Name(s) of the join columns in both DataFrames.
-    # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
+    # @param how ["inner", "left", "full", "semi", "anti", "cross"]
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
@@ -2300,7 +2309,7 @@ module Polars
     #   # └─────┴─────┴─────┴───────┘
     #
     # @example
-    #   df.join(other_df, on: "ham", how: "outer")
+    #   df.join(other_df, on: "ham", how: "full")
     #   # =>
     #   # shape: (4, 5)
     #   # ┌──────┬──────┬──────┬───────┬───────────┐
@@ -2957,9 +2966,9 @@ module Polars
     #   arguments contains multiple columns as well
     # @param index [Object]
     #   One or multiple keys to group by
-    # @param columns [Object]
+    # @param on [Object]
     #   Columns whose values will be used as the header of the output DataFrame
-    # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
+    # @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "count"]
     #   A predefined aggregate function str or an expression.
     # @param maintain_order [Object]
     #   Sort the grouped keys so that the output order is predictable.
@@ -2971,66 +2980,62 @@ module Polars
     # @example
     #   df = Polars::DataFrame.new(
     #     {
-    #       "foo" => ["one", "one", "one", "two", "two", "two"],
-    #       "bar" => ["A", "B", "C", "A", "B", "C"],
+    #       "foo" => ["one", "one", "two", "two", "one", "two"],
+    #       "bar" => ["y", "y", "y", "x", "x", "x"],
     #       "baz" => [1, 2, 3, 4, 5, 6]
     #     }
     #   )
-    #   df.pivot(values: "baz", index: "foo", columns: "bar")
+    #   df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")
     #   # =>
-    #   # shape: (2, 4)
-    #   # ┌─────┬─────┬─────┬─────┐
-    #   # │ foo ┆ A   ┆ B   ┆ C   │
-    #   # │ --- ┆ --- ┆ --- ┆ --- │
-    #   # │ str ┆ i64 ┆ i64 ┆ i64 │
-    #   # ╞═════╪═════╪═════╪═════╡
-    #   # │ one ┆ 1   ┆ 2   ┆ 3   │
-    #   # │ two ┆ 4   ┆ 5   ┆ 6   │
-    #   # └─────┴─────┴─────┴─────┘
+    #   # shape: (2, 3)
+    #   # ┌─────┬─────┬─────┐
+    #   # │ foo ┆ y   ┆ x   │
+    #   # │ --- ┆ --- ┆ --- │
+    #   # │ str ┆ i64 ┆ i64 │
+    #   # ╞═════╪═════╪═════╡
+    #   # │ one ┆ 3   ┆ 5   │
+    #   # │ two ┆ 3   ┆ 10  │
+    #   # └─────┴─────┴─────┘
     def pivot(
-      values:,
-      index:,
-      columns:,
-      aggregate_fn: "first",
+      on,
+      index: nil,
+      values: nil,
+      aggregate_function: nil,
       maintain_order: true,
       sort_columns: false,
       separator: "_"
     )
-      if values.is_a?(::String)
-        values = [values]
-      end
-      if index.is_a?(::String)
-        index = [index]
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
+      index = Utils._expand_selectors(self, index)
+      on = Utils._expand_selectors(self, on)
+      if !values.nil?
+        values = Utils._expand_selectors(self, values)
       end
-      if aggregate_fn.is_a?(::String)
-        case aggregate_fn
+      if aggregate_function.is_a?(::String)
+        case aggregate_function
         when "first"
-          aggregate_expr = Polars.element.first._rbexpr
+          aggregate_expr = F.element.first._rbexpr
         when "sum"
-          aggregate_expr = Polars.element.sum._rbexpr
+          aggregate_expr = F.element.sum._rbexpr
         when "max"
-          aggregate_expr = Polars.element.max._rbexpr
+          aggregate_expr = F.element.max._rbexpr
         when "min"
-          aggregate_expr = Polars.element.min._rbexpr
+          aggregate_expr = F.element.min._rbexpr
         when "mean"
-          aggregate_expr = Polars.element.mean._rbexpr
+          aggregate_expr = F.element.mean._rbexpr
         when "median"
-          aggregate_expr = Polars.element.median._rbexpr
+          aggregate_expr = F.element.median._rbexpr
         when "last"
-          aggregate_expr = Polars.element.last._rbexpr
+          aggregate_expr = F.element.last._rbexpr
         when "len"
-          aggregate_expr = Polars.len._rbexpr
+          aggregate_expr = F.len._rbexpr
         when "count"
           warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
-          aggregate_expr = Polars.len._rbexpr
+          aggregate_expr = F.len._rbexpr
         else
           raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
         end
-      elsif aggregate_fn.nil?
+      elsif aggregate_function.nil?
         aggregate_expr = nil
       else
         aggregate_expr = aggregate_function._rbexpr
@@ -3038,8 +3043,8 @@ module Polars
       _from_rbdf(
         _df.pivot_expr(
+          on,
           index,
-          columns,
           values,
           maintain_order,
           sort_columns,
@@ -3054,18 +3059,18 @@ module Polars
     # Optionally leaves identifiers set.
     #
     # This function is useful to massage a DataFrame into a format where one or more
-    # columns are identifier variables (id_vars), while all other columns, considered
-    # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+    # columns are identifier variables (index) while all other columns, considered
+    # measured variables (on), are "unpivoted" to the row axis leaving just
     # two non-identifier columns, 'variable' and 'value'.
     #
-    # @param id_vars [Object]
-    #   Columns to use as identifier variables.
-    # @param value_vars [Object]
-    #   Values to use as identifier variables.
-    #   If `value_vars` is empty all columns that are not in `id_vars` will be used.
-    # @param variable_name [String]
-    #   Name to give to the `value` column. Defaults to "variable"
-    # @param value_name [String]
+    # @param on [Object]
+    #   Column(s) or selector(s) to use as values variables; if `on`
+    #   is empty all columns that are not in `index` will be used.
+    # @param index [Object]
+    #   Column(s) or selector(s) to use as identifier variables.
+    # @param variable_name [Object]
+    #   Name to give to the `variable` column. Defaults to "variable"
+    # @param value_name [Object]
     #   Name to give to the `value` column. Defaults to "value"
     #
     # @return [DataFrame]
@@ -3078,7 +3083,7 @@ module Polars
     #       "c" => [2, 4, 6]
     #     }
     #   )
-    #   df.melt(id_vars: "a", value_vars: ["b", "c"])
+    #   df.unpivot(Polars::Selectors.numeric, index: "a")
     #   # =>
     #   # shape: (6, 3)
     #   # ┌─────┬──────────┬───────┐
@@ -3093,23 +3098,13 @@ module Polars
     #   # │ y   ┆ c        ┆ 4     │
     #   # │ z   ┆ c        ┆ 6     │
     #   # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
-      if value_vars.is_a?(::String)
-        value_vars = [value_vars]
-      end
-      if id_vars.is_a?(::String)
-        id_vars = [id_vars]
-      end
-      if value_vars.nil?
-        value_vars = []
-      end
-      if id_vars.nil?
-        id_vars = []
-      end
-      _from_rbdf(
-        _df.melt(id_vars, value_vars, value_name, variable_name)
-      )
+    def unpivot(on, index: nil, variable_name: nil, value_name: nil)
+      on = on.nil? ? [] : Utils._expand_selectors(self, on)
+      index = index.nil? ? [] : Utils._expand_selectors(self, index)
+      _from_rbdf(_df.unpivot(on, index, value_name, variable_name))
     end
+    alias_method :melt, :unpivot
     # Unstack a long table to a wide form without doing an aggregation.
     #
@@ -4143,7 +4138,7 @@ module Polars
       end
       if subset.is_a?(::Array) && subset.length == 1
-        expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
+        expr = Utils.wrap_expr(Utils.parse_into_expression(subset[0], str_as_lit: false))
       else
         struct_fields = subset.nil? ? Polars.all : subset
         expr = Polars.struct(struct_fields)
@@ -4561,7 +4556,7 @@ module Polars
     #   # │ 3   ┆ 7   │
     #   # └─────┴─────┘
     def gather_every(n, offset = 0)
-      select(Utils.col("*").gather_every(n, offset))
+      select(F.col("*").gather_every(n, offset))
     end
     alias_method :take_every, :gather_every
@@ -4631,7 +4626,7 @@ module Polars
     #   # │ 10.0 ┆ null ┆ 9.0      │
     #   # └──────┴──────┴──────────┘
     def interpolate
-      select(Utils.col("*").interpolate)
+      select(F.col("*").interpolate)
     end
     # Check if the dataframe is empty.
@@ -4767,19 +4762,16 @@ module Polars
     #
     # @param column [Object]
     #   Columns that are sorted
-    # @param more_columns [Object]
-    #   Additional columns that are sorted, specified as positional arguments.
     # @param descending [Boolean]
     #   Whether the columns are sorted in descending order.
     #
     # @return [DataFrame]
     def set_sorted(
       column,
-      *more_columns,
       descending: false
     )
       lazy
-        .set_sorted(column, *more_columns, descending: descending)
+        .set_sorted(column, descending: descending)
         .collect(no_optimization: true)
     end

data/lib/polars/data_types.rb CHANGED Viewed

@@ -456,7 +456,7 @@ module Polars
     end
     def to_s
-      "#{self.class.name}([#{fields.map(&:to_s).join("\n")}])"
+      "#{self.class.name}(#{fields.to_h { |f| [f.name, f.dtype] }})"
     end
     def to_schema