RubyGems - polars-df - Versions diffs - 0.8.0-x86_64-linux → 0.9.0-x86_64-linux - Mend

polars-df 0.8.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +4 -4
data/CHANGELOG.md +30 -1
data/Cargo.lock +107 -59
data/Cargo.toml +0 -3
data/LICENSE-THIRD-PARTY.txt +1726 -754
data/LICENSE.txt +1 -1
data/README.md +2 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/array_expr.rb +449 -0
data/lib/polars/array_name_space.rb +346 -0
data/lib/polars/cat_expr.rb +24 -0
data/lib/polars/cat_name_space.rb +75 -0
data/lib/polars/config.rb +2 -2
data/lib/polars/data_frame.rb +179 -43
data/lib/polars/data_types.rb +191 -28
data/lib/polars/date_time_expr.rb +31 -14
data/lib/polars/exceptions.rb +12 -1
data/lib/polars/expr.rb +866 -186
data/lib/polars/functions/aggregation/horizontal.rb +246 -0
data/lib/polars/functions/aggregation/vertical.rb +282 -0
data/lib/polars/functions/as_datatype.rb +248 -0
data/lib/polars/functions/col.rb +47 -0
data/lib/polars/functions/eager.rb +182 -0
data/lib/polars/functions/lazy.rb +1280 -0
data/lib/polars/functions/len.rb +49 -0
data/lib/polars/functions/lit.rb +35 -0
data/lib/polars/functions/random.rb +16 -0
data/lib/polars/functions/range/date_range.rb +103 -0
data/lib/polars/functions/range/int_range.rb +51 -0
data/lib/polars/functions/repeat.rb +144 -0
data/lib/polars/functions/whenthen.rb +27 -0
data/lib/polars/functions.rb +29 -416
data/lib/polars/group_by.rb +2 -2
data/lib/polars/io.rb +18 -25
data/lib/polars/lazy_frame.rb +367 -53
data/lib/polars/list_expr.rb +152 -6
data/lib/polars/list_name_space.rb +102 -0
data/lib/polars/meta_expr.rb +175 -7
data/lib/polars/series.rb +273 -34
data/lib/polars/string_cache.rb +75 -0
data/lib/polars/string_expr.rb +412 -96
data/lib/polars/string_name_space.rb +4 -4
data/lib/polars/testing.rb +507 -0
data/lib/polars/utils.rb +52 -8
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +15 -2
metadata +33 -4
data/lib/polars/lazy_functions.rb +0 -1181

data/lib/polars/data_frame.rb CHANGED

@@ -47,8 +47,8 @@ module Polars
     end
     # @private
-    def self._from_hashes(data, infer_schema_length: 100, schema: nil)
-      rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
+    def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
+      rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
       _from_rbdf(rbdf)
     end
@@ -622,7 +622,7 @@ module Polars
         # select single column
         # df["foo"]
         if item.is_a?(::String) || item.is_a?(Symbol)
-          return Utils.wrap_s(_df.column(item.to_s))
+          return Utils.wrap_s(_df.get_column(item.to_s))
         end
         # df[idx]
@@ -1084,7 +1084,7 @@ module Polars
     #   df.estimated_size
     #   # => 25888898
     #   df.estimated_size("mb")
-    #   # => 24.689577102661133
+    #   # => 26.702880859375
     def estimated_size(unit = "b")
       sz = _df.estimated_size
       Utils.scale_bytes(sz, to: unit)
@@ -1782,7 +1782,7 @@ module Polars
     #       "b" => [2, 4, 6]
     #     }
     #   )
-    #   df.with_row_count
+    #   df.with_row_index
     #   # =>
     #   # shape: (3, 3)
     #   # ┌────────┬─────┬─────┐
@@ -1794,9 +1794,10 @@ module Polars
     #   # │ 1      ┆ 3   ┆ 4   │
     #   # │ 2      ┆ 5   ┆ 6   │
     #   # └────────┴─────┴─────┘
-    def with_row_count(name: "row_nr", offset: 0)
-      _from_rbdf(_df.with_row_count(name, offset))
+    def with_row_index(name: "row_nr", offset: 0)
+      _from_rbdf(_df.with_row_index(name, offset))
     end
+    alias_method :with_row_count, :with_row_index
     # Start a group by operation.
     #
@@ -2433,6 +2434,8 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param join_nulls [Boolean]
+    #   Join on null values. By default null values will never produce matches.
     #
     # @return [DataFrame]
     #
@@ -2515,7 +2518,7 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8.0 ┆ c   │
     #   # └─────┴─────┴─────┘
-    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
+    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
       lazy
         .join(
           other.lazy,
@@ -2524,6 +2527,7 @@ module Polars
           on: on,
           how: how,
           suffix: suffix,
+          join_nulls: join_nulls
         )
         .collect(no_optimization: true)
     end
@@ -2867,7 +2871,7 @@ module Polars
     #       "c" => [true, true, false, nil]
     #     }
     #   )
-    #   df.cleared
+    #   df.clear
     #   # =>
     #   # shape: (0, 3)
     #   # ┌─────┬─────┬──────┐
@@ -2876,9 +2880,31 @@ module Polars
     #   # │ i64 ┆ f64 ┆ bool │
     #   # ╞═════╪═════╪══════╡
     #   # └─────┴─────┴──────┘
-    def cleared
-      height > 0 ? head(0) : clone
+    #
+    # @example
+    #   df.clear(2)
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌──────┬──────┬──────┐
+    #   # │ a    ┆ b    ┆ c    │
+    #   # │ ---  ┆ ---  ┆ ---  │
+    #   # │ i64  ┆ f64  ┆ bool │
+    #   # ╞══════╪══════╪══════╡
+    #   # │ null ┆ null ┆ null │
+    #   # │ null ┆ null ┆ null │
+    #   # └──────┴──────┴──────┘
+    def clear(n = 0)
+      if n == 0
+        _from_rbdf(_df.clear)
+      elsif n > 0 || len > 0
+        self.class.new(
+          schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
+        )
+      else
+        clone
+      end
     end
+    alias_method :cleared, :clear
     # clone handled by initialize_copy
@@ -3141,8 +3167,11 @@ module Polars
           aggregate_expr = Polars.element.median._rbexpr
         when "last"
           aggregate_expr = Polars.element.last._rbexpr
+        when "len"
+          aggregate_expr = Polars.len._rbexpr
         when "count"
-          aggregate_expr = Polars.count._rbexpr
+          warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
+          aggregate_expr = Polars.len._rbexpr
         else
           raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
         end
@@ -3154,9 +3183,9 @@ module Polars
       _from_rbdf(
         _df.pivot_expr(
-          values,
           index,
           columns,
+          values,
           maintain_order,
           sort_columns,
           aggregate_expr,
@@ -3591,8 +3620,13 @@ module Polars
     # Select columns from this DataFrame.
     #
-    # @param exprs [Object]
-    #   Column or columns to select.
+    # @param exprs [Array]
+    #   Column(s) to select, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names,
+    #   other non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to select, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [DataFrame]
     #
@@ -3672,23 +3706,25 @@ module Polars
     #   # │ 0       │
     #   # │ 10      │
     #   # └─────────┘
-    def select(exprs)
-      _from_rbdf(
-        lazy
-          .select(exprs)
-          .collect(no_optimization: true, string_cache: false)
-          ._df
-      )
+    def select(*exprs, **named_exprs)
+      lazy.select(*exprs, **named_exprs).collect(_eager: true)
     end
-    # Add or overwrite multiple columns in a DataFrame.
+    # Add columns to this DataFrame.
+    #
+    # Added columns will replace existing columns with the same name.
     #
     # @param exprs [Array]
-    #   Array of Expressions that evaluate to columns.
+    #   Column(s) to add, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names, other
+    #   non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to add, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [DataFrame]
     #
-    # @example
+    # @example Pass an expression to add it as a new column.
     #   df = Polars::DataFrame.new(
     #     {
     #       "a" => [1, 2, 3, 4],
@@ -3696,11 +3732,41 @@ module Polars
     #       "c" => [true, true, false, true]
     #     }
     #   )
+    #   df.with_columns((Polars.col("a") ** 2).alias("a^2"))
+    #   # =>
+    #   # shape: (4, 4)
+    #   # ┌─────┬──────┬───────┬──────┐
+    #   # │ a   ┆ b    ┆ c     ┆ a^2  │
+    #   # │ --- ┆ ---  ┆ ---   ┆ ---  │
+    #   # │ i64 ┆ f64  ┆ bool  ┆ f64  │
+    #   # ╞═════╪══════╪═══════╪══════╡
+    #   # │ 1   ┆ 0.5  ┆ true  ┆ 1.0  │
+    #   # │ 2   ┆ 4.0  ┆ true  ┆ 4.0  │
+    #   # │ 3   ┆ 10.0 ┆ false ┆ 9.0  │
+    #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 │
+    #   # └─────┴──────┴───────┴──────┘
+    #
+    # @example Added columns will replace existing columns with the same name.
+    #   df.with_columns(Polars.col("a").cast(Polars::Float64))
+    #   # =>
+    #   # shape: (4, 3)
+    #   # ┌─────┬──────┬───────┐
+    #   # │ a   ┆ b    ┆ c     │
+    #   # │ --- ┆ ---  ┆ ---   │
+    #   # │ f64 ┆ f64  ┆ bool  │
+    #   # ╞═════╪══════╪═══════╡
+    #   # │ 1.0 ┆ 0.5  ┆ true  │
+    #   # │ 2.0 ┆ 4.0  ┆ true  │
+    #   # │ 3.0 ┆ 10.0 ┆ false │
+    #   # │ 4.0 ┆ 13.0 ┆ true  │
+    #   # └─────┴──────┴───────┘
+    #
+    # @example Multiple columns can be added by passing a list of expressions.
     #   df.with_columns(
     #     [
     #       (Polars.col("a") ** 2).alias("a^2"),
     #       (Polars.col("b") / 2).alias("b/2"),
-    #       (Polars.col("c").is_not).alias("not c")
+    #       (Polars.col("c").not_).alias("not c"),
     #     ]
     #   )
     #   # =>
@@ -3715,13 +3781,45 @@ module Polars
     #   # │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
-    def with_columns(exprs)
-      if !exprs.nil? && !exprs.is_a?(::Array)
-        exprs = [exprs]
-      end
-      lazy
-        .with_columns(exprs)
-        .collect(no_optimization: true, string_cache: false)
+    #
+    # @example Multiple columns also can be added using positional arguments instead of a list.
+    #   df.with_columns(
+    #     (Polars.col("a") ** 2).alias("a^2"),
+    #     (Polars.col("b") / 2).alias("b/2"),
+    #     (Polars.col("c").not_).alias("not c"),
+    #   )
+    #   # =>
+    #   # shape: (4, 6)
+    #   # ┌─────┬──────┬───────┬──────┬──────┬───────┐
+    #   # │ a   ┆ b    ┆ c     ┆ a^2  ┆ b/2  ┆ not c │
+    #   # │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---  ┆ ---   │
+    #   # │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ f64  ┆ bool  │
+    #   # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
+    #   # │ 1   ┆ 0.5  ┆ true  ┆ 1.0  ┆ 0.25 ┆ false │
+    #   # │ 2   ┆ 4.0  ┆ true  ┆ 4.0  ┆ 2.0  ┆ false │
+    #   # │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
+    #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
+    #   # └─────┴──────┴───────┴──────┴──────┴───────┘
+    #
+    # @example Use keyword arguments to easily name your expression inputs.
+    #   df.with_columns(
+    #     ab: Polars.col("a") * Polars.col("b"),
+    #     not_c: Polars.col("c").not_
+    #   )
+    #   # =>
+    #   # shape: (4, 5)
+    #   # ┌─────┬──────┬───────┬──────┬───────┐
+    #   # │ a   ┆ b    ┆ c     ┆ ab   ┆ not_c │
+    #   # │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---   │
+    #   # │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ bool  │
+    #   # ╞═════╪══════╪═══════╪══════╪═══════╡
+    #   # │ 1   ┆ 0.5  ┆ true  ┆ 0.5  ┆ false │
+    #   # │ 2   ┆ 4.0  ┆ true  ┆ 8.0  ┆ false │
+    #   # │ 3   ┆ 10.0 ┆ false ┆ 30.0 ┆ true  │
+    #   # │ 4   ┆ 13.0 ┆ true  ┆ 52.0 ┆ false │
+    #   # └─────┴──────┴───────┴──────┴───────┘
+    def with_columns(*exprs, **named_exprs)
+      lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
     end
     # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4363,7 +4461,7 @@ module Polars
     #   #         null
     #   # ]
     #
-    # @example A horizontal boolean or, similar to a row-wise .any():
+    # @example A horizontal boolean or, similar to a row-wise .any:
     #   df = Polars::DataFrame.new(
     #     {
     #       "a" => [false, false, true],
@@ -4486,7 +4584,7 @@ module Polars
     #   # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
     def rows(named: false)
       if named
-        columns = columns()
+        columns = self.columns
         _df.row_tuples.map do |v|
           columns.zip(v).to_h
         end
@@ -4527,7 +4625,7 @@ module Polars
       return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
       # load into the local namespace for a modest performance boost in the hot loops
-      columns = columns()
+      columns = self.columns
       # note: buffering rows results in a 2-4x speedup over individual calls
       # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4764,13 +4862,51 @@ module Polars
       _from_rbdf(_df.unnest(names))
     end
-    # TODO
+    # Requires NumPy
     # def corr
     # end
-    # TODO
-    # def merge_sorted
-    # end
+    # Take two sorted DataFrames and merge them by the sorted key.
+    #
+    # The output of this operation will also be sorted.
+    # It is the callers responsibility that the frames are sorted
+    # by that key otherwise the output will not make sense.
+    #
+    # The schemas of both DataFrames must be equal.
+    #
+    # @param other [DataFrame]
+    #   Other DataFrame that must be merged
+    # @param key [String]
+    #   Key that is sorted.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df0 = Polars::DataFrame.new(
+    #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+    #   ).sort("age")
+    #   df1 = Polars::DataFrame.new(
+    #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+    #   ).sort("age")
+    #   df0.merge_sorted(df1, "age")
+    #   # =>
+    #   # shape: (7, 2)
+    #   # ┌────────┬─────┐
+    #   # │ name   ┆ age │
+    #   # │ ---    ┆ --- │
+    #   # │ str    ┆ i64 │
+    #   # ╞════════╪═════╡
+    #   # │ bob    ┆ 18  │
+    #   # │ thomas ┆ 20  │
+    #   # │ anna   ┆ 21  │
+    #   # │ megan  ┆ 33  │
+    #   # │ steve  ┆ 42  │
+    #   # │ steve  ┆ 42  │
+    #   # │ elise  ┆ 44  │
+    #   # └────────┴─────┘
+    def merge_sorted(other, key)
+      lazy.merge_sorted(other.lazy, key).collect(_eager: true)
+    end
     # Indicate that one or multiple columns are sorted.
     #
@@ -4812,7 +4948,7 @@ module Polars
     end
     def _pos_idxs(idxs, dim)
-      idx_type = Polars._get_idx_type
+      idx_type = Plr.get_index_type
       if idxs.is_a?(Series)
         if idxs.dtype == idx_type
@@ -5045,14 +5181,14 @@ module Polars
       elsif data[0].is_a?(Hash)
         column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
+        rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
         return rbdf
       elsif data[0].is_a?(::Array)
+        first_element = data[0]
         if orient.nil? && !columns.nil?
-          first_element = data[0]
           row_types = first_element.filter_map { |value| value.class }.uniq
           if row_types.include?(Integer) && row_types.include?(Float)
             row_types.delete(Integer)