RubyGems - polars-df - Versions diffs - 0.5.0-aarch64-linux → 0.7.0-aarch64-linux - Mend

polars-df 0.5.0-aarch64-linux → 0.7.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

checksums.yaml +4 -4
data/CHANGELOG.md +26 -0
data/Cargo.lock +595 -709
data/Cargo.toml +1 -0
data/LICENSE-THIRD-PARTY.txt +3854 -4496
data/README.md +11 -9
data/lib/polars/3.0/polars.so +0 -0
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/array_expr.rb +84 -0
data/lib/polars/array_name_space.rb +77 -0
data/lib/polars/batched_csv_reader.rb +1 -1
data/lib/polars/config.rb +530 -0
data/lib/polars/data_frame.rb +206 -131
data/lib/polars/data_types.rb +163 -29
data/lib/polars/date_time_expr.rb +13 -18
data/lib/polars/date_time_name_space.rb +22 -28
data/lib/polars/dynamic_group_by.rb +2 -2
data/lib/polars/expr.rb +241 -151
data/lib/polars/functions.rb +29 -38
data/lib/polars/group_by.rb +38 -76
data/lib/polars/io.rb +37 -2
data/lib/polars/lazy_frame.rb +174 -95
data/lib/polars/lazy_functions.rb +87 -63
data/lib/polars/lazy_group_by.rb +7 -8
data/lib/polars/list_expr.rb +40 -36
data/lib/polars/list_name_space.rb +15 -15
data/lib/polars/name_expr.rb +198 -0
data/lib/polars/rolling_group_by.rb +6 -4
data/lib/polars/series.rb +95 -28
data/lib/polars/sql_context.rb +194 -0
data/lib/polars/string_expr.rb +249 -69
data/lib/polars/string_name_space.rb +155 -25
data/lib/polars/utils.rb +119 -57
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +6 -0
metadata +7 -2

data/lib/polars/data_frame.rb CHANGED

@@ -20,15 +20,9 @@ module Polars
     #   this does not yield conclusive results, column orientation is used.
     def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
       schema ||= columns
-      raise Todo if schema_overrides
-      # TODO deprecate in favor of read_sql
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
-        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
-        data = {}
-        result.columns.each_with_index do |k, i|
-          data[k] = result.rows.map { |r| r[i] }
-        end
+        raise ArgumentError, "Use read_database instead"
       end
       if data.nil?
@@ -36,7 +30,7 @@ module Polars
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
         self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
-      elsif data.is_a?(Array)
+      elsif data.is_a?(::Array)
         self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
         self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
@@ -116,7 +110,7 @@ module Polars
           dtypes.each do|k, v|
             dtype_list << [k, Utils.rb_type_to_dtype(v)]
           end
-        elsif dtypes.is_a?(Array)
+        elsif dtypes.is_a?(::Array)
           dtype_slice = dtypes
         else
           raise ArgumentError, "dtype arg should be list or dict"
@@ -590,7 +584,7 @@ module Polars
         # df[2, ..] (select row as df)
         if row_selection.is_a?(Integer)
-          if col_selection.is_a?(Array)
+          if col_selection.is_a?(::Array)
             df = self[0.., col_selection]
             return df.slice(row_selection, 1)
           end
@@ -611,7 +605,7 @@ module Polars
           return series[row_selection]
         end
-        if col_selection.is_a?(Array)
+        if col_selection.is_a?(::Array)
           # df[.., [1, 2]]
           if Utils.is_int_sequence(col_selection)
             series_list = col_selection.map { |i| to_series(i) }
@@ -641,7 +635,7 @@ module Polars
           return Slice.new(self).apply(item)
         end
-        if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
+        if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
           # select multiple columns
           # df[["foo", "bar"]]
           return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -684,13 +678,13 @@ module Polars
       end
       if Utils.strlike?(key)
-        if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
+        if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
           value = Series.new(value)
         elsif !value.is_a?(Series)
           value = Polars.lit(value)
         end
         self._df = with_column(value.alias(key.to_s))._df
-      elsif key.is_a?(Array)
+      elsif key.is_a?(::Array)
         row_selection, col_selection = key
         if Utils.strlike?(col_selection)
@@ -905,6 +899,7 @@ module Polars
     def write_csv(
       file = nil,
       has_header: true,
+      include_header: nil,
       sep: ",",
       quote: '"',
       batch_size: 1024,
@@ -914,6 +909,8 @@ module Polars
       float_precision: nil,
       null_value: nil
     )
+      include_header = has_header if include_header.nil?
       if sep.length > 1
         raise ArgumentError, "only single byte separator is allowed"
       elsif quote.length > 1
@@ -927,7 +924,7 @@ module Polars
         buffer.set_encoding(Encoding::BINARY)
         _df.write_csv(
           buffer,
-          has_header,
+          include_header,
           sep.ord,
           quote.ord,
           batch_size,
@@ -946,7 +943,7 @@ module Polars
       _df.write_csv(
         file,
-        has_header,
+        include_header,
         sep.ord,
         quote.ord,
         batch_size,
@@ -994,14 +991,21 @@ module Polars
     #
     # @return [nil]
     def write_ipc(file, compression: "uncompressed")
-      if compression.nil?
-        compression = "uncompressed"
+      return_bytes = file.nil?
+      if return_bytes
+        file = StringIO.new
+        file.set_encoding(Encoding::BINARY)
       end
       if Utils.pathlike?(file)
         file = Utils.normalise_filepath(file)
       end
+      if compression.nil?
+        compression = "uncompressed"
+      end
       _df.write_ipc(file, compression)
+      return_bytes ? file.string : nil
     end
     # Write to Apache Parquet file.
@@ -1144,22 +1148,8 @@ module Polars
     #   # │ b   ┆ 1   ┆ 2   ┆ 3   │
     #   # └─────┴─────┴─────┴─────┘
     def transpose(include_header: false, header_name: "column", column_names: nil)
-      df = _from_rbdf(_df.transpose(include_header, header_name))
-      if !column_names.nil?
-        names = []
-        n = df.width
-        if include_header
-          names << header_name
-          n -= 1
-        end
-        column_names = column_names.each
-        n.times do
-          names << column_names.next
-        end
-        df.columns = names
-      end
-      df
+      keep_names_as = include_header ? header_name : nil
+      _from_rbdf(_df.transpose(keep_names_as, column_names))
     end
     # Reverse the DataFrame.
@@ -1491,13 +1481,9 @@ module Polars
     #   # │ 1   ┆ 6.0 ┆ a   │
     #   # └─────┴─────┴─────┘
     def sort(by, reverse: false, nulls_last: false)
-      if by.is_a?(Array) || by.is_a?(Expr)
-        lazy
-          .sort(by, reverse: reverse, nulls_last: nulls_last)
-          .collect(no_optimization: true, string_cache: false)
-      else
-        _from_rbdf(_df.sort(by, reverse, nulls_last))
-      end
+      lazy
+        .sort(by, reverse: reverse, nulls_last: nulls_last)
+        .collect(no_optimization: true)
     end
     # Sort the DataFrame by column in-place.
@@ -1808,13 +1794,13 @@ module Polars
       _from_rbdf(_df.with_row_count(name, offset))
     end
-    # Start a groupby operation.
+    # Start a group by operation.
     #
     # @param by [Object]
     #   Column(s) to group by.
     # @param maintain_order [Boolean]
     #   Make sure that the order of the groups remain consistent. This is more
-    #   expensive than a default groupby. Note that this only works in expression
+    #   expensive than a default group by. Note that this only works in expression
     #   aggregations.
     #
     # @return [GroupBy]
@@ -1827,7 +1813,7 @@ module Polars
     #       "c" => [6, 5, 4, 3, 2, 1]
     #     }
     #   )
-    #   df.groupby("a").agg(Polars.col("b").sum).sort("a")
+    #   df.group_by("a").agg(Polars.col("b").sum).sort("a")
     #   # =>
     #   # shape: (3, 2)
     #   # ┌─────┬─────┐
@@ -1839,25 +1825,26 @@ module Polars
     #   # │ b   ┆ 11  │
     #   # │ c   ┆ 6   │
     #   # └─────┴─────┘
-    def groupby(by, maintain_order: false)
+    def group_by(by, maintain_order: false)
       if !Utils.bool?(maintain_order)
-        raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
+        raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
       end
       GroupBy.new(
-        _df,
+        self,
         by,
-        self.class,
         maintain_order: maintain_order
       )
     end
+    alias_method :groupby, :group_by
+    alias_method :group, :group_by
     # Create rolling groups based on a time column.
     #
     # Also works for index values of type `:i32` or `:i64`.
     #
-    # Different from a `dynamic_groupby` the windows are now determined by the
+    # Different from a `dynamic_group_by` the windows are now determined by the
     # individual values and are not of constant intervals. For constant intervals use
-    # *groupby_dynamic*
+    # *group_by_dynamic*
     #
     # The `period` and `offset` arguments are created either from a timedelta, or
     # by using the following string language:
@@ -1877,7 +1864,7 @@ module Polars
     # Or combine them:
     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
     #
-    # In case of a groupby_rolling on an integer column, the windows are defined by:
+    # In case of a group_by_rolling on an integer column, the windows are defined by:
     #
     # - **"1i"      # length 1**
     # - **"10i"     # length 10**
@@ -1888,7 +1875,7 @@ module Polars
     #   This column must be sorted in ascending order. If not the output will not
     #   make sense.
     #
-    #   In case of a rolling groupby on indices, dtype needs to be one of
+    #   In case of a rolling group by on indices, dtype needs to be one of
     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
     #   performance matters use an `:i64` column.
     # @param period [Object]
@@ -1899,6 +1886,12 @@ module Polars
     #   Define whether the temporal window interval is closed or not.
     # @param by [Object]
     #   Also group by this column/these columns.
+    # @param check_sorted [Boolean]
+    #   When the `by` argument is given, polars can not check sortedness
+    #   by the metadata and has to do a full scan on the index column to
+    #   verify data is sorted. This is expensive. If you are sure the
+    #   data within the by groups is sorted, you can set this to `false`.
+    #   Doing so incorrectly will lead to incorrect output
     #
     # @return [RollingGroupBy]
     #
@@ -1912,9 +1905,9 @@ module Polars
     #     "2020-01-08 23:16:43"
     #   ]
     #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
-    #     Polars.col("dt").str.strptime(Polars::Datetime)
+    #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
-    #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
+    #   df.group_by_rolling(index_column: "dt", period: "2d").agg(
     #     [
     #       Polars.sum("a").alias("sum_a"),
     #       Polars.min("a").alias("min_a"),
@@ -1935,20 +1928,22 @@ module Polars
     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
     #   # └─────────────────────┴───────┴───────┴───────┘
-    def groupby_rolling(
+    def group_by_rolling(
       index_column:,
       period:,
       offset: nil,
       closed: "right",
-      by: nil
+      by: nil,
+      check_sorted: true
     )
-      RollingGroupBy.new(self, index_column, period, offset, closed, by)
+      RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
     end
+    alias_method :groupby_rolling, :group_by_rolling
     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
     # Time windows are calculated and rows are assigned to windows. Different from a
-    # normal groupby is that a row can be member of multiple groups. The time/index
+    # normal group by is that a row can be member of multiple groups. The time/index
     # window could be seen as a rolling window, with a window size determined by
     # dates/times/values instead of slots in the DataFrame.
     #
@@ -1976,7 +1971,7 @@ module Polars
     # Or combine them:
     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
     #
-    # In case of a groupby_dynamic on an integer column, the windows are defined by:
+    # In case of a group_by_dynamic on an integer column, the windows are defined by:
     #
     # - "1i"      # length 1
     # - "10i"     # length 10
@@ -1987,7 +1982,7 @@ module Polars
     #   This column must be sorted in ascending order. If not the output will not
     #   make sense.
     #
-    #   In case of a dynamic groupby on indices, dtype needs to be one of
+    #   In case of a dynamic group by on indices, dtype needs to be one of
     #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
     #   performance matters use an `:i64` column.
     # @param every
@@ -2038,7 +2033,7 @@ module Polars
     #   # └─────────────────────┴─────┘
     #
     # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
-    #   df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "right").agg(
     #     [
     #       Polars.col("time").min.alias("time_min"),
     #       Polars.col("time").max.alias("time_max")
@@ -2058,7 +2053,7 @@ module Polars
     #   # └─────────────────────┴─────────────────────┴─────────────────────┘
     #
     # @example The window boundaries can also be added to the aggregation result.
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "time", every: "1h", include_boundaries: true, closed: "right"
     #   ).agg([Polars.col("time").count.alias("time_count")])
     #   # =>
@@ -2075,27 +2070,27 @@ module Polars
     #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
     #
     # @example When closed="left", should not include right end of interval.
-    #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "left").agg(
     #     [
     #       Polars.col("time").count.alias("time_count"),
-    #       Polars.col("time").list.alias("time_agg_list")
+    #       Polars.col("time").alias("time_agg_list")
     #     ]
     #   )
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
-    #   # │ time                ┆ time_count ┆ time_agg_list                       │
-    #   # │ ---                 ┆ ---        ┆ ---                                 │
-    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                  │
-    #   # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
-    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]               │
-    #   # └─────────────────────┴────────────┴─────────────────────────────────────┘
+    #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                     │
+    #   # │ ---                 ┆ ---        ┆ ---                               │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
+    #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
+    #   # └─────────────────────┴────────────┴───────────────────────────────────┘
     #
     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
-    #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
     #     [Polars.col("time").count.alias("time_count")]
     #   )
     #   # =>
@@ -2112,7 +2107,7 @@ module Polars
     #   # │ 2021-12-16 03:00:00 ┆ 1          │
     #   # └─────────────────────┴────────────┘
     #
-    # @example Dynamic groupbys can also be combined with grouping on normal keys.
+    # @example Dynamic group bys can also be combined with grouping on normal keys.
     #   df = Polars::DataFrame.new(
     #     {
     #       "time" => Polars.date_range(
@@ -2123,7 +2118,7 @@ module Polars
     #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
     #     }
     #   )
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "time",
     #     every: "1h",
     #     closed: "both",
@@ -2146,20 +2141,20 @@ module Polars
     #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
     #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
     #
-    # @example Dynamic groupby on an index column.
+    # @example Dynamic group by on an index column.
     #   df = Polars::DataFrame.new(
     #     {
     #       "idx" => Polars.arange(0, 6, eager: true),
     #       "A" => ["A", "A", "B", "B", "B", "C"]
     #     }
     #   )
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "idx",
     #     every: "2i",
     #     period: "3i",
     #     include_boundaries: true,
     #     closed: "right"
-    #   ).agg(Polars.col("A").list.alias("A_agg_list"))
+    #   ).agg(Polars.col("A").alias("A_agg_list"))
     #   # =>
     #   # shape: (3, 4)
     #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2171,7 +2166,7 @@ module Polars
     #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
     #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
     #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
-    def groupby_dynamic(
+    def group_by_dynamic(
       index_column,
       every:,
       period: nil,
@@ -2195,6 +2190,7 @@ module Polars
         start_by
       )
     end
+    alias_method :groupby_dynamic, :group_by_dynamic
     # Upsample a DataFrame at a regular frequency.
     #
@@ -2242,7 +2238,7 @@ module Polars
     #       "groups" => ["A", "B", "A", "B"],
     #       "values" => [0, 1, 2, 3]
     #     }
-    #   )
+    #   ).set_sorted("time")
     #   df.upsample(
     #     time_column: "time", every: "1mo", by: "groups", maintain_order: true
     #   ).select(Polars.all.forward_fill)
@@ -2360,7 +2356,7 @@ module Polars
     #       ],  # note record date: Jan 1st (sorted!)
     #       "gdp" => [4164, 4411, 4566, 4696]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population = Polars::DataFrame.new(
     #     {
     #       "date" => [
@@ -2371,7 +2367,7 @@ module Polars
     #       ],  # note record date: May 12th (sorted!)
     #       "population" => [82.19, 82.66, 83.12, 83.52]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population.join_asof(
     #     gdp, left_on: "date", right_on: "date", strategy: "backward"
     #   )
@@ -2674,7 +2670,7 @@ module Polars
     #   # │ 3   ┆ 8   ┆ c   ┆ 30    │
     #   # └─────┴─────┴─────┴───────┘
     def hstack(columns, in_place: false)
-      if !columns.is_a?(Array)
+      if !columns.is_a?(::Array)
         columns = columns.get_columns
       end
       if in_place
@@ -2804,7 +2800,7 @@ module Polars
     #   # │ 3   ┆ 8.0 │
     #   # └─────┴─────┘
     def drop(columns)
-      if columns.is_a?(Array)
+      if columns.is_a?(::Array)
         df = clone
         columns.each do |n|
           df._df.drop_in_place(n)
@@ -3317,7 +3313,7 @@ module Polars
       n_fill = n_cols * n_rows - height
       if n_fill > 0
-        if !fill_values.is_a?(Array)
+        if !fill_values.is_a?(::Array)
           fill_values = [fill_values] * df.width
         end
@@ -3426,36 +3422,38 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ C   ┆ 2   ┆ l   │
     #   # └─────┴─────┴─────┘}
-    def partition_by(groups, maintain_order: true, as_dict: false)
+    def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
       if groups.is_a?(String)
         groups = [groups]
-      elsif !groups.is_a?(Array)
+      elsif !groups.is_a?(::Array)
         groups = Array(groups)
       end
       if as_dict
         out = {}
         if groups.length == 1
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
             df = _from_rbdf(df)
             out[df[groups][0, 0]] = df
           end
         else
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
             df = _from_rbdf(df)
             out[df[groups].row(0)] = df
           end
         end
         out
       else
-        _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+        _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
       end
     end
     # Shift values by the given period.
     #
-    # @param periods [Integer]
+    # @param n [Integer]
     #   Number of places to shift (may be negative).
+    # @param fill_value [Object]
+    #  Fill the resulting null values with this value.
     #
     # @return [DataFrame]
     #
@@ -3493,8 +3491,8 @@ module Polars
     #   # │ 3    ┆ 8    ┆ c    │
     #   # │ null ┆ null ┆ null │
     #   # └──────┴──────┴──────┘
-    def shift(periods)
-      _from_rbdf(_df.shift(periods))
+    def shift(n, fill_value: nil)
+      lazy.shift(n, fill_value: fill_value).collect(_eager: true)
     end
     # Shift the values by a given period and fill the resulting null values.
@@ -3527,9 +3525,7 @@ module Polars
     #   # │ 2   ┆ 7   ┆ b   │
     #   # └─────┴─────┴─────┘
     def shift_and_fill(periods, fill_value)
-      lazy
-        .shift_and_fill(periods, fill_value)
-        .collect(no_optimization: true, string_cache: false)
+      shift(periods, fill_value: fill_value)
     end
     # Get a mask of all duplicated rows in this DataFrame.
@@ -3716,7 +3712,7 @@ module Polars
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
     def with_columns(exprs)
-      if !exprs.nil? && !exprs.is_a?(Array)
+      if !exprs.nil? && !exprs.is_a?(::Array)
         exprs = [exprs]
       end
       lazy
@@ -3780,7 +3776,7 @@ module Polars
       if axis == 0
         _from_rbdf(_df.max)
       elsif axis == 1
-        Utils.wrap_s(_df.hmax)
+        Utils.wrap_s(_df.max_horizontal)
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3812,7 +3808,7 @@ module Polars
       if axis == 0
         _from_rbdf(_df.min)
       elsif axis == 1
-        Utils.wrap_s(_df.hmin)
+        Utils.wrap_s(_df.min_horizontal)
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3861,7 +3857,7 @@ module Polars
       when 0
         _from_rbdf(_df.sum)
       when 1
-        Utils.wrap_s(_df.hsum(null_strategy))
+        Utils.wrap_s(_df.sum_horizontal(null_strategy))
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3899,7 +3895,7 @@ module Polars
       when 0
         _from_rbdf(_df.mean)
       when 1
-        Utils.wrap_s(_df.hmean(null_strategy))
+        Utils.wrap_s(_df.mean_horizontal(null_strategy))
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -4097,11 +4093,11 @@ module Polars
     #   # │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     │
     #   # │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     │
     #   # └───────┴───────┴───────┴───────┴───────┴───────┘
-    def to_dummies(columns: nil, separator: "_")
+    def to_dummies(columns: nil, separator: "_", drop_first: false)
       if columns.is_a?(String)
         columns = [columns]
       end
-      _from_rbdf(_df.to_dummies(columns, separator))
+      _from_rbdf(_df.to_dummies(columns, separator, drop_first))
     end
     # Drop duplicate rows from this DataFrame.
@@ -4189,7 +4185,7 @@ module Polars
         subset = [subset]
       end
-      if subset.is_a?(Array) && subset.length == 1
+      if subset.is_a?(::Array) && subset.length == 1
         expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
       else
         struct_fields = subset.nil? ? Polars.all : subset
@@ -4284,15 +4280,20 @@ module Polars
       end
       if n.nil? && !frac.nil?
+        frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
         _from_rbdf(
-          _df.sample_frac(frac, with_replacement, shuffle, seed)
+          _df.sample_frac(frac._s, with_replacement, shuffle, seed)
         )
       end
       if n.nil?
         n = 1
       end
-      _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
+      n = Series.new("", [n]) unless n.is_a?(Series)
+      _from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
     end
     # Apply a horizontal reduction on a DataFrame.
@@ -4591,7 +4592,7 @@ module Polars
     #
     # @example
     #   s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
-    #   s.take_every(2)
+    #   s.gather_every(2)
     #   # =>
     #   # shape: (2, 2)
     #   # ┌─────┬─────┐
@@ -4602,9 +4603,10 @@ module Polars
     #   # │ 1   ┆ 5   │
     #   # │ 3   ┆ 7   │
     #   # └─────┴─────┘
-    def take_every(n)
-      select(Utils.col("*").take_every(n))
+    def gather_every(n)
+      select(Utils.col("*").gather_every(n))
     end
+    alias_method :take_every, :gather_every
     # Hash and combine the rows in this DataFrame.
     #
@@ -4661,16 +4663,16 @@ module Polars
     #   df.interpolate
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────┬──────┬─────┐
-    #   # │ foo ┆ bar  ┆ baz │
-    #   # │ --- ┆ ---  ┆ --- │
-    #   # │ i64 ┆ i64  ┆ i64 │
-    #   # ╞═════╪══════╪═════╡
-    #   # │ 1   ┆ 6    ┆ 1   │
-    #   # │ 5   ┆ 7    ┆ 3   │
-    #   # │ 9   ┆ 9    ┆ 6   │
-    #   # │ 10  ┆ null ┆ 9   │
-    #   # └─────┴──────┴─────┘
+    #   # ┌──────┬──────┬──────────┐
+    #   # │ foo  ┆ bar  ┆ baz      │
+    #   # │ ---  ┆ ---  ┆ ---      │
+    #   # │ f64  ┆ f64  ┆ f64      │
+    #   # ╞══════╪══════╪══════════╡
+    #   # │ 1.0  ┆ 6.0  ┆ 1.0      │
+    #   # │ 5.0  ┆ 7.0  ┆ 3.666667 │
+    #   # │ 9.0  ┆ 9.0  ┆ 6.333333 │
+    #   # │ 10.0 ┆ null ┆ 9.0      │
+    #   # └──────┴──────┴──────────┘
     def interpolate
       select(Utils.col("*").interpolate)
     end
@@ -4758,6 +4760,38 @@ module Polars
       _from_rbdf(_df.unnest(names))
     end
+    # TODO
+    # def corr
+    # end
+    # TODO
+    # def merge_sorted
+    # end
+    # Indicate that one or multiple columns are sorted.
+    #
+    # @param column [Object]
+    #   Columns that are sorted
+    # @param more_columns [Object]
+    #   Additional columns that are sorted, specified as positional arguments.
+    # @param descending [Boolean]
+    #   Whether the columns are sorted in descending order.
+    #
+    # @return [DataFrame]
+    def set_sorted(
+      column,
+      *more_columns,
+      descending: false
+    )
+      lazy
+        .set_sorted(column, *more_columns, descending: descending)
+        .collect(no_optimization: true)
+    end
+    # TODO
+    # def update
+    # end
     private
     def initialize_copy(other)
@@ -4910,8 +4944,8 @@ module Polars
           [lookup[col[0]] || col[0], col[1]]
         end
-      if schema_overrides
-        raise Todo
+      if schema_overrides && schema_overrides.any?
+        column_dtypes.merge!(schema_overrides)
       end
       column_dtypes.each do |col, dtype|
@@ -4967,7 +5001,7 @@ module Polars
       columns.each do |col, i|
         if dtypes[col] == Categorical # != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(Categorical)._rbexpr
-        elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
+        elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(structs[col])._rbexpr
         elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -5012,15 +5046,56 @@ module Polars
           rbdf = _post_apply_columns(rbdf, column_names)
         end
         return rbdf
-      elsif data[0].is_a?(Array)
+      elsif data[0].is_a?(::Array)
         if orient.nil? && !columns.nil?
-          orient = columns.length == data.length ? "col" : "row"
+          first_element = data[0]
+          row_types = first_element.filter_map { |value| value.class }.uniq
+          if row_types.include?(Integer) && row_types.include?(Float)
+            row_types.delete(Integer)
+          end
+          orient = row_types.length == 1 ? "col" : "row"
         end
         if orient == "row"
-          raise Todo
+          column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides: schema_overrides, n_expected: first_element.length
+          )
+          local_schema_override = (
+            schema_overrides.any? ? (raise Todo) : {}
+          )
+          if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
+            raise ArgumentError, "the row data does not match the number of columns"
+          end
+          unpack_nested = false
+          local_schema_override.each do |col, tp|
+            raise Todo
+          end
+          if unpack_nested
+            raise Todo
+          else
+            rbdf = RbDataFrame.read_rows(
+              data,
+              infer_schema_length,
+              local_schema_override.any? ? local_schema_override : nil
+            )
+          end
+          if column_names.any? || schema_overrides.any?
+            rbdf = _post_apply_columns(
+              rbdf, column_names, schema_overrides: schema_overrides
+            )
+          end
+          return rbdf
         elsif orient == "col" || orient.nil?
-          raise Todo
+          column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides: schema_overrides, n_expected: data.length
+          )
+          data_series =
+            data.map.with_index do |element, i|
+              Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+            end
+          return RbDataFrame.new(data_series)
         else
           raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
         end
@@ -5066,10 +5141,10 @@ module Polars
     def _compare_to_other_df(other, op)
       if columns != other.columns
-        raise ArgmentError, "DataFrame columns do not match"
+        raise ArgumentError, "DataFrame columns do not match"
       end
       if shape != other.shape
-        raise ArgmentError, "DataFrame dimensions do not match"
+        raise ArgumentError, "DataFrame dimensions do not match"
       end
       suffix = "__POLARS_CMP_OTHER"
@@ -5117,7 +5192,7 @@ module Polars
     def _prepare_other_arg(other)
       if !other.is_a?(Series)
-        if other.is_a?(Array)
+        if other.is_a?(::Array)
           raise ArgumentError, "Operation not supported."
         end