RubyGems - polars-df - Versions diffs - 0.4.0-x86_64-darwin → 0.6.0-x86_64-darwin - Mend

polars-df 0.4.0-x86_64-darwin → 0.6.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/CHANGELOG.md +26 -0
data/Cargo.lock +447 -410
data/Cargo.toml +0 -1
data/LICENSE-THIRD-PARTY.txt +2142 -972
data/README.md +6 -5
data/lib/polars/3.0/polars.bundle +0 -0
data/lib/polars/3.1/polars.bundle +0 -0
data/lib/polars/3.2/polars.bundle +0 -0
data/lib/polars/array_expr.rb +84 -0
data/lib/polars/array_name_space.rb +77 -0
data/lib/polars/batched_csv_reader.rb +1 -1
data/lib/polars/convert.rb +2 -2
data/lib/polars/data_frame.rb +289 -96
data/lib/polars/data_types.rb +169 -33
data/lib/polars/date_time_expr.rb +142 -2
data/lib/polars/date_time_name_space.rb +17 -3
data/lib/polars/expr.rb +145 -78
data/lib/polars/functions.rb +0 -1
data/lib/polars/group_by.rb +1 -22
data/lib/polars/lazy_frame.rb +84 -31
data/lib/polars/lazy_functions.rb +71 -32
data/lib/polars/list_expr.rb +94 -45
data/lib/polars/list_name_space.rb +13 -13
data/lib/polars/rolling_group_by.rb +4 -2
data/lib/polars/series.rb +249 -87
data/lib/polars/string_expr.rb +277 -45
data/lib/polars/string_name_space.rb +137 -22
data/lib/polars/struct_name_space.rb +32 -0
data/lib/polars/utils.rb +138 -54
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +5 -2
metadata +4 -2

data/lib/polars/data_frame.rb CHANGED

@@ -18,7 +18,10 @@ module Polars
     #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
     #   the orientation is inferred by matching the columns and data dimensions. If
     #   this does not yield conclusive results, column orientation is used.
-    def initialize(data = nil, columns: nil, orient: nil)
+    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+      schema ||= columns
+      raise Todo if schema_overrides
       # TODO deprecate in favor of read_sql
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
         result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
@@ -29,14 +32,14 @@ module Polars
       end
       if data.nil?
-        self._df = self.class.hash_to_rbdf({}, columns: columns)
+        self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-        self._df = self.class.hash_to_rbdf(data, columns: columns)
-      elsif data.is_a?(Array)
-        self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
+        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+      elsif data.is_a?(::Array)
+        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
-        self._df = self.class.series_to_rbdf(data, columns: columns)
+        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
       else
         raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
       end
@@ -56,8 +59,8 @@ module Polars
     end
     # @private
-    def self._from_hash(data, columns: nil)
-      _from_rbdf(hash_to_rbdf(data, columns: columns))
+    def self._from_hash(data, schema: nil, schema_overrides: nil)
+      _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
     end
     # def self._from_records
@@ -113,7 +116,7 @@ module Polars
           dtypes.each do|k, v|
             dtype_list << [k, Utils.rb_type_to_dtype(v)]
           end
-        elsif dtypes.is_a?(Array)
+        elsif dtypes.is_a?(::Array)
           dtype_slice = dtypes
         else
           raise ArgumentError, "dtype arg should be list or dict"
@@ -336,6 +339,7 @@ module Polars
     end
     alias_method :count, :height
     alias_method :length, :height
+    alias_method :size, :height
     # Get the width of the DataFrame.
     #
@@ -546,6 +550,13 @@ module Polars
     end
     alias_method :inspect, :to_s
+    # Returns an array representing the DataFrame
+    #
+    # @return [Array]
+    def to_a
+      rows(named: true)
+    end
     # Check if DataFrame includes column.
     #
     # @return [Boolean]
@@ -579,7 +590,7 @@ module Polars
         # df[2, ..] (select row as df)
         if row_selection.is_a?(Integer)
-          if col_selection.is_a?(Array)
+          if col_selection.is_a?(::Array)
             df = self[0.., col_selection]
             return df.slice(row_selection, 1)
           end
@@ -600,7 +611,7 @@ module Polars
           return series[row_selection]
         end
-        if col_selection.is_a?(Array)
+        if col_selection.is_a?(::Array)
           # df[.., [1, 2]]
           if Utils.is_int_sequence(col_selection)
             series_list = col_selection.map { |i| to_series(i) }
@@ -630,7 +641,7 @@ module Polars
           return Slice.new(self).apply(item)
         end
-        if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
+        if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
           # select multiple columns
           # df[["foo", "bar"]]
           return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -655,7 +666,7 @@ module Polars
       end
       # Ruby-specific
-      if item.is_a?(Expr)
+      if item.is_a?(Expr) || item.is_a?(Series)
         return filter(item)
       end
@@ -665,15 +676,42 @@ module Polars
     # Set item.
     #
     # @return [Object]
-    #
-    # def []=(key, value)
-    #   if key.is_a?(String)
-    #     raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
-    #   end
+    def []=(*key, value)
+      if key.length == 1
+        key = key.first
+      elsif key.length != 2
+        raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
+      end
-    #   raise Todo
-    # end
+      if Utils.strlike?(key)
+        if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
+          value = Series.new(value)
+        elsif !value.is_a?(Series)
+          value = Polars.lit(value)
+        end
+        self._df = with_column(value.alias(key.to_s))._df
+      elsif key.is_a?(::Array)
+        row_selection, col_selection = key
+        if Utils.strlike?(col_selection)
+          s = self[col_selection]
+        elsif col_selection.is_a?(Integer)
+          raise Todo
+        else
+          raise ArgumentError, "column selection not understood: #{col_selection}"
+        end
+        s[row_selection] = value
+        if col_selection.is_a?(Integer)
+          replace_at_idx(col_selection, s)
+        elsif Utils.strlike?(col_selection)
+          replace(col_selection, s)
+        end
+      else
+        raise Todo
+      end
+    end
     # Return the dataframe as a scalar.
     #
@@ -956,14 +994,21 @@ module Polars
     #
     # @return [nil]
     def write_ipc(file, compression: "uncompressed")
-      if compression.nil?
-        compression = "uncompressed"
+      return_bytes = file.nil?
+      if return_bytes
+        file = StringIO.new
+        file.set_encoding(Encoding::BINARY)
       end
       if Utils.pathlike?(file)
         file = Utils.normalise_filepath(file)
       end
+      if compression.nil?
+        compression = "uncompressed"
+      end
       _df.write_ipc(file, compression)
+      return_bytes ? file.string : nil
     end
     # Write to Apache Parquet file.
@@ -1453,13 +1498,23 @@ module Polars
     #   # │ 1   ┆ 6.0 ┆ a   │
     #   # └─────┴─────┴─────┘
     def sort(by, reverse: false, nulls_last: false)
-      if by.is_a?(Array) || by.is_a?(Expr)
-        lazy
-          .sort(by, reverse: reverse, nulls_last: nulls_last)
-          .collect(no_optimization: true, string_cache: false)
-      else
-        _from_rbdf(_df.sort(by, reverse, nulls_last))
-      end
+      lazy
+        .sort(by, reverse: reverse, nulls_last: nulls_last)
+        .collect(no_optimization: true)
+    end
+    # Sort the DataFrame by column in-place.
+    #
+    # @param by [String]
+    #   By which column to sort.
+    # @param reverse [Boolean]
+    #   Reverse/descending sort.
+    # @param nulls_last [Boolean]
+    #   Place null values last. Can only be used if sorted by a single column.
+    #
+    # @return [DataFrame]
+    def sort!(by, reverse: false, nulls_last: false)
+      self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
     end
     # Check if DataFrame is equal to other.
@@ -1519,7 +1574,7 @@ module Polars
     #   # │ 30  ┆ 6   │
     #   # └─────┴─────┘
     def replace(column, new_col)
-      _df.replace(column, new_col._s)
+      _df.replace(column.to_s, new_col._s)
       self
     end
@@ -1847,6 +1902,12 @@ module Polars
     #   Define whether the temporal window interval is closed or not.
     # @param by [Object]
     #   Also group by this column/these columns.
+    # @param check_sorted [Boolean]
+    #   When the `by` argument is given, polars can not check sortedness
+    #   by the metadata and has to do a full scan on the index column to
+    #   verify data is sorted. This is expensive. If you are sure the
+    #   data within the by groups is sorted, you can set this to `false`.
+    #   Doing so incorrectly will lead to incorrect output
     #
     # @return [RollingGroupBy]
     #
@@ -1860,7 +1921,7 @@ module Polars
     #     "2020-01-08 23:16:43"
     #   ]
     #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
-    #     Polars.col("dt").str.strptime(:datetime)
+    #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
     #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
     #     [
@@ -1888,9 +1949,10 @@ module Polars
       period:,
       offset: nil,
       closed: "right",
-      by: nil
+      by: nil,
+      check_sorted: true
     )
-      RollingGroupBy.new(self, index_column, period, offset, closed, by)
+      RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
     end
     # Group based on a time value (or index value of type `:i32`, `:i64`).
@@ -2026,21 +2088,21 @@ module Polars
     #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
     #     [
     #       Polars.col("time").count.alias("time_count"),
-    #       Polars.col("time").list.alias("time_agg_list")
+    #       Polars.col("time").alias("time_agg_list")
     #     ]
     #   )
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
-    #   # │ time                ┆ time_count ┆ time_agg_list                       │
-    #   # │ ---                 ┆ ---        ┆ ---                                 │
-    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                  │
-    #   # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
-    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]               │
-    #   # └─────────────────────┴────────────┴─────────────────────────────────────┘
+    #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                     │
+    #   # │ ---                 ┆ ---        ┆ ---                               │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
+    #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
+    #   # └─────────────────────┴────────────┴───────────────────────────────────┘
     #
     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
     #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -2107,7 +2169,7 @@ module Polars
     #     period: "3i",
     #     include_boundaries: true,
     #     closed: "right"
-    #   ).agg(Polars.col("A").list.alias("A_agg_list"))
+    #   ).agg(Polars.col("A").alias("A_agg_list"))
     #   # =>
     #   # shape: (3, 4)
     #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2190,7 +2252,7 @@ module Polars
     #       "groups" => ["A", "B", "A", "B"],
     #       "values" => [0, 1, 2, 3]
     #     }
-    #   )
+    #   ).set_sorted("time")
     #   df.upsample(
     #     time_column: "time", every: "1mo", by: "groups", maintain_order: true
     #   ).select(Polars.all.forward_fill)
@@ -2308,7 +2370,7 @@ module Polars
     #       ],  # note record date: Jan 1st (sorted!)
     #       "gdp" => [4164, 4411, 4566, 4696]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population = Polars::DataFrame.new(
     #     {
     #       "date" => [
@@ -2319,7 +2381,7 @@ module Polars
     #       ],  # note record date: May 12th (sorted!)
     #       "population" => [82.19, 82.66, 83.12, 83.52]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population.join_asof(
     #     gdp, left_on: "date", right_on: "date", strategy: "backward"
     #   )
@@ -2622,7 +2684,7 @@ module Polars
     #   # │ 3   ┆ 8   ┆ c   ┆ 30    │
     #   # └─────┴─────┴─────┴───────┘
     def hstack(columns, in_place: false)
-      if !columns.is_a?(Array)
+      if !columns.is_a?(::Array)
         columns = columns.get_columns
       end
       if in_place
@@ -2752,7 +2814,7 @@ module Polars
     #   # │ 3   ┆ 8.0 │
     #   # └─────┴─────┘
     def drop(columns)
-      if columns.is_a?(Array)
+      if columns.is_a?(::Array)
         df = clone
         columns.each do |n|
           df._df.drop_in_place(n)
@@ -2791,6 +2853,16 @@ module Polars
       Utils.wrap_s(_df.drop_in_place(name))
     end
+    # Drop in place if exists.
+    #
+    # @param name [Object]
+    #   Column to drop.
+    #
+    # @return [Series]
+    def delete(name)
+      drop_in_place(name) if include?(name)
+    end
     # Create an empty copy of the current DataFrame.
     #
     # Returns a DataFrame with identical schema but no data.
@@ -3202,7 +3274,7 @@ module Polars
     #   # │ B    ┆ 1    │
     #   # │ C    ┆ 2    │
     #   # │ D    ┆ 3    │
-    #   # │ …    ┆ …    │
+    #   # │ E    ┆ 4    │
     #   # │ F    ┆ 5    │
     #   # │ G    ┆ 6    │
     #   # │ H    ┆ 7    │
@@ -3255,7 +3327,7 @@ module Polars
       n_fill = n_cols * n_rows - height
       if n_fill > 0
-        if !fill_values.is_a?(Array)
+        if !fill_values.is_a?(::Array)
           fill_values = [fill_values] * df.width
         end
@@ -3364,29 +3436,29 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ C   ┆ 2   ┆ l   │
     #   # └─────┴─────┴─────┘}
-    def partition_by(groups, maintain_order: true, as_dict: false)
+    def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
       if groups.is_a?(String)
         groups = [groups]
-      elsif !groups.is_a?(Array)
+      elsif !groups.is_a?(::Array)
         groups = Array(groups)
       end
       if as_dict
         out = {}
         if groups.length == 1
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
             df = _from_rbdf(df)
             out[df[groups][0, 0]] = df
           end
         else
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
             df = _from_rbdf(df)
             out[df[groups].row(0)] = df
           end
         end
         out
       else
-        _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+        _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
       end
     end
@@ -3654,7 +3726,7 @@ module Polars
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
     def with_columns(exprs)
-      if !exprs.nil? && !exprs.is_a?(Array)
+      if !exprs.nil? && !exprs.is_a?(::Array)
         exprs = [exprs]
       end
       lazy
@@ -4035,11 +4107,11 @@ module Polars
     #   # │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     │
     #   # │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     │
     #   # └───────┴───────┴───────┴───────┴───────┴───────┘
-    def to_dummies(columns: nil, separator: "_")
+    def to_dummies(columns: nil, separator: "_", drop_first: false)
       if columns.is_a?(String)
         columns = [columns]
       end
-      _from_rbdf(_df.to_dummies(columns, separator))
+      _from_rbdf(_df.to_dummies(columns, separator, drop_first))
     end
     # Drop duplicate rows from this DataFrame.
@@ -4127,7 +4199,7 @@ module Polars
         subset = [subset]
       end
-      if subset.is_a?(Array) && subset.length == 1
+      if subset.is_a?(::Array) && subset.length == 1
         expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
       else
         struct_fields = subset.nil? ? Polars.all : subset
@@ -4428,7 +4500,7 @@ module Polars
       end
     end
-    # Returns an iterator over the DataFrame of rows of python-native values.
+    # Returns an iterator over the DataFrame of rows of Ruby-native values.
     #
     # @param named [Boolean]
     #   Return hashes instead of arrays. The hashes are a mapping of
@@ -4489,6 +4561,24 @@ module Polars
       end
     end
+    # Returns an iterator over the DataFrame of rows of Ruby-native values.
+    #
+    # @param named [Boolean]
+    #   Return hashes instead of arrays. The hashes are a mapping of
+    #   column name to row value. This is more expensive than returning an
+    #   array, but allows for accessing values by column name.
+    # @param buffer_size [Integer]
+    #   Determines the number of rows that are buffered internally while iterating
+    #   over the data; you should only modify this in very specific cases where the
+    #   default value is determined not to be a good fit to your access pattern, as
+    #   the speedup from using the buffer is significant (~2-4x). Setting this
+    #   value to zero disables row buffering.
+    #
+    # @return [Object]
+    def each_row(named: true, buffer_size: 500, &block)
+      iter_rows(named: named, buffer_size: buffer_size, &block)
+    end
     # Shrink DataFrame memory usage.
     #
     # Shrinks to fit the exact capacity needed to hold the data.
@@ -4678,6 +4768,38 @@ module Polars
       _from_rbdf(_df.unnest(names))
     end
+    # TODO
+    # def corr
+    # end
+    # TODO
+    # def merge_sorted
+    # end
+    # Indicate that one or multiple columns are sorted.
+    #
+    # @param column [Object]
+    #   Columns that are sorted
+    # @param more_columns [Object]
+    #   Additional columns that are sorted, specified as positional arguments.
+    # @param descending [Boolean]
+    #   Whether the columns are sorted in descending order.
+    #
+    # @return [DataFrame]
+    def set_sorted(
+      column,
+      *more_columns,
+      descending: false
+    )
+      lazy
+        .set_sorted(column, *more_columns, descending: descending)
+        .collect(no_optimization: true)
+    end
+    # TODO
+    # def update
+    # end
     private
     def initialize_copy(other)
@@ -4742,20 +4864,63 @@ module Polars
     end
     # @private
-    def self.hash_to_rbdf(data, columns: nil)
-      if !columns.nil?
-        columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
+    def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+      updated_data = {}
+      unless data.empty?
+        dtypes = schema_overrides || {}
+        array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
+        if array_len > 0
+          data.each do |name, val|
+            dtype = dtypes[name]
+            if val.is_a?(Hash) && dtype != Struct
+              updated_data[name] = DataFrame.new(val).to_struct(name)
+            elsif !Utils.arrlen(val).nil?
+              updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
+            elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
+              dtype = Polars::Float64 if val.nil? && dtype.nil?
+              updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+            else
+              raise Todo
+            end
+          end
+        elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
+          data.each do |name, val|
+            updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+          end
+        elsif data.values.all? { |val| Utils.arrlen(val).nil? }
+          data.each do |name, val|
+            updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+          end
+        end
+      end
+      updated_data
+    end
-        if data.empty? && dtypes
-          data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
-        else
-          data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
+    # @private
+    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+      if schema.is_a?(Hash) && !data.empty?
+        if !data.all? { |col, _| schema[col] }
+          raise ArgumentError, "The given column-schema names do not match the data dictionary"
         end
-        data_series = _handle_columns_arg(data_series, columns: columns)
-        return RbDataFrame.new(data_series)
+        data = schema.to_h { |col| [col, data[col]] }
       end
-      RbDataFrame.read_hash(data)
+      column_names, schema_overrides = _unpack_schema(
+        schema, lookup_names: data.keys, schema_overrides: schema_overrides
+      )
+      if column_names.empty?
+        column_names = data.keys
+      end
+      if data.empty? && !schema_overrides.empty?
+        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+      else
+        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+      end
+      data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
+      RbDataFrame.new(data_series)
     end
     # @private
@@ -4764,14 +4929,12 @@ module Polars
     end
     # @private
-    def self._unpack_columns(columns, schema_overrides: nil, lookup_names: nil, n_expected: nil)
-      raise Todo if schema_overrides
-      if columns.is_a?(Hash)
-        columns = columns.to_a
+    def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
+      if schema.is_a?(Hash)
+        schema = schema.to_a
       end
       column_names =
-        (columns || []).map.with_index do |col, i|
+        (schema || []).map.with_index do |col, i|
           if col.is_a?(String)
             col || "column_#{i}"
           else
@@ -4784,21 +4947,38 @@ module Polars
       # TODO zip_longest
       lookup = column_names.zip(lookup_names || []).to_h
-      [
-        column_names,
-        (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
+      column_dtypes =
+        (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
           [lookup[col[0]] || col[0], col[1]]
         end
-      ]
+      if schema_overrides
+        raise Todo
+      end
+      column_dtypes.each do |col, dtype|
+        if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
+          column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
+        end
+      end
+      [column_names, column_dtypes]
     end
-    def self._handle_columns_arg(data, columns: nil)
-      if columns.nil?
+    def self._handle_columns_arg(data, columns: nil, from_hash: false)
+      if columns.nil? || columns.empty?
         data
       else
         if data.empty?
           columns.map { |c| Series.new(c, nil)._s }
         elsif data.length == columns.length
+          if from_hash
+            series_map = data.to_h { |s| [s.name, s] }
+            if columns.all? { |col| series_map.key?(col) }
+              return columns.map { |col| series_map[col] }
+            end
+          end
           columns.each_with_index do |c, i|
             # not in-place?
             data[i].rename(c)
@@ -4813,7 +4993,7 @@ module Polars
     def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
       rbdf_columns = rbdf.columns
       rbdf_dtypes = rbdf.dtypes
-      columns, dtypes = _unpack_columns(
+      columns, dtypes = _unpack_schema(
         (columns || rbdf_columns), schema_overrides: schema_overrides
       )
       column_subset = []
@@ -4829,7 +5009,7 @@ module Polars
       columns.each do |col, i|
         if dtypes[col] == Categorical # != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(Categorical)._rbexpr
-        elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
+        elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(structs[col])._rbexpr
         elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -4851,27 +5031,30 @@ module Polars
     end
     # @private
-    def self.sequence_to_rbdf(data, columns: nil, orient: nil, infer_schema_length: 50)
+    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
+      raise Todo if schema_overrides
+      columns = schema
       if data.length == 0
-        return hash_to_rbdf({}, columns: columns)
+        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
       end
       if data[0].is_a?(Series)
         # series_names = data.map(&:name)
-        # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
+        # columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
         data_series = []
         data.each do |s|
           data_series << s._s
         end
       elsif data[0].is_a?(Hash)
-        column_names, dtypes = _unpack_columns(columns)
+        column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
         rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
         return rbdf
-      elsif data[0].is_a?(Array)
+      elsif data[0].is_a?(::Array)
         if orient.nil? && !columns.nil?
           orient = columns.length == data.length ? "col" : "row"
         end
@@ -4890,11 +5073,21 @@ module Polars
     end
     # @private
-    def self.series_to_rbdf(data, columns: nil)
-      if columns
-        raise Todo
+    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
+      data_series = [data._s]
+      series_name = data_series.map(&:name)
+      column_names, schema_overrides = _unpack_schema(
+        schema || series_name, schema_overrides: schema_overrides, n_expected: 1
+      )
+      if schema_overrides.any?
+        new_dtype = schema_overrides.values[0]
+        if new_dtype != data.dtype
+          data_series[0] = data_series[0].cast(new_dtype, true)
+        end
       end
-      RbDataFrame.new([data._s])
+      data_series = _handle_columns_arg(data_series, columns: column_names)
+      RbDataFrame.new(data_series)
     end
     def wrap_ldf(ldf)
@@ -4966,7 +5159,7 @@ module Polars
     def _prepare_other_arg(other)
       if !other.is_a?(Series)
-        if other.is_a?(Array)
+        if other.is_a?(::Array)
           raise ArgumentError, "Operation not supported."
         end