RubyGems - polars-df - Versions diffs - 0.13.0-x86_64-linux → 0.15.0-x86_64-linux - Mend

polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +4 -4
data/CHANGELOG.md +30 -0
data/Cargo.lock +1368 -319
data/LICENSE-THIRD-PARTY.txt +24801 -13447
data/LICENSE.txt +1 -0
data/README.md +1 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/batched_csv_reader.rb +0 -2
data/lib/polars/binary_expr.rb +133 -9
data/lib/polars/binary_name_space.rb +101 -6
data/lib/polars/config.rb +4 -0
data/lib/polars/data_frame.rb +285 -62
data/lib/polars/data_type_group.rb +28 -0
data/lib/polars/data_types.rb +2 -0
data/lib/polars/date_time_expr.rb +244 -0
data/lib/polars/date_time_name_space.rb +87 -0
data/lib/polars/expr.rb +109 -8
data/lib/polars/functions/as_datatype.rb +51 -2
data/lib/polars/functions/col.rb +1 -1
data/lib/polars/functions/eager.rb +1 -3
data/lib/polars/functions/lazy.rb +88 -10
data/lib/polars/functions/range/time_range.rb +21 -21
data/lib/polars/io/csv.rb +14 -16
data/lib/polars/io/database.rb +2 -2
data/lib/polars/io/ipc.rb +14 -12
data/lib/polars/io/ndjson.rb +10 -0
data/lib/polars/io/parquet.rb +168 -111
data/lib/polars/lazy_frame.rb +649 -15
data/lib/polars/list_name_space.rb +169 -0
data/lib/polars/selectors.rb +1144 -0
data/lib/polars/series.rb +470 -40
data/lib/polars/string_cache.rb +27 -1
data/lib/polars/string_expr.rb +0 -1
data/lib/polars/string_name_space.rb +73 -3
data/lib/polars/struct_name_space.rb +31 -7
data/lib/polars/utils/various.rb +5 -1
data/lib/polars/utils.rb +45 -10
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +2 -1
metadata +4 -3
data/lib/polars/functions.rb +0 -57

data/lib/polars/data_frame.rb CHANGED

@@ -8,17 +8,49 @@ module Polars
     # Create a new DataFrame.
     #
-    # @param data [Hash, Array, Series, nil]
-    #   Two-dimensional data in various forms. Hash must contain Arrays.
-    #   Array may contain Series.
-    # @param columns [Array, Hash, nil]
-    #   Column labels to use for resulting DataFrame. If specified, overrides any
-    #   labels already present in the data. Must match data dimensions.
-    # @param orient ["col", "row", nil]
-    #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+    # @param data [Object]
+    #   Two-dimensional data in various forms; hash input must contain arrays
+    #   or a range. Arrays may contain Series or other arrays.
+    # @param schema [Object]
+    #   The schema of the resulting DataFrame. The schema may be declared in several
+    #   ways:
+    #
+    #   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of (name,type) pairs; this is equivalent to the dictionary form.
+    #
+    #   If you supply a list of column names that does not match the names in the
+    #   underlying data, the names given here will overwrite them. The number
+    #   of names given in the schema should match the underlying data dimensions.
+    #
+    #   If set to `nil` (default), the schema is inferred from the data.
+    # @param schema_overrides [Hash]
+    #   Support type specification or override of one or more columns; note that
+    #   any dtypes inferred from the schema param will be overridden.
+    #
+    #   The number of entries in the schema should match the underlying data
+    #   dimensions, unless an array of hashes is being passed, in which case
+    #   a *partial* schema can be declared to prevent specific fields from being loaded.
+    # @param strict [Boolean]
+    #   Throw an error if any `data` value does not exactly match the given or inferred
+    #   data type for that column. If set to `false`, values that do not match the data
+    #   type are cast to that data type or, if casting is not possible, set to null
+    #   instead.
+    # @param orient ["col", "row"]
+    #   Whether to interpret two-dimensional data as columns or as rows. If nil,
     #   the orientation is inferred by matching the columns and data dimensions. If
     #   this does not yield conclusive results, column orientation is used.
-    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+    # @param infer_schema_length [Integer]
+    #   The maximum number of rows to scan for schema inference. If set to `nil`, the
+    #   full data may be scanned *(this can be slow)*. This parameter only applies if
+    #   the input data is a sequence or generator of rows; other input is read as-is.
+    # @param nan_to_null [Boolean]
+    #   If the data comes from one or more Numo arrays, can optionally convert input
+    #   data NaN values to null instead. This is a no-op for all other input data.
+    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+      if schema && columns
+        warn "columns is ignored when schema is passed"
+      end
       schema ||= columns
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
         self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
       elsif data.is_a?(::Array)
-        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
-        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+      elsif data.respond_to?(:arrow_c_stream)
+        # This uses the fact that RbSeries.from_arrow_c_stream will create a
+        # struct-typed Series. Then we unpack that to a DataFrame.
+        tmp_col_name = ""
+        s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+        self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
       else
         raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
       end
@@ -452,6 +490,11 @@ module Polars
       end
     end
+    # @private
+    def arrow_c_stream
+      _df.arrow_c_stream
+    end
     # Return the dataframe as a scalar.
     #
     # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -766,15 +809,18 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_avro(file, compression = "uncompressed")
+    def write_avro(file, compression = "uncompressed", name: "")
       if compression.nil?
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
         file = Utils.normalize_filepath(file)
       end
+      if name.nil?
+        name = ""
+      end
-      _df.write_avro(file, compression)
+      _df.write_avro(file, compression, name)
     end
     # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,7 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_ipc(file, compression: "uncompressed")
+    def write_ipc(file, compression: "uncompressed", compat_level: nil)
       return_bytes = file.nil?
       if return_bytes
         file = StringIO.new
@@ -795,11 +841,15 @@ module Polars
         file = Utils.normalize_filepath(file)
       end
+      if compat_level.nil?
+        compat_level = true
+      end
       if compression.nil?
         compression = "uncompressed"
       end
-      _df.write_ipc(file, compression)
+      _df.write_ipc(file, compression, compat_level)
       return_bytes ? file.string : nil
     end
@@ -826,7 +876,8 @@ module Polars
     #   df.write_ipc_stream("new_file.arrow")
     def write_ipc_stream(
       file,
-      compression: "uncompressed"
+      compression: "uncompressed",
+      compat_level: nil
     )
       return_bytes = file.nil?
       if return_bytes
@@ -836,11 +887,15 @@ module Polars
         file = Utils.normalize_filepath(file)
       end
+      if compat_level.nil?
+        compat_level = true
+      end
       if compression.nil?
         compression = "uncompressed"
       end
-      _df.write_ipc_stream(file, compression)
+      _df.write_ipc_stream(file, compression, compat_level)
       return_bytes ? file.string : nil
     end
@@ -1037,6 +1092,10 @@ module Polars
     #
     # @param mapping [Hash]
     #   Key value pairs that map from old name to new name.
+    # @param strict [Boolean]
+    #   Validate that all column names exist in the current schema,
+    #   and throw an exception if any do not. (Note that this parameter
+    #   is a no-op when passing a function to `mapping`).
     #
     # @return [DataFrame]
     #
@@ -1060,8 +1119,8 @@ module Polars
     #   # │ 2     ┆ 7   ┆ b   │
     #   # │ 3     ┆ 8   ┆ c   │
     #   # └───────┴─────┴─────┘
-    def rename(mapping)
-      lazy.rename(mapping).collect(no_optimization: true)
+    def rename(mapping, strict: true)
+      lazy.rename(mapping, strict: strict).collect(no_optimization: true)
     end
     # Insert a Series at a certain column index. This operation is in place.
@@ -2190,6 +2249,11 @@ module Polars
     # @param force_parallel [Boolean]
     #   Force the physical plan to evaluate the computation of both DataFrames up to
     #   the join in parallel.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #     - true: -> Always coalesce join columns.
+    #     - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2243,7 +2307,8 @@ module Polars
       suffix: "_right",
       tolerance: nil,
       allow_parallel: true,
-      force_parallel: false
+      force_parallel: false,
+      coalesce: true
     )
       lazy
         .join_asof(
@@ -2258,7 +2323,8 @@ module Polars
           suffix: suffix,
           tolerance: tolerance,
           allow_parallel: allow_parallel,
-          force_parallel: force_parallel
+          force_parallel: force_parallel,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
@@ -2277,8 +2343,20 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param validate ['m:m', 'm:1', '1:m', '1:1']
+    #   Checks if join is of specified type.
+    #     * *many_to_many* - “m:m”: default, does not result in checks
+    #     * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
+    #     * *one_to_many* - “1:m”: check if join keys are unique in left dataset
+    #     * *many_to_one* - “m:1”: check if join keys are unique in right dataset
     # @param join_nulls [Boolean]
     #   Join on null values. By default null values will never produce matches.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #     - nil: -> join specific.
+    #     - true: -> Always coalesce join columns.
+    #     - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2361,7 +2439,16 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8.0 ┆ c   │
     #   # └─────┴─────┴─────┘
-    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
+    def join(other,
+      left_on: nil,
+      right_on: nil,
+      on: nil,
+      how: "inner",
+      suffix: "_right",
+      validate: "m:m",
+      join_nulls: false,
+      coalesce: nil
+    )
       lazy
         .join(
           other.lazy,
@@ -2370,7 +2457,9 @@ module Polars
           on: on,
           how: how,
           suffix: suffix,
-          join_nulls: join_nulls
+          validate: validate,
+          join_nulls: join_nulls,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
@@ -2426,15 +2515,15 @@ module Polars
     #   df.map_rows { |t| t[0] * 2 + t[1] }
     #   # =>
     #   # shape: (3, 1)
-    #   # ┌───────┐
-    #   # │ apply │
-    #   # │ ---   │
-    #   # │ i64   │
-    #   # ╞═══════╡
-    #   # │ 1     │
-    #   # │ 9     │
-    #   # │ 14    │
-    #   # └───────┘
+    #   # ┌─────┐
+    #   # │ map │
+    #   # │ --- │
+    #   # │ i64 │
+    #   # ╞═════╡
+    #   # │ 1   │
+    #   # │ 9   │
+    #   # │ 14  │
+    #   # └─────┘
     def map_rows(return_dtype: nil, inference_size: 256, &f)
       out, is_df = _df.map_rows(f, return_dtype, inference_size)
       if is_df
@@ -2717,10 +2806,85 @@ module Polars
     #   Column to drop.
     #
     # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6, 7, 8],
+    #       "ham" => ["a", "b", "c"]
+    #     }
+    #   )
+    #   df.delete("ham")
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'ham' [str]
+    #   # [
+    #   #         "a"
+    #   #         "b"
+    #   #         "c"
+    #   # ]
+    #
+    # @example
+    #   df.delete("missing")
+    #   # => nil
     def delete(name)
       drop_in_place(name) if include?(name)
     end
+    # Cast DataFrame column(s) to the specified dtype(s).
+    #
+    # @param dtypes [Object]
+    #   Mapping of column names (or selector) to dtypes, or a single dtype
+    #   to which all columns will be cast.
+    # @param strict [Boolean]
+    #   Throw an error if a cast could not be done (for instance, due to an
+    #   overflow).
+    #
+    # @return [DataFrame]
+    #
+    # @example Cast specific frame columns to the specified dtypes:
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6.0, 7.0, 8.0],
+    #       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+    #     }
+    #   )
+    #   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬────────────┐
+    #   # │ foo ┆ bar ┆ ham        │
+    #   # │ --- ┆ --- ┆ ---        │
+    #   # │ f32 ┆ u8  ┆ date       │
+    #   # ╞═════╪═════╪════════════╡
+    #   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+    #   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+    #   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+    #   # └─────┴─────┴────────────┘
+    #
+    # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+    #   df.cast({Polars::Date => Polars::Datetime})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬─────────────────────┐
+    #   # │ foo ┆ bar ┆ ham                 │
+    #   # │ --- ┆ --- ┆ ---                 │
+    #   # │ i64 ┆ f64 ┆ datetime[μs]        │
+    #   # ╞═════╪═════╪═════════════════════╡
+    #   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+    #   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+    #   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+    #   # └─────┴─────┴─────────────────────┘
+    #
+    # @example Cast all frame columns to the specified dtype:
+    #   df.cast(Polars::String).to_h(as_series: false)
+    #   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+    def cast(dtypes, strict: true)
+      lazy.cast(dtypes, strict: strict).collect(_eager: true)
+    end
     # Create an empty copy of the current DataFrame.
     #
     # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +2939,57 @@ module Polars
     # Get the DataFrame as a Array of Series.
     #
     # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (3,)
+    #   # Series: 'foo' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   # ], shape: (3,)
+    #   # Series: 'bar' [i64]
+    #   # [
+    #   #         4
+    #   #         5
+    #   #         6
+    #   # ]]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 2, 3, 4],
+    #       "b" => [0.5, 4, 10, 13],
+    #       "c" => [true, true, false, true]
+    #     }
+    #   )
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (4,)
+    #   # Series: 'a' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   #         4
+    #   # ], shape: (4,)
+    #   # Series: 'b' [f64]
+    #   # [
+    #   #         0.5
+    #   #         4.0
+    #   #         10.0
+    #   #         13.0
+    #   # ], shape: (4,)
+    #   # Series: 'c' [bool]
+    #   # [
+    #   #         true
+    #   #         true
+    #   #         false
+    #   #         true
+    #   # ]]
     def get_columns
       _df.get_columns.map { |s| Utils.wrap_s(s) }
     end
@@ -3083,7 +3298,7 @@ module Polars
     #       "c" => [2, 4, 6]
     #     }
     #   )
-    #   df.unpivot(Polars::Selectors.numeric, index: "a")
+    #   df.unpivot(Polars.cs.numeric, index: "a")
     #   # =>
     #   # shape: (6, 3)
     #   # ┌─────┬──────────┬───────┐
@@ -4234,7 +4449,7 @@ module Polars
       if n.nil? && !frac.nil?
         frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
-        _from_rbdf(
+        return _from_rbdf(
           _df.sample_frac(frac._s, with_replacement, shuffle, seed)
         )
       end
@@ -4296,7 +4511,7 @@ module Polars
     # @example A horizontal string concatenation:
     #   df = Polars::DataFrame.new(
     #     {
-    #       "a" => ["foo", "bar", 2],
+    #       "a" => ["foo", "bar", nil],
     #       "b" => [1, 2, 3],
     #       "c" => [1.0, 2.0, 3.0]
     #     }
@@ -4327,11 +4542,11 @@ module Polars
     #   #         true
     #   #         true
     #   # ]
-    def fold(&operation)
+    def fold
       acc = to_series(0)
       1.upto(width - 1) do |i|
-        acc = operation.call(acc, to_series(i))
+        acc = yield(acc, to_series(i))
       end
       acc
     end
@@ -4843,7 +5058,7 @@ module Polars
     end
     # @private
-    def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+    def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
       updated_data = {}
       unless data.empty?
         dtypes = schema_overrides || {}
@@ -4852,23 +5067,23 @@ module Polars
           data.each do |name, val|
             dtype = dtypes[name]
             if val.is_a?(Hash) && dtype != Struct
-              updated_data[name] = DataFrame.new(val).to_struct(name)
+              updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
             elsif !Utils.arrlen(val).nil?
-              updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+              updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
             elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
               dtype = Polars::Float64 if val.nil? && dtype.nil?
-              updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+              updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
             else
               raise Todo
             end
           end
         elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
           data.each do |name, val|
-            updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+            updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
           end
         elsif data.values.all? { |val| Utils.arrlen(val).nil? }
           data.each do |name, val|
-            updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+            updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
           end
         end
       end
@@ -4876,7 +5091,7 @@ module Polars
     end
     # @private
-    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
       if schema.is_a?(Hash) && !data.empty?
         if !data.all? { |col, _| schema[col] }
           raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5108,9 @@ module Polars
       end
       if data.empty? && !schema_overrides.empty?
-        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
       else
-        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
       end
       data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5184,7 @@ module Polars
       end
     end
-    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
       rbdf_columns = rbdf.columns
       rbdf_dtypes = rbdf.dtypes
       columns, dtypes = _unpack_schema(
@@ -4985,13 +5200,13 @@ module Polars
       end
       column_casts = []
-      columns.each do |col, i|
+      columns.each_with_index do |col, i|
         if dtypes[col] == Categorical # != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(Categorical)._rbexpr
+          column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
         elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(structs[col])._rbexpr
+          column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
         elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+          column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
         end
       end
@@ -5010,12 +5225,11 @@ module Polars
     end
     # @private
-    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
-      raise Todo if schema_overrides
+    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
       columns = schema
       if data.length == 0
-        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
+        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
       end
       if data[0].is_a?(Series)
@@ -5028,7 +5242,7 @@ module Polars
       elsif data[0].is_a?(Hash)
         column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
+        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
@@ -5048,7 +5262,7 @@ module Polars
             schema, schema_overrides: schema_overrides, n_expected: first_element.length
           )
           local_schema_override = (
-            schema_overrides.any? ? (raise Todo) : {}
+            schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
           )
           if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
             raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5270,11 @@ module Polars
           unpack_nested = false
           local_schema_override.each do |col, tp|
-            raise Todo
+            if [Categorical, Enum].include?(tp)
+              local_schema_override[col] = String
+            elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+              raise Todo
+            end
           end
           if unpack_nested
@@ -5070,7 +5288,7 @@ module Polars
           end
           if column_names.any? || schema_overrides.any?
             rbdf = _post_apply_columns(
-              rbdf, column_names, schema_overrides: schema_overrides
+              rbdf, column_names, schema_overrides: schema_overrides, strict: strict
             )
           end
           return rbdf
@@ -5080,7 +5298,7 @@ module Polars
           )
           data_series =
             data.map.with_index do |element, i|
-              Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+              Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
             end
           return RbDataFrame.new(data_series)
         else
@@ -5093,7 +5311,12 @@ module Polars
     end
     # @private
-    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
+    def self._include_unknowns(schema, cols)
+      cols.to_h { |col| [col, schema[col] || Unknown] }
+    end
+    # @private
+    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
       data_series = [data._s]
       series_name = data_series.map(&:name)
       column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5325,7 @@ module Polars
       if schema_overrides.any?
         new_dtype = schema_overrides.values[0]
         if new_dtype != data.dtype
-          data_series[0] = data_series[0].cast(new_dtype, true)
+          data_series[0] = data_series[0].cast(new_dtype, strict)
         end
       end

data/lib/polars/data_type_group.rb ADDED

@@ -0,0 +1,28 @@
+module Polars
+  class DataTypeGroup < Set
+  end
+  SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+    [
+      Int8,
+      Int16,
+      Int32,
+      Int64
+    ]
+  )
+  UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+    [
+      UInt8,
+      UInt16,
+      UInt32,
+      UInt64
+    ]
+  )
+  INTEGER_DTYPES = (
+    SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
+  )
+  FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
+  NUMERIC_DTYPES = DataTypeGroup.new(
+    FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
+  )
+end

data/lib/polars/data_types.rb CHANGED

@@ -292,6 +292,8 @@ module Polars
   # A categorical encoding of a set of strings.
   class Categorical < DataType
+    attr_reader :ordering
     def initialize(ordering = "physical")
       @ordering = ordering
     end