RubyGems - polars-df - Versions diffs - 0.7.0-x86_64-linux → 0.9.0-x86_64-linux - Mend

polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +41 -0
data/Cargo.lock +353 -237
data/Cargo.toml +0 -3
data/LICENSE-THIRD-PARTY.txt +1978 -1459
data/LICENSE.txt +1 -1
data/README.md +2 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
data/lib/polars/array_expr.rb +449 -0
data/lib/polars/array_name_space.rb +346 -0
data/lib/polars/cat_expr.rb +24 -0
data/lib/polars/cat_name_space.rb +75 -0
data/lib/polars/config.rb +2 -2
data/lib/polars/data_frame.rb +248 -108
data/lib/polars/data_types.rb +195 -29
data/lib/polars/date_time_expr.rb +41 -24
data/lib/polars/date_time_name_space.rb +12 -12
data/lib/polars/exceptions.rb +12 -1
data/lib/polars/expr.rb +1080 -195
data/lib/polars/functions/aggregation/horizontal.rb +246 -0
data/lib/polars/functions/aggregation/vertical.rb +282 -0
data/lib/polars/functions/as_datatype.rb +248 -0
data/lib/polars/functions/col.rb +47 -0
data/lib/polars/functions/eager.rb +182 -0
data/lib/polars/functions/lazy.rb +1280 -0
data/lib/polars/functions/len.rb +49 -0
data/lib/polars/functions/lit.rb +35 -0
data/lib/polars/functions/random.rb +16 -0
data/lib/polars/functions/range/date_range.rb +103 -0
data/lib/polars/functions/range/int_range.rb +51 -0
data/lib/polars/functions/repeat.rb +144 -0
data/lib/polars/functions/whenthen.rb +27 -0
data/lib/polars/functions.rb +29 -416
data/lib/polars/group_by.rb +3 -3
data/lib/polars/io.rb +21 -28
data/lib/polars/lazy_frame.rb +390 -76
data/lib/polars/list_expr.rb +152 -6
data/lib/polars/list_name_space.rb +102 -0
data/lib/polars/meta_expr.rb +175 -7
data/lib/polars/series.rb +557 -59
data/lib/polars/sql_context.rb +1 -1
data/lib/polars/string_cache.rb +75 -0
data/lib/polars/string_expr.rb +412 -96
data/lib/polars/string_name_space.rb +4 -4
data/lib/polars/struct_expr.rb +1 -1
data/lib/polars/struct_name_space.rb +1 -1
data/lib/polars/testing.rb +507 -0
data/lib/polars/utils.rb +64 -20
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +15 -2
metadata +36 -7
data/lib/polars/lazy_functions.rb +0 -1197

data/lib/polars/struct_expr.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Polars
     #
     # @return [Expr]
     def [](item)
-      if item.is_a?(String)
+      if item.is_a?(::String)
         field(item)
       elsif item.is_a?(Integer)
         Utils.wrap_expr(_rbexpr.struct_field_by_index(item))

data/lib/polars/struct_name_space.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Polars
     def [](item)
       if item.is_a?(Integer)
         field(fields[item])
-      elsif item.is_a?(String)
+      elsif item.is_a?(::String)
         field(item)
       else
         raise ArgumentError, "expected type Integer or String, got #{item.class.name}"

data/lib/polars/testing.rb ADDED Viewed

@@ -0,0 +1,507 @@
+module Polars
+  module Testing
+    # Assert that the left and right frame are equal.
+    #
+    # Raises a detailed `AssertionError` if the frames differ.
+    # This function is intended for use in unit tests.
+    #
+    # @param left [Object]
+    #   The first DataFrame or LazyFrame to compare.
+    # @param right [Object]
+    #   The second DataFrame or LazyFrame to compare.
+    # @param check_row_order [Boolean]
+    #   Require row order to match.
+    # @param check_column_order [Boolean]
+    #   Require column order to match.
+    # @param check_dtype [Boolean]
+    #   Require data types to match.
+    # @param check_exact [Boolean]
+    #   Require float values to match exactly. If set to `false`, values are considered
+    #   equal when within tolerance of each other (see `rtol` and `atol`).
+    #   Only affects columns with a Float data type.
+    # @param rtol [Float]
+    #   Relative tolerance for inexact checking. Fraction of values in `right`.
+    # @param atol [Float]
+    #   Absolute tolerance for inexact checking.
+    # @param categorical_as_str [Boolean]
+    #   Cast categorical columns to string before comparing. Enabling this helps
+    #   compare columns that do not share the same string cache.
+    #
+    # @return [nil]
+    def assert_frame_equal(
+      left,
+      right,
+      check_row_order: true,
+      check_column_order: true,
+      check_dtype: true,
+      check_exact: false,
+      rtol: 1e-5,
+      atol: 1e-8,
+      categorical_as_str: false
+    )
+      lazy = _assert_correct_input_type(left, right)
+      objects = lazy ? "LazyFrames" : "DataFrames"
+      _assert_frame_schema_equal(
+        left,
+        right,
+        check_column_order: check_column_order,
+        check_dtype: check_dtype,
+        objects: objects,
+      )
+      if lazy
+        left, right = left.collect, right.collect
+      end
+      if left.height != right.height
+        raise_assertion_error(
+          objects, "number of rows does not match", left.height, right.height
+        )
+      end
+      if !check_row_order
+        left, right = _sort_dataframes(left, right)
+      end
+      left.columns.each do |c|
+        s_left, s_right = left.get_column(c), right.get_column(c)
+        begin
+          _assert_series_values_equal(
+            s_left,
+            s_right,
+            check_exact: check_exact,
+            rtol: rtol,
+            atol: atol,
+            categorical_as_str: categorical_as_str
+          )
+        rescue AssertionError
+          raise_assertion_error(
+            objects,
+            "value mismatch for column #{c.inspect}",
+            s_left.to_a,
+            s_right.to_a
+          )
+        end
+      end
+    end
+    # Assert that the left and right frame are **not** equal.
+    #
+    # This function is intended for use in unit tests.
+    #
+    # @param left [Object]
+    #   The first DataFrame or LazyFrame to compare.
+    # @param right [Object]
+    #   The second DataFrame or LazyFrame to compare.
+    # @param check_row_order [Boolean]
+    #   Require row order to match.
+    # @param check_column_order [Boolean]
+    #   Require column order to match.
+    # @param check_dtype [Boolean]
+    #   Require data types to match.
+    # @param check_exact [Boolean]
+    #   Require float values to match exactly. If set to `false`, values are considered
+    #   equal when within tolerance of each other (see `rtol` and `atol`).
+    #   Only affects columns with a Float data type.
+    # @param rtol [Float]
+    #   Relative tolerance for inexact checking. Fraction of values in `right`.
+    # @param atol [Float]
+    #   Absolute tolerance for inexact checking.
+    # @param categorical_as_str [Boolean]
+    #   Cast categorical columns to string before comparing. Enabling this helps
+    #   compare columns that do not share the same string cache.
+    #
+    # @return [nil]
+    def assert_frame_not_equal(
+      left,
+      right,
+      check_row_order: true,
+      check_column_order: true,
+      check_dtype: true,
+      check_exact: false,
+      rtol: 1e-5,
+      atol: 1e-8,
+      categorical_as_str: false
+    )
+      begin
+        assert_frame_equal(
+          left,
+          right,
+          check_column_order: check_column_order,
+          check_row_order: check_row_order,
+          check_dtype: check_dtype,
+          check_exact: check_exact,
+          rtol: rtol,
+          atol: atol,
+          categorical_as_str: categorical_as_str
+        )
+      rescue AssertionError
+        return
+      end
+      msg = "frames are equal"
+      raise AssertionError, msg
+    end
+    # Assert that the left and right Series are equal.
+    #
+    # Raises a detailed `AssertionError` if the Series differ.
+    # This function is intended for use in unit tests.
+    #
+    # @param left [Object]
+    #   The first Series to compare.
+    # @param right [Object]
+    #   The second Series to compare.
+    # @param check_dtype [Boolean]
+    #   Require data types to match.
+    # @param check_names [Boolean]
+    #   Require names to match.
+    # @param check_exact [Boolean]
+    #   Require float values to match exactly. If set to `false`, values are considered
+    #   equal when within tolerance of each other (see `rtol` and `atol`).
+    #   Only affects columns with a Float data type.
+    # @param rtol [Float]
+    #   Relative tolerance for inexact checking, given as a fraction of the values in
+    #   `right`.
+    # @param atol [Float]
+    #   Absolute tolerance for inexact checking.
+    # @param categorical_as_str [Boolean]
+    #   Cast categorical columns to string before comparing. Enabling this helps
+    #   compare columns that do not share the same string cache.
+    #
+    # @return [nil]
+    def assert_series_equal(
+      left,
+      right,
+      check_dtype: true,
+      check_names: true,
+      check_exact: false,
+      rtol: 1e-5,
+      atol: 1e-8,
+      categorical_as_str: false
+    )
+      if !(left.is_a?(Series) && right.is_a?(Series))
+        raise_assertion_error(
+          "inputs",
+          "unexpected input types",
+          left.class.name,
+          right.class.name
+        )
+      end
+      if left.len != right.len
+        raise_assertion_error("Series", "length mismatch", left.len, right.len)
+      end
+      if check_names && left.name != right.name
+        raise_assertion_error("Series", "name mismatch", left.name, right.name)
+      end
+      if check_dtype && left.dtype != right.dtype
+        raise_assertion_error("Series", "dtype mismatch", left.dtype, right.dtype)
+      end
+      _assert_series_values_equal(
+        left,
+        right,
+        check_exact: check_exact,
+        rtol: rtol,
+        atol: atol,
+        categorical_as_str: categorical_as_str
+      )
+    end
+    # Assert that the left and right Series are **not** equal.
+    #
+    # This function is intended for use in unit tests.
+    #
+    # @param left [Object]
+    #   The first Series to compare.
+    # @param right [Object]
+    #   The second Series to compare.
+    # @param check_dtype [Boolean]
+    #   Require data types to match.
+    # @param check_names [Boolean]
+    #   Require names to match.
+    # @param check_exact [Boolean]
+    #   Require float values to match exactly. If set to `false`, values are considered
+    #   equal when within tolerance of each other (see `rtol` and `atol`).
+    #   Only affects columns with a Float data type.
+    # @param rtol [Float]
+    #   Relative tolerance for inexact checking, given as a fraction of the values in
+    #   `right`.
+    # @param atol [Float]
+    #   Absolute tolerance for inexact checking.
+    # @param categorical_as_str [Boolean]
+    #   Cast categorical columns to string before comparing. Enabling this helps
+    #   compare columns that do not share the same string cache.
+    #
+    # @return [nil]
+    def assert_series_not_equal(
+      left,
+      right,
+      check_dtype: true,
+      check_names: true,
+      check_exact: false,
+      rtol: 1e-5,
+      atol: 1e-8,
+      categorical_as_str: false
+    )
+      begin
+        assert_series_equal(
+          left,
+          right,
+          check_dtype: check_dtype,
+          check_names: check_names,
+          check_exact: check_exact,
+          rtol: rtol,
+          atol: atol,
+          categorical_as_str: categorical_as_str
+        )
+      rescue AssertionError
+        return
+      end
+      msg = "Series are equal"
+      raise AssertionError, msg
+    end
+    private
+    def _assert_correct_input_type(left, right)
+      if left.is_a?(DataFrame) && right.is_a?(DataFrame)
+        return false
+      elsif left.is_a?(LazyFrame) && right.is_a?(DataFrame)
+        return true
+      else
+        raise_assertion_error(
+          "inputs",
+          "unexpected input types",
+          left.class.name,
+          right.class.name
+        )
+      end
+    end
+    def _assert_frame_schema_equal(
+      left,
+      right,
+      check_dtype:,
+      check_column_order:,
+      objects:
+    )
+      left_schema, right_schema = left.schema, right.schema
+      # Fast path for equal frames
+      if left_schema == right_schema
+        return
+      end
+      # Special error message for when column names do not match
+      if left_schema.keys != right_schema.keys
+        if (left_not_right = right_schema.keys - left_schema.keys).any?
+          msg = "columns #{left_not_right.inspect} in left #{objects[..-1]}, but not in right"
+          raise AssertionError, msg
+        else
+          right_not_left = right_schema.keys - left_schema.keys
+          msg = "columns #{right_not_left.inspect} in right #{objects[..-1]}, but not in left"
+          raise AssertionError, msg
+        end
+      end
+      if check_column_order
+        left_columns, right_columns = left_schema.keys, right_schema.keys
+        if left_columns != right_columns
+          detail = "columns are not in the same order"
+          raise_assertion_error(objects, detail, left_columns, right_columns)
+        end
+      end
+      if check_dtype
+        left_schema_dict, right_schema_dict = left_schema.to_h, right_schema.to_h
+        if check_column_order || left_schema_dict != right_schema_dict
+          detail = "dtypes do not match"
+          raise_assertion_error(objects, detail, left_schema_dict, right_schema_dict)
+        end
+      end
+    end
+    def _sort_dataframes(left, right)
+      by = left.columns
+      begin
+        left = left.sort(by)
+        right = right.sort(by)
+      rescue
+        msg = "cannot set `check_row_order: false` on frame with unsortable columns"
+        raise InvalidAssert, msg
+      end
+      [left, right]
+    end
+    def _assert_series_values_equal(
+      left,
+      right,
+      check_exact:,
+      rtol:,
+      atol:,
+      categorical_as_str:
+    )
+      if categorical_as_str
+        if left.dtype == Categorical
+          left = left.cast(String)
+        end
+        if right.dtype == Categorical
+          right = right.cast(String)
+        end
+      end
+      # Determine unequal elements
+      begin
+        unequal = left.ne_missing(right)
+      rescue
+        raise_assertion_error(
+          "Series",
+          "incompatible data types",
+          left.dtype,
+          right.dtype
+        )
+      end
+      # Check nested dtypes in separate function
+      if _comparing_nested_floats(left.dtype, right.dtype)
+        begin
+          _assert_series_nested_values_equal(
+            left: left.filter(unequal),
+            right: right.filter(unequal),
+            check_exact: check_exact,
+            rtol: rtol,
+            atol: atol,
+            categorical_as_str: categorical_as_str
+          )
+        rescue AssertionError
+          raise_assertion_error(
+            "Series",
+            "nested value mismatch",
+            left.to_a,
+            right.to_a
+          )
+        else
+          return
+        end
+      end
+      # If no differences found during exact checking, we're done
+      if !unequal.any
+        return
+      end
+      # Only do inexact checking for float types
+      if check_exact || !left.dtype.float? || !right.dtype.float?
+        raise_assertion_error(
+          "Series", "exact value mismatch", left.to_a, right.to_a
+        )
+      end
+      _assert_series_null_values_match(left, right)
+      _assert_series_nan_values_match(left, right)
+      _assert_series_values_within_tolerance(
+        left,
+        right,
+        unequal,
+        rtol: rtol,
+        atol: atol
+      )
+    end
+    def _assert_series_nested_values_equal(
+      left,
+      right,
+      check_exact:,
+      rtol:,
+      atol:,
+      categorical_as_str:
+    )
+      # compare nested lists element-wise
+      if _comparing_lists(left.dtype, right.dtype)
+        left.zip(right) do |s1, s2|
+          if s1.nil? || s2.nil?
+            raise_assertion_error("Series", "nested value mismatch", s1, s2)
+          end
+          _assert_series_values_equal(
+            s1,
+            s2,
+            check_exact: check_exact,
+            rtol: rtol,
+            atol: atol,
+            categorical_as_str: categorical_as_str
+          )
+        end
+      # unnest structs as series and compare
+      else
+        ls, rs = left.struct.unnest, right.struct.unnest
+        ls.zip(rs) do |s1, s2|
+          _assert_series_values_equal(
+            s1,
+            s2,
+            check_exact: check_exact,
+            rtol: rtol,
+            atol: atol,
+            categorical_as_str: categorical_as_str
+          )
+        end
+      end
+    end
+    def _assert_series_null_values_match(left, right)
+      null_value_mismatch = left.is_null != right.is_null
+      if null_value_mismatch.any
+        raise_assertion_error(
+          "Series", "null value mismatch", left.to_a, right.to_a
+        )
+      end
+    end
+    def _assert_series_nan_values_match(left, right)
+      if !_comparing_floats(left.dtype, right.dtype)
+        return
+      end
+      nan_value_mismatch = left.is_nan != right.is_nan
+      if nan_value_mismatch.any
+        raise_assertion_error(
+          "Series",
+          "nan value mismatch",
+          left.to_a,
+          right.to_a
+        )
+      end
+    end
+    def _comparing_floats(left, right)
+      left.is_float && right.is_float
+    end
+    def _comparing_lists(left, right)
+      [List, Array].include?(left) && [List, Array].include?(right)
+    end
+    def _comparing_structs(left, right)
+      left == Struct && right == Struct
+    end
+    def _comparing_nested_floats(left, right)
+      if !(_comparing_lists(left, right) || _comparing_structs(left, right))
+        return false
+      end
+      left.float? && right.float?
+    end
+    def raise_assertion_error(objects, detail, left, right)
+      msg = "#{objects} are different (#{detail})\n[left]:  #{left}\n[right]: #{right}"
+      raise AssertionError, msg
+    end
+  end
+end

data/lib/polars/utils.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Polars
       if obj.is_a?(Range)
         # size only works for numeric ranges
         obj.to_a.length
-      elsif obj.is_a?(String)
+      elsif obj.is_a?(::String)
         nil
       else
         obj.length
@@ -116,7 +116,7 @@ module Polars
     end
     def self.selection_to_rbexpr_list(exprs)
-      if exprs.is_a?(String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
+      if exprs.is_a?(::String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
         exprs = [exprs]
       end
@@ -124,9 +124,9 @@ module Polars
     end
     def self.expr_to_lit_or_expr(expr, str_to_lit: true)
-      if (expr.is_a?(String) || expr.is_a?(Symbol)) && !str_to_lit
+      if (expr.is_a?(::String) || expr.is_a?(Symbol)) && !str_to_lit
         col(expr)
-      elsif expr.is_a?(Integer) || expr.is_a?(Float) || expr.is_a?(String) || expr.is_a?(Symbol) || expr.is_a?(Series) || expr.nil?
+      elsif expr.is_a?(Integer) || expr.is_a?(Float) || expr.is_a?(::String) || expr.is_a?(Symbol) || expr.is_a?(Series) || expr.nil?
         lit(expr)
       elsif expr.is_a?(Expr)
         expr
@@ -152,7 +152,7 @@ module Polars
       if data_type == Unknown
         return include_unknown
       end
-      data_type.is_a?(Symbol) || data_type.is_a?(String) || data_type.is_a?(DataType) || (data_type.is_a?(Class) && data_type < DataType)
+      data_type.is_a?(Symbol) || data_type.is_a?(::String) || data_type.is_a?(DataType) || (data_type.is_a?(Class) && data_type < DataType)
     end
     def self.map_rb_type_to_dtype(ruby_dtype)
@@ -160,7 +160,7 @@ module Polars
         Float64
       elsif ruby_dtype == Integer
         Int64
-      elsif ruby_dtype == String
+      elsif ruby_dtype == ::String
         Utf8
       elsif ruby_dtype == TrueClass || ruby_dtype == FalseClass
         Boolean
@@ -187,7 +187,7 @@ module Polars
       begin
         map_rb_type_to_dtype(data_type)
       rescue TypeError
-        raise ArgumentError, "Conversion of Ruby data type #{data_type} to Polars data type not implemented."
+        raise ArgumentError, "Conversion of Ruby data type #{data_type.inspect} to Polars data type not implemented."
       end
     end
@@ -211,7 +211,7 @@ module Polars
       projection = nil
       if columns
         raise Todo
-        # if columns.is_a?(String) || columns.is_a?(Symbol)
+        # if columns.is_a?(::String) || columns.is_a?(Symbol)
         #   columns = [columns]
         # elsif is_int_sequence(columns)
         #   projection = columns.to_a
@@ -243,11 +243,11 @@ module Polars
     end
     def self.strlike?(value)
-      value.is_a?(String) || value.is_a?(Symbol)
+      value.is_a?(::String) || value.is_a?(Symbol)
     end
     def self.pathlike?(value)
-      value.is_a?(String) || (defined?(Pathname) && value.is_a?(Pathname))
+      value.is_a?(::String) || (defined?(Pathname) && value.is_a?(Pathname))
     end
     def self._is_iterable_of(val, eltype)
@@ -275,10 +275,10 @@ module Polars
     end
     def self.is_str_sequence(val, allow_str: false)
-      if allow_str == false && val.is_a?(String)
+      if allow_str == false && val.is_a?(::String)
         false
       else
-        val.is_a?(::Array) && _is_iterable_of(val, String)
+        val.is_a?(::Array) && _is_iterable_of(val, ::String)
       end
     end
@@ -286,20 +286,51 @@ module Polars
       Dir.glob(file).any?
     end
-    def self.parse_as_expression(input, str_as_lit: false, structify: false)
+    def self.parse_as_list_of_expressions(*inputs, __structify: false, **named_inputs)
+      exprs = _parse_positional_inputs(inputs, structify: __structify)
+      if named_inputs.any?
+        named_exprs = _parse_named_inputs(named_inputs, structify: __structify)
+        exprs.concat(named_exprs)
+      end
+      exprs
+    end
+    def self._parse_positional_inputs(inputs, structify: false)
+      inputs_iter = _parse_inputs_as_iterable(inputs)
+      inputs_iter.map { |e| parse_as_expression(e, structify: structify) }
+    end
+    def self._parse_inputs_as_iterable(inputs)
+      if inputs.empty?
+        return []
+      end
+      if inputs.length == 1 && inputs[0].is_a?(::Array)
+        return inputs[0]
+      end
+      inputs
+    end
+    def self._parse_named_inputs(named_inputs, structify: false)
+      named_inputs.map do |name, input|
+        parse_as_expression(input, structify: structify)._alias(name.to_s)
+      end
+    end
+    def self.parse_as_expression(input, str_as_lit: false, list_as_lit: true, structify: false, dtype: nil)
       if input.is_a?(Expr)
         expr = input
-      elsif input.is_a?(String) && !str_as_lit
+      elsif input.is_a?(::String) && !str_as_lit
         expr = Polars.col(input)
         structify = false
-      elsif [Integer, Float, String, Series, ::Date, ::Time, ::DateTime].any? { |cls| input.is_a?(cls) } || input.nil?
-        expr = Polars.lit(input)
-        structify = false
-      elsif input.is_a?(Array)
-        expr = Polars.lit(Polars::Series.new("", [input]))
+      elsif input.is_a?(::Array) && !list_as_lit
+        expr = Polars.lit(Series.new(input), dtype: dtype)
         structify = false
       else
-        raise TypeError, "did not expect value #{input} of type #{input.class.name}, maybe disambiguate with pl.lit or pl.col"
+        expr = Polars.lit(input, dtype: dtype)
+        structify = false
       end
       if structify
@@ -320,5 +351,18 @@ module Polars
       end
       ambiguous
     end
+    def self._check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+      if arg.is_a?(::String)
+        arg_byte_length = arg.bytesize
+        if can_be_empty
+          if arg_byte_length > 1
+            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+          end
+        elsif arg_byte_length != 1
+          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+        end
+      end
+    end
   end
 end

data/lib/polars/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Polars
   # @private
-  VERSION = "0.7.0"
+  VERSION = "0.9.0"
 end