RubyGems - polars-df - Versions diffs - 0.8.0-x86_64-linux → 0.9.0-x86_64-linux - Mend

polars-df 0.8.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +4 -4
data/CHANGELOG.md +30 -1
data/Cargo.lock +107 -59
data/Cargo.toml +0 -3
data/LICENSE-THIRD-PARTY.txt +1726 -754
data/LICENSE.txt +1 -1
data/README.md +2 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/array_expr.rb +449 -0
data/lib/polars/array_name_space.rb +346 -0
data/lib/polars/cat_expr.rb +24 -0
data/lib/polars/cat_name_space.rb +75 -0
data/lib/polars/config.rb +2 -2
data/lib/polars/data_frame.rb +179 -43
data/lib/polars/data_types.rb +191 -28
data/lib/polars/date_time_expr.rb +31 -14
data/lib/polars/exceptions.rb +12 -1
data/lib/polars/expr.rb +866 -186
data/lib/polars/functions/aggregation/horizontal.rb +246 -0
data/lib/polars/functions/aggregation/vertical.rb +282 -0
data/lib/polars/functions/as_datatype.rb +248 -0
data/lib/polars/functions/col.rb +47 -0
data/lib/polars/functions/eager.rb +182 -0
data/lib/polars/functions/lazy.rb +1280 -0
data/lib/polars/functions/len.rb +49 -0
data/lib/polars/functions/lit.rb +35 -0
data/lib/polars/functions/random.rb +16 -0
data/lib/polars/functions/range/date_range.rb +103 -0
data/lib/polars/functions/range/int_range.rb +51 -0
data/lib/polars/functions/repeat.rb +144 -0
data/lib/polars/functions/whenthen.rb +27 -0
data/lib/polars/functions.rb +29 -416
data/lib/polars/group_by.rb +2 -2
data/lib/polars/io.rb +18 -25
data/lib/polars/lazy_frame.rb +367 -53
data/lib/polars/list_expr.rb +152 -6
data/lib/polars/list_name_space.rb +102 -0
data/lib/polars/meta_expr.rb +175 -7
data/lib/polars/series.rb +273 -34
data/lib/polars/string_cache.rb +75 -0
data/lib/polars/string_expr.rb +412 -96
data/lib/polars/string_name_space.rb +4 -4
data/lib/polars/testing.rb +507 -0
data/lib/polars/utils.rb +52 -8
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +15 -2
metadata +33 -4
data/lib/polars/lazy_functions.rb +0 -1181

data/lib/polars/functions.rb CHANGED

@@ -13,432 +13,45 @@ module Polars
       df.to_dummies(columns: columns)
     end
-    # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
+    # Aggregate to list.
     #
-    # @param items [Object]
-    #   DataFrames/Series/LazyFrames to concatenate.
-    # @param rechunk [Boolean]
-    #   Make sure that all data is in contiguous memory.
-    # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
-    #   LazyFrames do not support the `horizontal` strategy.
-    #
-    #   - Vertical: applies multiple `vstack` operations.
-    #   - Diagonal: finds a union between the column schemas and fills missing column values with null.
-    #   - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
-    # @param parallel [Boolean]
-    #   Only relevant for LazyFrames. This determines if the concatenated
-    #   lazy computations may be executed in parallel.
-    #
-    # @return [Object]
-    #
-    # @example
-    #   df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
-    #   df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
-    #   Polars.concat([df1, df2])
-    #   # =>
-    #   # shape: (2, 2)
-    #   # ┌─────┬─────┐
-    #   # │ a   ┆ b   │
-    #   # │ --- ┆ --- │
-    #   # │ i64 ┆ i64 │
-    #   # ╞═════╪═════╡
-    #   # │ 1   ┆ 3   │
-    #   # │ 2   ┆ 4   │
-    #   # └─────┴─────┘
-    def concat(items, rechunk: true, how: "vertical", parallel: true)
-      if items.empty?
-        raise ArgumentError, "cannot concat empty list"
-      end
-      first = items[0]
-      if first.is_a?(DataFrame)
-        if how == "vertical"
-          out = Utils.wrap_df(_concat_df(items))
-        elsif how == "diagonal"
-          out = Utils.wrap_df(_concat_df_diagonal(items))
-        elsif how == "horizontal"
-          out = Utils.wrap_df(_concat_df_horizontal(items))
-        else
-          raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
-        end
-      elsif first.is_a?(LazyFrame)
-        if how == "vertical"
-          return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
-        elsif how == "vertical_relaxed"
-          return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
-        elsif how == "diagonal"
-          return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
-        else
-          raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
-        end
-      elsif first.is_a?(Series)
-        # TODO
-        out = Utils.wrap_s(_concat_series(items))
-      elsif first.is_a?(Expr)
-        out = first
-        items[1..-1].each do |e|
-          out = out.append(e)
-        end
-      else
-        raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
-      end
-      if rechunk
-        out.rechunk
-      else
-        out
-      end
-    end
-    # Create a range of type `Datetime` (or `Date`).
-    #
-    # @param start [Object]
-    #   Lower bound of the date range.
-    # @param stop [Object]
-    #   Upper bound of the date range.
-    # @param interval [Object]
-    #   Interval periods. It can be a polars duration string, such as `3d12h4m25s`
-    #   representing 3 days, 12 hours, 4 minutes, and 25 seconds.
-    # @param lazy [Boolean]
-    #   Return an expression.
-    # @param closed ["both", "left", "right", "none"]
-    #   Define whether the temporal window interval is closed or not.
-    # @param name [String]
-    #   Name of the output Series.
-    # @param time_unit [nil, "ns", "us", "ms"]
-    #   Set the time unit.
-    # @param time_zone [String]
-    #   Optional timezone
-    #
-    # @return [Object]
-    #
-    # @note
-    #   If both `low` and `high` are passed as date types (not datetime), and the
-    #   interval granularity is no finer than 1d, the returned range is also of
-    #   type date. All other permutations return a datetime Series.
-    #
-    # @example Using polars duration string to specify the interval
-    #   Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
-    #   # =>
-    #   # shape: (3,)
-    #   # Series: 'drange' [date]
-    #   # [
-    #   #         2022-01-01
-    #   #         2022-02-01
-    #   #         2022-03-01
-    #   # ]
-    #
-    # @example Using `timedelta` object to specify the interval:
-    #   Polars.date_range(
-    #       DateTime.new(1985, 1, 1),
-    #       DateTime.new(1985, 1, 10),
-    #       "1d12h",
-    #       time_unit: "ms"
-    #   )
-    #   # =>
-    #   # shape: (7,)
-    #   # Series: '' [datetime[ms]]
-    #   # [
-    #   #         1985-01-01 00:00:00
-    #   #         1985-01-02 12:00:00
-    #   #         1985-01-04 00:00:00
-    #   #         1985-01-05 12:00:00
-    #   #         1985-01-07 00:00:00
-    #   #         1985-01-08 12:00:00
-    #   #         1985-01-10 00:00:00
-    #   # ]
-    def date_range(
-      start,
-      stop,
-      interval,
-      lazy: false,
-      closed: "both",
-      name: nil,
-      time_unit: nil,
-      time_zone: nil
-    )
-      if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
-        raise Todo
-      else
-        interval = interval.to_s
-        if interval.include?(" ")
-          interval = interval.gsub(" ", "")
-        end
-      end
-      if time_unit.nil?
-        if interval.include?("ns")
-          time_unit = "ns"
-        else
-          time_unit = "us"
-        end
-      end
-      start_rbexpr = Utils.parse_as_expression(start)
-      stop_rbexpr = Utils.parse_as_expression(stop)
-      result = Utils.wrap_expr(
-        _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
-      )
-      result = result.alias(name.to_s)
-      if !lazy
-        return select(result).to_series
-      end
-      result
-    end
-    # Bin values into discrete values.
-    #
-    # @param s [Series]
-    #   Series to bin.
-    # @param bins [Array]
-    #   Bins to create.
-    # @param labels [Array]
-    #   Labels to assign to the bins. If given the length of labels must be
-    #   len(bins) + 1.
-    # @param break_point_label [String]
-    #   Name given to the breakpoint column.
-    # @param category_label [String]
-    #   Name given to the category column.
-    #
-    # @return [DataFrame]
-    #
-    # @note
-    #   This functionality is experimental and may change without it being considered a
-    #   breaking change.
-    #
-    # @example
-    #   a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
-    #   Polars.cut(a, [-1, 1])
-    #   # =>
-    #   # shape: (12, 3)
-    #   # ┌──────┬─────────────┬──────────────┐
-    #   # │ a    ┆ break_point ┆ category     │
-    #   # │ ---  ┆ ---         ┆ ---          │
-    #   # │ f64  ┆ f64         ┆ cat          │
-    #   # ╞══════╪═════════════╪══════════════╡
-    #   # │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ ...  ┆ ...         ┆ ...          │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ 1.5  ┆ inf         ┆ (1.0, inf]   │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ 2.0  ┆ inf         ┆ (1.0, inf]   │
-    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    #   # │ 2.5  ┆ inf         ┆ (1.0, inf]   │
-    #   # └──────┴─────────────┴──────────────┘
-    # def cut(
-    #   s,
-    #   bins,
-    #   labels: nil,
-    #   break_point_label: "break_point",
-    #   category_label: "category"
-    # )
-    #   var_nm = s.name
-    #   cuts_df = DataFrame.new(
-    #     [
-    #       Series.new(
-    #         break_point_label, bins, dtype: :f64
-    #       ).extend_constant(Float::INFINITY, 1)
-    #     ]
-    #   )
-    #   if labels
-    #     if labels.length != bins.length + 1
-    #       raise ArgumentError, "expected more labels"
-    #     end
-    #     cuts_df = cuts_df.with_column(Series.new(category_label, labels))
-    #   else
-    #     cuts_df = cuts_df.with_column(
-    #       Polars.format(
-    #         "({}, {}]",
-    #         Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
-    #         Polars.col(break_point_label)
-    #       ).alias(category_label)
-    #     )
-    #   end
-    #   cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
-    #   s.cast(:f64)
-    #     .sort
-    #     .to_frame
-    #     .join_asof(
-    #       cuts_df,
-    #       left_on: var_nm,
-    #       right_on: break_point_label,
-    #       strategy: "forward"
-    #     )
-    # end
-    # Align a sequence of frames using the uique values from one or more columns as a key.
-    #
-    # Frames that do not contain the given key values have rows injected (with nulls
-    # filling the non-key columns), and each resulting frame is sorted by the key.
-    #
-    # The original column order of input frames is not changed unless ``select`` is
-    # specified (in which case the final column order is determined from that).
-    #
-    # Note that this does not result in a joined frame - you receive the same number
-    # of frames back that you passed in, but each is now aligned by key and has
-    # the same number of rows.
-    #
-    # @param frames [Array]
-    #   Sequence of DataFrames or LazyFrames.
-    # @param on [Object]
-    #   One or more columns whose unique values will be used to align the frames.
-    # @param select [Object]
-    #   Optional post-alignment column select to constrain and/or order
-    #   the columns returned from the newly aligned frames.
-    # @param reverse [Object]
-    #   Sort the alignment column values in descending order; can be a single
-    #   boolean or a list of booleans associated with each column in `on`.
-    #
-    # @return [Object]
-    #
-    # @example
-    #   df1 = Polars::DataFrame.new(
-    #     {
-    #       "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
-    #       "x" => [3.5, 4.0, 1.0],
-    #       "y" => [10.0, 2.5, 1.5]
-    #     }
-    #   )
-    #   df2 = Polars::DataFrame.new(
-    #     {
-    #       "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
-    #       "x" => [8.0, 1.0, 3.5],
-    #       "y" => [1.5, 12.0, 5.0]
-    #     }
-    #   )
-    #   df3 = Polars::DataFrame.new(
-    #     {
-    #       "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
-    #       "x" => [2.0, 5.0],
-    #       "y" => [2.5, 2.0]
-    #     }
-    #   )
-    #   af1, af2, af3 = Polars.align_frames(
-    #     df1, df2, df3, on: "dt", select: ["x", "y"]
-    #   )
-    #   (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
-    #   # =>
-    #   # shape: (3, 1)
-    #   # ┌───────┐
-    #   # │ dot   │
-    #   # │ ---   │
-    #   # │ f64   │
-    #   # ╞═══════╡
-    #   # │ 0.0   │
-    #   # ├╌╌╌╌╌╌╌┤
-    #   # │ 167.5 │
-    #   # ├╌╌╌╌╌╌╌┤
-    #   # │ 47.0  │
-    #   # └───────┘
-    def align_frames(
-      *frames,
-      on:,
-      select: nil,
-      reverse: false
-    )
-      if frames.empty?
-        return []
-      elsif frames.map(&:class).uniq.length != 1
-        raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
-      end
-      # establish the superset of all "on" column values, sort, and cache
-      eager = frames[0].is_a?(DataFrame)
-      alignment_frame = (
-        concat(frames.map { |df| df.lazy.select(on) })
-          .unique(maintain_order: false)
-          .sort(on, reverse: reverse)
-      )
-      alignment_frame = (
-        eager ? alignment_frame.collect.lazy : alignment_frame.cache
-      )
-      # finally, align all frames
-      aligned_frames =
-        frames.map do |df|
-          alignment_frame.join(
-            df.lazy,
-            on: alignment_frame.columns,
-            how: "left"
-          ).select(df.columns)
-        end
-      if !select.nil?
-        aligned_frames = aligned_frames.map { |df| df.select(select) }
-      end
-      eager ? aligned_frames.map(&:collect) : aligned_frames
+    # @return [Expr]
+    def to_list(name)
+      col(name).list
     end
-    # Return a new Series of given length and type, filled with ones.
+    # Compute the spearman rank correlation between two columns.
     #
-    # @param n [Integer]
-    #   Number of elements in the `Series`
-    # @param dtype [Symbol]
-    #   DataType of the elements, defaults to `:f64`
+    # Missing data will be excluded from the computation.
     #
-    # @return [Series]
+    # @param a [Object]
+    #   Column name or Expression.
+    # @param b [Object]
+    #   Column name or Expression.
+    # @param ddof [Integer]
+    #   Delta degrees of freedom
+    # @param propagate_nans [Boolean]
+    #   If `True` any `NaN` encountered will lead to `NaN` in the output.
+    #   Defaults to `False` where `NaN` are regarded as larger than any finite number
+    #   and thus lead to the highest rank.
     #
-    # @note
-    #   In the lazy API you should probably not use this, but use `lit(1)`
-    #   instead.
-    def ones(n, dtype: nil)
-      s = Series.new([1.0])
-      if dtype
-        s = s.cast(dtype)
-      end
-      s.new_from_index(0, n)
+    # @return [Expr]
+    def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
+      corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
     end
-    # Return a new Series of given length and type, filled with zeros.
-    #
-    # @param n [Integer]
-    #   Number of elements in the `Series`
-    # @param dtype [Symbol]
-    #   DataType of the elements, defaults to `:f64`
+    # Compute the pearson's correlation between two columns.
     #
-    # @return [Series]
+    # @param a [Object]
+    #   Column name or Expression.
+    # @param b [Object]
+    #   Column name or Expression.
+    # @param ddof [Integer]
+    #   Delta degrees of freedom
     #
-    # @note
-    #   In the lazy API you should probably not use this, but use `lit(0)`
-    #   instead.
-    def zeros(n, dtype: nil)
-      s = Series.new([0.0])
-      if dtype
-        s = s.cast(dtype)
-      end
-      s.new_from_index(0, n)
-    end
-    private
-    def _ensure_datetime(value)
-      is_date_type = false
-      if !value.is_a?(::DateTime)
-        value = ::DateTime.new(value.year, value.month, value.day)
-        is_date_type = true
-      end
-      [value, is_date_type]
-    end
-    # TODO
-    def _interval_granularity(interval)
-      interval
+    # @return [Expr]
+    def pearson_corr(a, b, ddof: 1)
+      corr(a, b, method: "pearson", ddof: ddof)
     end
   end
 end

data/lib/polars/group_by.rb CHANGED

@@ -38,7 +38,7 @@ module Polars
       temp_col = "__POLARS_GB_GROUP_INDICES"
       groups_df =
         @df.lazy
-          .with_row_count(name: temp_col)
+          .with_row_index(name: temp_col)
           .group_by(@by, maintain_order: @maintain_order)
           .agg(Polars.col(temp_col))
           .collect(no_optimization: true)
@@ -415,7 +415,7 @@ module Polars
     #   # │ Banana ┆ 2     │
     #   # └────────┴───────┘
     def count
-      agg(Polars.count)
+      agg(Polars.len.alias("count"))
     end
     # Reduce the groups to the mean values.

data/lib/polars/io.rb CHANGED

@@ -115,10 +115,10 @@ module Polars
       sample_size: 1024,
       eol_char: "\n"
     )
-      _check_arg_is_1byte("sep", sep, false)
-      _check_arg_is_1byte("comment_char", comment_char, false)
-      _check_arg_is_1byte("quote_char", quote_char, true)
-      _check_arg_is_1byte("eol_char", eol_char, false)
+      Utils._check_arg_is_1byte("sep", sep, false)
+      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, true)
+      Utils._check_arg_is_1byte("eol_char", eol_char, false)
       projection, columns = Utils.handle_projection_columns(columns)
@@ -264,9 +264,9 @@ module Polars
       parse_dates: false,
       eol_char: "\n"
     )
-      _check_arg_is_1byte("sep", sep, false)
-      _check_arg_is_1byte("comment_char", comment_char, false)
-      _check_arg_is_1byte("quote_char", quote_char, true)
+      Utils._check_arg_is_1byte("sep", sep, false)
+      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, true)
       if Utils.pathlike?(source)
         source = Utils.normalise_filepath(source)
@@ -604,9 +604,12 @@ module Polars
     #
     # @param query [Object]
     #   ActiveRecord::Relation or ActiveRecord::Result.
+    # @param schema_overrides [Hash]
+    #   A hash mapping column names to dtypes, used to override the schema
+    #   inferred from the query.
     #
     # @return [DataFrame]
-    def read_database(query)
+    def read_database(query, schema_overrides: nil)
       if !defined?(ActiveRecord)
         raise Error, "Active Record not available"
       end
@@ -623,7 +626,7 @@ module Polars
         end
       data = {}
-      schema_overrides = {}
+      schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
       result.columns.each_with_index do |k, i|
         column_type = result.column_types[i]
@@ -655,9 +658,12 @@ module Polars
             String
           when :time
             Time
+          # TODO fix issue with null
+          # when :json, :jsonb
+          #   Struct
           end
-        schema_overrides[k] = polars_type if polars_type
+        schema_overrides[k] ||= polars_type if polars_type
       end
       DataFrame.new(data, schema_overrides: schema_overrides)
@@ -836,7 +842,7 @@ module Polars
         source = Utils.normalise_filepath(source)
       end
-      _ipc_schema(source)
+      Plr.ipc_schema(source)
     end
     # Get a schema of the Parquet file without reading data.
@@ -850,7 +856,7 @@ module Polars
         source = Utils.normalise_filepath(source)
       end
-      _parquet_schema(source)
+      Plr.parquet_schema(source)
     end
     private
@@ -868,18 +874,5 @@ module Polars
       yield file
     end
-    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
-      if arg.is_a?(::String)
-        arg_byte_length = arg.bytesize
-        if can_be_empty
-          if arg_byte_length > 1
-            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
-          end
-        elsif arg_byte_length != 1
-          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
-        end
-      end
-    end
   end
 end