RubyGems - polars-df - Versions diffs - 0.8.0-aarch64-linux → 0.9.0-aarch64-linux - Mend

polars-df 0.8.0-aarch64-linux → 0.9.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +4 -4
data/CHANGELOG.md +30 -1
data/Cargo.lock +107 -59
data/Cargo.toml +0 -3
data/LICENSE-THIRD-PARTY.txt +1726 -754
data/LICENSE.txt +1 -1
data/README.md +2 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/array_expr.rb +449 -0
data/lib/polars/array_name_space.rb +346 -0
data/lib/polars/cat_expr.rb +24 -0
data/lib/polars/cat_name_space.rb +75 -0
data/lib/polars/config.rb +2 -2
data/lib/polars/data_frame.rb +179 -43
data/lib/polars/data_types.rb +191 -28
data/lib/polars/date_time_expr.rb +31 -14
data/lib/polars/exceptions.rb +12 -1
data/lib/polars/expr.rb +866 -186
data/lib/polars/functions/aggregation/horizontal.rb +246 -0
data/lib/polars/functions/aggregation/vertical.rb +282 -0
data/lib/polars/functions/as_datatype.rb +248 -0
data/lib/polars/functions/col.rb +47 -0
data/lib/polars/functions/eager.rb +182 -0
data/lib/polars/functions/lazy.rb +1280 -0
data/lib/polars/functions/len.rb +49 -0
data/lib/polars/functions/lit.rb +35 -0
data/lib/polars/functions/random.rb +16 -0
data/lib/polars/functions/range/date_range.rb +103 -0
data/lib/polars/functions/range/int_range.rb +51 -0
data/lib/polars/functions/repeat.rb +144 -0
data/lib/polars/functions/whenthen.rb +27 -0
data/lib/polars/functions.rb +29 -416
data/lib/polars/group_by.rb +2 -2
data/lib/polars/io.rb +18 -25
data/lib/polars/lazy_frame.rb +367 -53
data/lib/polars/list_expr.rb +152 -6
data/lib/polars/list_name_space.rb +102 -0
data/lib/polars/meta_expr.rb +175 -7
data/lib/polars/series.rb +273 -34
data/lib/polars/string_cache.rb +75 -0
data/lib/polars/string_expr.rb +412 -96
data/lib/polars/string_name_space.rb +4 -4
data/lib/polars/testing.rb +507 -0
data/lib/polars/utils.rb +52 -8
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +15 -2
metadata +33 -4
data/lib/polars/lazy_functions.rb +0 -1181

data/lib/polars/lazy_frame.rb CHANGED Viewed

@@ -308,7 +308,7 @@ module Polars
     #   end
     #
     #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
-    #   df.pipe(cast_str_to_int, col_name: "b").collect()
+    #   df.pipe(cast_str_to_int, col_name: "b").collect
     #   # =>
     #   # shape: (4, 2)
     #   # ┌─────┬─────┐
@@ -342,6 +342,7 @@ module Polars
       simplify_expression: true,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       ldf = _ldf.optimization_toggle(
@@ -351,6 +352,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -469,6 +471,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false,
       _eager: false
     )
@@ -477,6 +480,7 @@ module Polars
         projection_pushdown = false
         slice_pushdown = false
         common_subplan_elimination = false
+        comm_subexpr_elim = false
       end
       if allow_streaming
@@ -490,6 +494,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         _eager
       )
@@ -559,6 +564,268 @@ module Polars
       simplify_expression: true,
       no_optimization: false,
       slice_pushdown: true
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+    # Evaluate the query in streaming mode and write to an IPC file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ipc("out.arrow")
+    def sink_ipc(
+      path,
+      compression: "zstd",
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+      lf.sink_ipc(
+        path,
+        compression,
+        maintain_order
+      )
+    end
+    # Evaluate the query in streaming mode and write to a CSV file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param include_bom [Boolean]
+    #   Whether to include UTF-8 BOM in the CSV output.
+    # @param include_header [Boolean]
+    #   Whether to include header in the CSV output.
+    # @param separator [String]
+    #   Separate CSV fields with this symbol.
+    # @param line_terminator [String]
+    #   String used to end each row.
+    # @param quote_char [String]
+    #   Byte to use as quoting character.
+    # @param batch_size [Integer]
+    #   Number of rows that will be processed per thread.
+    # @param datetime_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate. If no format specified, the default fractional-second
+    #   precision is inferred from the maximum timeunit found in the frame's
+    #   Datetime cols (if any).
+    # @param date_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param time_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param float_precision [Integer]
+    #   Number of decimal places to write, applied to both `Float32` and
+    #   `Float64` datatypes.
+    # @param null_value [String]
+    #   A string representing null values (defaulting to the empty string).
+    # @param quote_style ["necessary", "always", "non_numeric", "never"]
+    #   Determines the quoting strategy used.
+    #
+    #   - necessary (default): This puts quotes around fields only when necessary.
+    #     They are necessary when fields contain a quote,
+    #     delimiter or record terminator.
+    #     Quotes are also necessary when writing an empty record
+    #     (which is indistinguishable from a record with one empty field).
+    #     This is the default.
+    #   - always: This puts quotes around every field. Always.
+    #   - never: This never puts quotes around fields, even if that results in
+    #     invalid CSV data (e.g.: by not quoting strings containing the
+    #     separator).
+    #   - non_numeric: This puts quotes around all fields that are non-numeric.
+    #     Namely, when writing a field that does not parse as a valid float
+    #     or integer, then quotes will be used even if they aren't strictly
+    #     necessary.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_csv("out.csv")
+    def sink_csv(
+      path,
+      include_bom: false,
+      include_header: true,
+      separator: ",",
+      line_terminator: "\n",
+      quote_char: '"',
+      batch_size: 1024,
+      datetime_format: nil,
+      date_format: nil,
+      time_format: nil,
+      float_precision: nil,
+      null_value: nil,
+      quote_style: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      Utils._check_arg_is_1byte("separator", separator, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, false)
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+      lf.sink_csv(
+        path,
+        include_bom,
+        include_header,
+        separator.ord,
+        line_terminator,
+        quote_char.ord,
+        batch_size,
+        datetime_format,
+        date_format,
+        time_format,
+        float_precision,
+        null_value,
+        quote_style,
+        maintain_order
+      )
+    end
+    # Evaluate the query in streaming mode and write to an NDJSON file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ndjson("out.ndjson")
+    def sink_ndjson(
+      path,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+      lf.sink_json(path, maintain_order)
+    end
+    # @private
+    def _set_sink_optimizations(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
     )
       if no_optimization
         predicate_pushdown = false
@@ -566,25 +833,17 @@ module Polars
         slice_pushdown = false
       end
-      lf = _ldf.optimization_toggle(
+      _ldf.optimization_toggle(
         type_coercion,
         predicate_pushdown,
         projection_pushdown,
         simplify_expression,
         slice_pushdown,
         false,
+        false,
         true,
         false
       )
-      lf.sink_parquet(
-        path,
-        compression,
-        compression_level,
-        statistics,
-        row_group_size,
-        data_pagesize_limit,
-        maintain_order
-      )
     end
     # Collect a small number of rows for debugging purposes.
@@ -650,6 +909,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       if no_optimization
@@ -666,6 +926,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -699,6 +960,10 @@ module Polars
       _from_rbldf(_ldf.cache)
     end
+    # TODO
+    # def cast
+    # end
     # Create an empty copy of the current LazyFrame.
     #
     # The copy has an identical schema but no data.
@@ -706,14 +971,14 @@ module Polars
     # @return [LazyFrame]
     #
     # @example
-    #   df = Polars::DataFrame.new(
+    #   lf = Polars::LazyFrame.new(
     #     {
     #       "a" => [nil, 2, 3, 4],
     #       "b" => [0.5, nil, 2.5, 13],
     #       "c" => [true, true, false, nil],
     #     }
     #   ).lazy
-    #   df.cleared.fetch
+    #   lf.clear.fetch
     #   # =>
     #   # shape: (0, 3)
     #   # ┌─────┬─────┬──────┐
@@ -722,9 +987,23 @@ module Polars
     #   # │ i64 ┆ f64 ┆ bool │
     #   # ╞═════╪═════╪══════╡
     #   # └─────┴─────┴──────┘
-    def cleared
-      DataFrame.new(columns: schema).lazy
-    end
+    #
+    # @example
+    #   lf.clear(2).fetch
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌──────┬──────┬──────┐
+    #   # │ a    ┆ b    ┆ c    │
+    #   # │ ---  ┆ ---  ┆ ---  │
+    #   # │ i64  ┆ f64  ┆ bool │
+    #   # ╞══════╪══════╪══════╡
+    #   # │ null ┆ null ┆ null │
+    #   # │ null ┆ null ┆ null │
+    #   # └──────┴──────┴──────┘
+    def clear(n = 0)
+      DataFrame.new(columns: schema).clear(n).lazy
+    end
+    alias_method :cleared, :clear
     # Filter the rows in the DataFrame based on a predicate expression.
     #
@@ -774,8 +1053,13 @@ module Polars
     # Select columns from this DataFrame.
     #
-    # @param exprs [Object]
-    #   Column or columns to select.
+    # @param exprs [Array]
+    #   Column(s) to select, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names,
+    #   other non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to select, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [LazyFrame]
     #
@@ -855,9 +1139,13 @@ module Polars
     #   # │ 0       │
     #   # │ 10      │
     #   # └─────────┘
-    def select(exprs)
-      exprs = Utils.selection_to_rbexpr_list(exprs)
-      _from_rbldf(_ldf.select(exprs))
+    def select(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+      rbexprs = Utils.parse_as_list_of_expressions(
+        *exprs, **named_exprs, __structify: structify
+      )
+      _from_rbldf(_ldf.select(rbexprs))
     end
     # Start a group by operation.
@@ -967,7 +1255,7 @@ module Polars
     #   df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
     #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
-    #   df.group_by_rolling(index_column: "dt", period: "2d").agg(
+    #   df.rolling(index_column: "dt", period: "2d").agg(
     #     [
     #       Polars.sum("a").alias("sum_a"),
     #       Polars.min("a").alias("min_a"),
@@ -988,7 +1276,7 @@ module Polars
     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
     #   # └─────────────────────┴───────┴───────┴───────┘
-    def group_by_rolling(
+    def rolling(
       index_column:,
       period:,
       offset: nil,
@@ -1005,12 +1293,13 @@ module Polars
       period = Utils._timedelta_to_pl_duration(period)
       offset = Utils._timedelta_to_pl_duration(offset)
-      lgb = _ldf.group_by_rolling(
+      lgb = _ldf.rolling(
         index_column, period, offset, closed, rbexprs_by, check_sorted
       )
       LazyGroupBy.new(lgb)
     end
-    alias_method :groupby_rolling, :group_by_rolling
+    alias_method :group_by_rolling, :rolling
+    alias_method :groupby_rolling, :rolling
     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
@@ -1440,6 +1729,8 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param join_nulls [Boolean]
+    #   Join on null values. By default null values will never produce matches.
     # @param allow_parallel [Boolean]
     #   Allow the physical plan to optionally evaluate the computation of both
     #   DataFrames up to the join in parallel.
@@ -1535,6 +1826,7 @@ module Polars
       on: nil,
       how: "inner",
       suffix: "_right",
+      join_nulls: false,
       allow_parallel: true,
       force_parallel: false
     )
@@ -1568,6 +1860,7 @@ module Polars
           rbexprs_right,
           allow_parallel,
           force_parallel,
+          join_nulls,
           how,
           suffix,
         )
@@ -1608,27 +1901,9 @@ module Polars
     #   # │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
-    def with_columns(exprs)
-      exprs =
-        if exprs.nil?
-          []
-        elsif exprs.is_a?(Expr)
-          [exprs]
-        else
-          exprs.to_a
-        end
-      rbexprs = []
-      exprs.each do |e|
-        case e
-        when Expr
-          rbexprs << e._rbexpr
-        when Series
-          rbexprs << Utils.lit(e)._rbexpr
-        else
-          raise ArgumentError, "Expected an expression, got #{e}"
-        end
-      end
+    def with_columns(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+      rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
       _from_rbldf(_ldf.with_columns(rbexprs))
     end
@@ -1725,7 +2000,7 @@ module Polars
       if columns.is_a?(::String)
         columns = [columns]
       end
-      _from_rbldf(_ldf.drop_columns(columns))
+      _from_rbldf(_ldf.drop(columns))
     end
     # Rename column names.
@@ -1955,7 +2230,7 @@ module Polars
     #       "b" => [2, 4, 6]
     #     }
     #   ).lazy
-    #   df.with_row_count.collect
+    #   df.with_row_index.collect
     #   # =>
     #   # shape: (3, 3)
     #   # ┌────────┬─────┬─────┐
@@ -1967,9 +2242,10 @@ module Polars
     #   # │ 1      ┆ 3   ┆ 4   │
     #   # │ 2      ┆ 5   ┆ 6   │
     #   # └────────┴─────┴─────┘
-    def with_row_count(name: "row_nr", offset: 0)
-      _from_rbldf(_ldf.with_row_count(name, offset))
+    def with_row_index(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_index(name, offset))
     end
+    alias_method :with_row_count, :with_row_index
     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
@@ -2470,9 +2746,47 @@ module Polars
       _from_rbldf(_ldf.unnest(names))
     end
-    # TODO
-    # def merge_sorted
-    # end
+    # Take two sorted DataFrames and merge them by the sorted key.
+    #
+    # The output of this operation will also be sorted.
+    # It is the caller's responsibility that the frames are sorted
+    # by that key; otherwise the output will not make sense.
+    #
+    # The schemas of both LazyFrames must be equal.
+    #
+    # @param other [LazyFrame]
+    #   Other LazyFrame that must be merged.
+    # @param key [String]
+    #   Key that is sorted.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df0 = Polars::LazyFrame.new(
+    #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+    #   ).sort("age")
+    #   df1 = Polars::LazyFrame.new(
+    #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+    #   ).sort("age")
+    #   df0.merge_sorted(df1, "age").collect
+    #   # =>
+    #   # shape: (7, 2)
+    #   # ┌────────┬─────┐
+    #   # │ name   ┆ age │
+    #   # │ ---    ┆ --- │
+    #   # │ str    ┆ i64 │
+    #   # ╞════════╪═════╡
+    #   # │ bob    ┆ 18  │
+    #   # │ thomas ┆ 20  │
+    #   # │ anna   ┆ 21  │
+    #   # │ megan  ┆ 33  │
+    #   # │ steve  ┆ 42  │
+    #   # │ steve  ┆ 42  │
+    #   # │ elise  ┆ 44  │
+    #   # └────────┴─────┘
+    def merge_sorted(other, key)
+      _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+    end
     # Indicate that one or multiple columns are sorted.
     #