RubyGems - polars-df - Versions diffs - 0.13.0-x86_64-linux → 0.15.0-x86_64-linux - Mend

polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +4 -4
data/CHANGELOG.md +30 -0
data/Cargo.lock +1368 -319
data/LICENSE-THIRD-PARTY.txt +24801 -13447
data/LICENSE.txt +1 -0
data/README.md +1 -2
data/lib/polars/3.1/polars.so +0 -0
data/lib/polars/3.2/polars.so +0 -0
data/lib/polars/3.3/polars.so +0 -0
data/lib/polars/batched_csv_reader.rb +0 -2
data/lib/polars/binary_expr.rb +133 -9
data/lib/polars/binary_name_space.rb +101 -6
data/lib/polars/config.rb +4 -0
data/lib/polars/data_frame.rb +285 -62
data/lib/polars/data_type_group.rb +28 -0
data/lib/polars/data_types.rb +2 -0
data/lib/polars/date_time_expr.rb +244 -0
data/lib/polars/date_time_name_space.rb +87 -0
data/lib/polars/expr.rb +109 -8
data/lib/polars/functions/as_datatype.rb +51 -2
data/lib/polars/functions/col.rb +1 -1
data/lib/polars/functions/eager.rb +1 -3
data/lib/polars/functions/lazy.rb +88 -10
data/lib/polars/functions/range/time_range.rb +21 -21
data/lib/polars/io/csv.rb +14 -16
data/lib/polars/io/database.rb +2 -2
data/lib/polars/io/ipc.rb +14 -12
data/lib/polars/io/ndjson.rb +10 -0
data/lib/polars/io/parquet.rb +168 -111
data/lib/polars/lazy_frame.rb +649 -15
data/lib/polars/list_name_space.rb +169 -0
data/lib/polars/selectors.rb +1144 -0
data/lib/polars/series.rb +470 -40
data/lib/polars/string_cache.rb +27 -1
data/lib/polars/string_expr.rb +0 -1
data/lib/polars/string_name_space.rb +73 -3
data/lib/polars/struct_name_space.rb +31 -7
data/lib/polars/utils/various.rb +5 -1
data/lib/polars/utils.rb +45 -10
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +2 -1
metadata +4 -3
data/lib/polars/functions.rb +0 -57

data/lib/polars/functions/lazy.rb CHANGED

@@ -824,6 +824,29 @@ module Polars
     # @note
     #   If you simply want the first encountered expression as accumulator,
     #   consider using `cumreduce`.
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 2, 3],
+    #       "b" => [3, 4, 5],
+    #       "c" => [5, 6, 7]
+    #     }
+    #   )
+    #   df.with_columns(
+    #     Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
+    #   )
+    #   # =>
+    #   # shape: (3, 4)
+    #   # ┌─────┬─────┬─────┬───────────┐
+    #   # │ a   ┆ b   ┆ c   ┆ cum_fold  │
+    #   # │ --- ┆ --- ┆ --- ┆ ---       │
+    #   # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
+    #   # ╞═════╪═════╪═════╪═══════════╡
+    #   # │ 1   ┆ 3   ┆ 5   ┆ {2,5,10}  │
+    #   # │ 2   ┆ 4   ┆ 6   ┆ {3,7,13}  │
+    #   # │ 3   ┆ 5   ┆ 7   ┆ {4,9,16}  │
+    #   # └─────┴─────┴─────┴───────────┘
     def cum_fold(acc, f, exprs, include_init: false)
       acc = Utils.parse_into_expression(acc, str_as_lit: true)
       if exprs.is_a?(Expr)
@@ -831,7 +854,7 @@ module Polars
       end
       exprs = Utils.parse_into_list_of_expressions(exprs)
-      Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
+      Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
     end
     alias_method :cumfold, :cum_fold
@@ -1024,15 +1047,70 @@ module Polars
     #   Default is ascending.
     #
     # @return [Expr]
-    def arg_sort_by(exprs, reverse: false)
-      if !exprs.is_a?(::Array)
-        exprs = [exprs]
-      end
-      if reverse == true || reverse == false
-        reverse = [reverse] * exprs.length
-      end
-      exprs = Utils.parse_into_list_of_expressions(exprs)
-      Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse))
+    #
+    # @example Pass a single column name to compute the arg sort by that column.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [0, 1, 1, 0],
+    #       "b" => [3, 2, 3, 2],
+    #       "c" => [1, 2, 3, 4]
+    #     }
+    #   )
+    #   df.select(Polars.arg_sort_by("a"))
+    #   # =>
+    #   # shape: (4, 1)
+    #   # ┌─────┐
+    #   # │ a   │
+    #   # │ --- │
+    #   # │ u32 │
+    #   # ╞═════╡
+    #   # │ 0   │
+    #   # │ 3   │
+    #   # │ 1   │
+    #   # │ 2   │
+    #   # └─────┘
+    #
+    # @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
+    #   df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
+    #   # =>
+    #   # shape: (4, 1)
+    #   # ┌─────┐
+    #   # │ a   │
+    #   # │ --- │
+    #   # │ u32 │
+    #   # ╞═════╡
+    #   # │ 2   │
+    #   # │ 1   │
+    #   # │ 0   │
+    #   # │ 3   │
+    #   # └─────┘
+    #
+    # @example Use gather to apply the arg sort to other columns.
+    #   df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
+    #   # =>
+    #   # shape: (4, 1)
+    #   # ┌─────┐
+    #   # │ c   │
+    #   # │ --- │
+    #   # │ i64 │
+    #   # ╞═════╡
+    #   # │ 1   │
+    #   # │ 4   │
+    #   # │ 2   │
+    #   # │ 3   │
+    #   # └─────┘
+    def arg_sort_by(
+      exprs,
+      *more_exprs,
+      reverse: false,
+      nulls_last: false,
+      multithreaded: true,
+      maintain_order: false
+    )
+      exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
+      reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
+      nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
+      Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
     end
     alias_method :argsort_by, :arg_sort_by

data/lib/polars/functions/range/time_range.rb CHANGED

@@ -18,7 +18,7 @@ module Polars
     #
     # @example
     #   Polars.time_range(
-    #     time(14, 0),
+    #     Time.utc(2000, 1, 1, 14, 0),
     #     nil,
     #     "3h15m",
     #     eager: true
@@ -48,12 +48,12 @@ module Polars
       end
       if start.nil?
-        # start = time(0, 0, 0)
-        raise Todo
+        # date part is ignored
+        start = ::Time.utc(2000, 1, 1, 0, 0, 0)
       end
       if stop.nil?
-        # stop = time(23, 59, 59, 999999)
-        raise Todo
+        # date part is ignored
+        stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
       end
       start_rbexpr = Utils.parse_into_expression(start)
@@ -87,21 +87,21 @@ module Polars
     # @example
     #   df = Polars::DataFrame.new(
     #     {
-    #       "start" => [time(9, 0), time(10, 0)],
-    #       "end" => time(11, 0)
+    #       "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
+    #       "end" => Time.utc(2000, 1, 1, 11, 0)
     #     }
     #   )
-    #   df.with_columns(time_range: Polars.time_ranges("start", "end"))
+    #   df.select(time_range: Polars.time_ranges("start", "end"))
     #   # =>
-    #   # shape: (2, 3)
-    #   # ┌──────────┬──────────┬────────────────────────────────┐
-    #   # │ start    ┆ end      ┆ time_range                     │
-    #   # │ ---      ┆ ---      ┆ ---                            │
-    #   # │ time     ┆ time     ┆ list[time]                     │
-    #   # ╞══════════╪══════════╪════════════════════════════════╡
-    #   # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
-    #   # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00]           │
-    #   # └──────────┴──────────┴────────────────────────────────┘
+    #   # shape: (2, 1)
+    #   # ┌────────────────────────────────┐
+    #   # │ time_range                     │
+    #   # │ ---                            │
+    #   # │ list[time]                     │
+    #   # ╞════════════════════════════════╡
+    #   # │ [09:00:00, 10:00:00, 11:00:00] │
+    #   # │ [10:00:00, 11:00:00]           │
+    #   # └────────────────────────────────┘
     def time_ranges(
       start = nil,
       stop = nil,
@@ -118,12 +118,12 @@ module Polars
       end
       if start.nil?
-        # start = time(0, 0, 0)
-        raise Todo
+        # date part is ignored
+        start = ::Time.utc(2000, 1, 1, 0, 0, 0)
       end
       if stop.nil?
-        # stop = time(23, 59, 59, 999999)
-        raise Todo
+        # date part is ignored
+        stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
       end
       start_rbexpr = Utils.parse_into_expression(start)

data/lib/polars/io/csv.rb CHANGED

@@ -75,9 +75,6 @@ module Polars
     #   the DataFrame.
     # @param row_count_offset [Integer]
     #   Offset to start the row_count column (only used if the name is set).
-    # @param sample_size [Integer]
-    #   Set the sample size. This is used to sample statistics to estimate the
-    #   allocation needed.
     # @param eol_char [String]
     #   Single byte end of line character.
     # @param truncate_ragged_lines [Boolean]
@@ -114,7 +111,6 @@ module Polars
       skip_rows_after_header: 0,
       row_count_name: nil,
       row_count_offset: 0,
-      sample_size: 1024,
       eol_char: "\n",
       truncate_ragged_lines: false
     )
@@ -163,7 +159,6 @@ module Polars
           skip_rows_after_header: skip_rows_after_header,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          sample_size: sample_size,
           eol_char: eol_char,
           truncate_ragged_lines: truncate_ragged_lines
         )
@@ -201,7 +196,6 @@ module Polars
       skip_rows_after_header: 0,
       row_count_name: nil,
       row_count_offset: 0,
-      sample_size: 1024,
       eol_char: "\n",
       raise_if_empty: true,
       truncate_ragged_lines: false,
@@ -305,7 +299,6 @@ module Polars
           parse_dates,
           skip_rows_after_header,
           Utils.parse_row_index_args(row_count_name, row_count_offset),
-          sample_size,
           eol_char,
           raise_if_empty,
           truncate_ragged_lines,
@@ -392,9 +385,6 @@ module Polars
     #   the DataFrame.
     # @param row_count_offset [Integer]
     #   Offset to start the row_count column (only used if the name is set).
-    # @param sample_size [Integer]
-    #   Set the sample size. This is used to sample statistics to estimate the
-    #   allocation needed.
     # @param eol_char [String]
     #   Single byte end of line character.
     # @param truncate_ragged_lines [Boolean]
@@ -431,7 +421,6 @@ module Polars
       skip_rows_after_header: 0,
       row_count_name: nil,
       row_count_offset: 0,
-      sample_size: 1024,
       eol_char: "\n",
       raise_if_empty: true,
       truncate_ragged_lines: false,
@@ -474,7 +463,6 @@ module Polars
         skip_rows_after_header: skip_rows_after_header,
         row_count_name: row_count_name,
         row_count_offset: row_count_offset,
-        sample_size: sample_size,
         eol_char: eol_char,
         new_columns: new_columns,
         raise_if_empty: raise_if_empty,
@@ -618,7 +606,7 @@ module Polars
     # @private
     def _scan_csv_impl(
-      file,
+      source,
       has_header: true,
       sep: ",",
       comment_char: nil,
@@ -650,9 +638,16 @@ module Polars
       end
       processed_null_values = Utils._process_null_values(null_values)
+      if source.is_a?(::Array)
+        sources = source
+        source = nil
+      else
+        sources = []
+      end
       rblf =
         RbLazyFrame.new_from_csv(
-          file,
+          source,
           sep,
           has_header,
           ignore_errors,
@@ -672,7 +667,8 @@ module Polars
           Utils.parse_row_index_args(row_count_name, row_count_offset),
           parse_dates,
           eol_char,
-          truncate_ragged_lines
+          truncate_ragged_lines,
+          sources
         )
       Utils.wrap_ldf(rblf)
     end
@@ -681,7 +677,9 @@ module Polars
     def _prepare_file_arg(file)
       if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
-        raise ArgumentError, "use URI(...) for remote files"
+        require "uri"
+        file = URI(file)
       end
       if defined?(URI) && file.is_a?(URI)

data/lib/polars/io/database.rb CHANGED

@@ -18,9 +18,9 @@ module Polars
         if query.is_a?(ActiveRecord::Result)
           query
         elsif query.is_a?(ActiveRecord::Relation)
-          query.connection.select_all(query.to_sql)
+          query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
         elsif query.is_a?(::String)
-          ActiveRecord::Base.connection.select_all(query)
+          ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
         else
           raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
         end

data/lib/polars/io/ipc.rb CHANGED

@@ -189,10 +189,6 @@ module Polars
     #   Offset to start the row_count column (only use if the name is set).
     # @param storage_options [Hash]
     #   Extra options that make sense for a particular storage connection.
-    # @param memory_map [Boolean]
-    #   Try to memory map the file. This can greatly improve performance on repeated
-    #   queries as the OS may cache pages.
-    #   Only uncompressed IPC files can be memory mapped.
     # @param hive_partitioning [Boolean]
     #   Infer statistics and schema from Hive partitioned URL and use them
     #   to prune reads. This is unset by default (i.e. `nil`), meaning it is
@@ -215,7 +211,6 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      memory_map: true,
       hive_partitioning: nil,
       hive_schema: nil,
       try_parse_hive_dates: true,
@@ -229,7 +224,6 @@ module Polars
         row_count_name: row_count_name,
         row_count_offset: row_count_offset,
         storage_options: storage_options,
-        memory_map: memory_map,
         hive_partitioning: hive_partitioning,
         hive_schema: hive_schema,
         try_parse_hive_dates: try_parse_hive_dates,
@@ -239,31 +233,39 @@ module Polars
     # @private
     def _scan_ipc_impl(
-      file,
+      source,
       n_rows: nil,
       cache: true,
       rechunk: true,
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      memory_map: true,
       hive_partitioning: nil,
       hive_schema: nil,
       try_parse_hive_dates: true,
       include_file_paths: nil
     )
-      if Utils.pathlike?(file)
-        file = Utils.normalize_filepath(file)
+      sources = []
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      elsif source.is_a?(::Array)
+        if Utils.is_path_or_str_sequence(source)
+          sources = source.map { |s| Utils.normalize_filepath(s) }
+        else
+          sources = source
+        end
+        source = nil
       end
       rblf =
         RbLazyFrame.new_from_ipc(
-          file,
+          source,
+          sources,
           n_rows,
           cache,
           rechunk,
           Utils.parse_row_index_args(row_count_name, row_count_offset),
-          memory_map,
           hive_partitioning,
           hive_schema,
           try_parse_hive_dates,

data/lib/polars/io/ndjson.rb CHANGED

@@ -60,13 +60,23 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0
     )
+      sources = []
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
+      elsif source.is_a?(::Array)
+        if Utils.is_path_or_str_sequence(source)
+          sources = source.map { |s| Utils.normalize_filepath(s) }
+        else
+          sources = source
+        end
+        source = nil
       end
       rblf =
         RbLazyFrame.new_from_ndjson(
           source,
+          sources,
           infer_schema_length,
           batch_size,
           n_rows,