polars-df 0.3.1-x86_64-linux → 0.4.0-x86_64-linux
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +9228 -11189
- data/README.md +29 -0
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -2
data/lib/polars/io.rb
CHANGED
@@ -2,7 +2,7 @@ module Polars
   module IO
     # Read a CSV file into a DataFrame.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -89,7 +89,7 @@ module Polars
    # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
    # an expensive operation.
    def read_csv(
-
+      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
@@ -137,7 +137,7 @@ module Polars
      end

      df = nil
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
        df = DataFrame._read_csv(
          data,
          has_header: has_header,
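In practice, this hunk renames the first positional argument of `Polars.read_csv` to `source`, which the docs describe as a path or a file-like object. A minimal usage sketch (the CSV path and contents are placeholders, not from the diff):

```ruby
require "polars-df"
require "stringio"

# Read from a file path ("data.csv" is a hypothetical local file).
df = Polars.read_csv("data.csv", has_header: true)

# The docstring says file-like objects are accepted, e.g. a StringIO.
df = Polars.read_csv(StringIO.new("a,b\n1,2\n3,4\n"))
```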
@@ -178,7 +178,7 @@ module Polars
    # projections to the scan level, thereby potentially reducing
    # memory overhead.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -242,7 +242,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_csv(
-
+      source,
      has_header: true,
      sep: ",",
      comment_char: nil,
@@ -268,12 +268,12 @@ module Polars
      _check_arg_is_1byte("comment_char", comment_char, false)
      _check_arg_is_1byte("quote_char", quote_char, true)

-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_csv(
-
+        source,
        has_header: has_header,
        sep: sep,
        comment_char: comment_char,
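`scan_csv` follows the same renaming, and path-like sources now run through the new `Utils.normalise_filepath` before the lazy scan is built. A sketch of the lazy workflow (placeholder path and column names):

```ruby
require "polars-df"

# Build a lazy query; nothing is read until collect is called.
lf = Polars.scan_csv("data.csv")
df = lf.filter(Polars.col("a") > 1).select(["a", "b"]).collect
```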
@@ -302,7 +302,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a IPC file.
    # @param n_rows [Integer]
    #   Stop reading from IPC file after reading `n_rows`.
@@ -324,7 +324,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_ipc(
-
+      source,
      n_rows: nil,
      cache: true,
      rechunk: true,
@@ -334,7 +334,7 @@ module Polars
      memory_map: true
    )
      LazyFrame._scan_ipc(
-
+        source,
        n_rows: n_rows,
        cache: cache,
        rechunk: rechunk,
@@ -350,7 +350,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a file.
    # @param n_rows [Integer]
    #   Stop reading from parquet file after reading `n_rows`.
@@ -374,7 +374,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_parquet(
-
+      source,
      n_rows: nil,
      cache: true,
      parallel: "auto",
@@ -384,12 +384,12 @@ module Polars
      storage_options: nil,
      low_memory: false
    )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_parquet(
-
+        source,
        n_rows: n_rows,
        cache: cache,
        parallel: parallel,
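As with the CSV scan, `scan_parquet` now normalises a path-like `source` up front before handing it to `LazyFrame._scan_parquet`. A sketch (placeholder path and columns); pushdown means only the needed columns and row groups are decoded:

```ruby
require "polars-df"

lf = Polars.scan_parquet("data.parquet")
df = lf.filter(Polars.col("id") > 100).select(["id", "value"]).collect
```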
@@ -406,7 +406,7 @@ module Polars
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
-    # @param
+    # @param source [String]
    #   Path to a file.
    # @param infer_schema_length [Integer]
    #   Infer the schema length from the first `infer_schema_length` rows.
@@ -426,7 +426,7 @@ module Polars
    #
    # @return [LazyFrame]
    def scan_ndjson(
-
+      source,
      infer_schema_length: 100,
      batch_size: 1024,
      n_rows: nil,
@@ -435,12 +435,12 @@ module Polars
      row_count_name: nil,
      row_count_offset: 0
    )
-      if Utils.pathlike?(
-
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_ndjson(
-
+        source,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
@@ -453,7 +453,7 @@ module Polars

    # Read into a DataFrame from Apache Avro format.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -462,17 +462,17 @@ module Polars
    #   Stop reading from Apache Avro file after reading ``n_rows``.
    #
    # @return [DataFrame]
-    def read_avro(
-      if Utils.pathlike?(
-
+    def read_avro(source, columns: nil, n_rows: nil)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      DataFrame._read_avro(
+      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
    end

    # Read into a DataFrame from Arrow IPC (Feather v2) file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
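The full `read_avro` signature is visible in this hunk, so a call maps directly onto it (the Avro path and column names are placeholders):

```ruby
require "polars-df"

# Signature from the hunk: read_avro(source, columns: nil, n_rows: nil)
df = Polars.read_avro("events.avro", columns: ["id", "ts"], n_rows: 1_000)
```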
@@ -495,7 +495,7 @@ module Polars
    #
    # @return [DataFrame]
    def read_ipc(
-
+      source,
      columns: nil,
      n_rows: nil,
      memory_map: true,
@@ -505,7 +505,7 @@ module Polars
      rechunk: true
    )
      storage_options ||= {}
-      _prepare_file_arg(
+      _prepare_file_arg(source, **storage_options) do |data|
        DataFrame._read_ipc(
          data,
          columns: columns,
|
@@ -520,8 +520,8 @@ module Polars
|
|
520
520
|
|
521
521
|
# Read into a DataFrame from a parquet file.
|
522
522
|
#
|
523
|
-
# @param
|
524
|
-
# Path to a file
|
523
|
+
# @param source [Object]
|
524
|
+
# Path to a file or a file-like object.
|
525
525
|
# @param columns [Object]
|
526
526
|
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
527
527
|
# of column names.
|
@@ -539,6 +539,12 @@ module Polars
    #   Offset to start the row_count column (only use if the name is set).
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
    #
    # @return [DataFrame]
    #
@@ -548,16 +554,18 @@ module Polars
    # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
    # an expensive operation.
    def read_parquet(
-
+      source,
      columns: nil,
      n_rows: nil,
      storage_options: nil,
      parallel: "auto",
      row_count_name: nil,
      row_count_offset: 0,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
    )
-      _prepare_file_arg(
+      _prepare_file_arg(source) do |data|
        DataFrame._read_parquet(
          data,
          columns: columns,
@@ -565,49 +573,51 @@ module Polars
          parallel: parallel,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
-          low_memory: low_memory
+          low_memory: low_memory,
+          use_statistics: use_statistics,
+          rechunk: rechunk
        )
      end
    end

    # Read into a DataFrame from a JSON file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
-    def read_json(
-      DataFrame._read_json(
+    def read_json(source)
+      DataFrame._read_json(source)
    end

    # Read into a DataFrame from a newline delimited JSON file.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
-    def read_ndjson(
-      DataFrame._read_ndjson(
+    def read_ndjson(source)
+      DataFrame._read_ndjson(source)
    end

    # Read a SQL query into a DataFrame.
    #
-    # @param
+    # @param query [Object]
    #   ActiveRecord::Relation or ActiveRecord::Result.
    #
    # @return [DataFrame]
-    def
+    def read_database(query)
      if !defined?(ActiveRecord)
        raise Error, "Active Record not available"
      end

      result =
-        if
-
-        elsif
-
-        elsif
-          ActiveRecord::Base.connection.select_all(
+        if query.is_a?(ActiveRecord::Result)
+          query
+        elsif query.is_a?(ActiveRecord::Relation)
+          query.connection.select_all(query.to_sql)
+        elsif query.is_a?(String)
+          ActiveRecord::Base.connection.select_all(query)
        else
          raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
        end
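The two new keywords thread through `read_parquet` into `DataFrame._read_parquet`. A sketch of how a caller might use them (the path is a placeholder; `rechunk: false` follows the benchmarking advice in the docstring above):

```ruby
require "polars-df"

df = Polars.read_parquet(
  "data.parquet",
  use_statistics: true, # skip pages via parquet statistics (the new default)
  rechunk: false        # skip the final defragmentation pass
)
```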
@@ -617,6 +627,7 @@ module Polars
      end
      DataFrame.new(data)
    end
+    alias_method :read_sql, :read_database

    # def read_excel
    # end
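With the branches filled in and the new alias, `read_database` accepts a raw SQL string, a relation, or an already-executed result. A sketch assuming a configured ActiveRecord connection and a `User` model (both hypothetical here):

```ruby
require "polars-df"
require "active_record"

# Raw SQL string, executed via ActiveRecord::Base.connection.
df = Polars.read_database("SELECT id, name FROM users")

# An ActiveRecord::Relation is converted with to_sql on its own connection.
df = Polars.read_database(User.where(active: true))

# read_sql is now an alias for read_database.
df = Polars.read_sql("SELECT COUNT(*) AS n FROM users")
```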
@@ -628,7 +639,7 @@ module Polars
    #   file chunks. After that work will only be done
    #   if `next_batches` is called.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
@@ -712,7 +723,7 @@ module Polars
    #   )
    #   reader.next_batches(5)
    def read_csv_batched(
-
+      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
@@ -752,7 +763,7 @@ module Polars
      end

      BatchedCsvReader.new(
-
+        source,
        has_header: has_header,
        columns: columns || projection,
        sep: sep,
@@ -781,30 +792,30 @@ module Polars

    # Get a schema of the IPC file without reading data.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
-    def read_ipc_schema(
-      if Utils.pathlike?(
-
+    def read_ipc_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      _ipc_schema(
+      _ipc_schema(source)
    end

    # Get a schema of the Parquet file without reading data.
    #
-    # @param
+    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
-    def read_parquet_schema(
-      if Utils.pathlike?(
-
+    def read_parquet_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
      end

-      _parquet_schema(
+      _parquet_schema(source)
    end

    private
data/lib/polars/lazy_frame.rb
CHANGED
@@ -80,7 +80,8 @@ module Polars
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true
    )
      _from_rbldf(
        RbLazyFrame.new_from_parquet(
@@ -90,7 +91,8 @@ module Polars
          parallel,
          rechunk,
          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory
+          low_memory,
+          use_statistics
        )
      )
    end
@@ -107,7 +109,7 @@ module Polars
      memory_map: true
    )
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end

      _from_rbldf(
@@ -157,7 +159,7 @@ module Polars
    # @return [LazyFrame]
    def self.read_json(file)
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end

      Utils.wrap_ldf(RbLazyFrame.read_json(file))
@@ -264,7 +266,7 @@ module Polars
    # @return [nil]
    def write_json(file)
      if Utils.pathlike?(file)
-        file = Utils.
+        file = Utils.normalise_filepath(file)
      end
      _ldf.write_json(file)
      nil
@@ -473,6 +475,96 @@ module Polars
      Utils.wrap_df(ldf.collect)
    end

+    # Persists a LazyFrame at the provided path.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    #   Choose "snappy" for more backwards compatibility guarantees
+    #   when you deal with older parquet readers.
+    # @param compression_level [Integer]
+    #   The level of compression to use. Higher compression means smaller files on
+    #   disk.
+    #
+    #   - "gzip" : min-level: 0, max-level: 10.
+    #   - "brotli" : min-level: 0, max-level: 11.
+    #   - "zstd" : min-level: 1, max-level: 22.
+    # @param statistics [Boolean]
+    #   Write statistics to the parquet headers. This requires extra compute.
+    # @param row_group_size [Integer]
+    #   Size of the row groups in number of rows.
+    #   If `nil` (default), the chunks of the `DataFrame` are
+    #   used. Writing in smaller chunks may reduce memory pressure and improve
+    #   writing speeds.
+    # @param data_pagesize_limit [Integer]
+    #   Size limit of individual data pages.
+    #   If not set defaults to 1024 * 1024 bytes
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_parquet("out.parquet")
+    def sink_parquet(
+      path,
+      compression: "zstd",
+      compression_level: nil,
+      statistics: false,
+      row_group_size: nil,
+      data_pagesize_limit: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      no_optimization: false,
+      slice_pushdown: true
+    )
+      if no_optimization
+        predicate_pushdown = false
+        projection_pushdown = false
+        slice_pushdown = false
+      end
+
+      lf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        false,
+        true
+      )
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
    # Collect a small number of rows for debugging purposes.
    #
    # Fetch is like a {#collect} operation, but it overwrites the number of rows
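The diff's own `@example` is the core usage; a slightly fuller sketch of the streaming write (paths and the column name are placeholders):

```ruby
require "polars-df"

# Filter a larger-than-RAM CSV and stream the result to parquet
# without ever materializing the full DataFrame.
Polars.scan_csv("big.csv")
      .filter(Polars.col("amount") > 0)
      .sink_parquet("big.parquet", compression: "zstd", statistics: true)
```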
@@ -2192,6 +2284,10 @@ module Polars
    #   Name to give to the `variable` column. Defaults to "variable"
    # @param value_name [String]
    #   Name to give to the `value` column. Defaults to "value"
+    # @param streamable [Boolean]
+    #   Allow this node to run in the streaming engine.
+    #   If this runs in streaming, the output of the melt operation
+    #   will not have a stable ordering.
    #
    # @return [LazyFrame]
    #
@@ -2218,7 +2314,7 @@ module Polars
    # # │ y   ┆ c        ┆ 4     │
    # # │ z   ┆ c        ┆ 6     │
    # # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
      if value_vars.is_a?(String)
        value_vars = [value_vars]
      end
@@ -2232,7 +2328,7 @@ module Polars
        id_vars = []
      end
      _from_rbldf(
-        _ldf.melt(id_vars, value_vars, value_name, variable_name)
+        _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
      )
    end

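End to end, the new keyword reaches the native `melt`. A sketch of opting out of streaming when row order matters (data is made up for illustration):

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"a" => ["x", "y"], "b" => [1, 3], "c" => [2, 4]}).lazy

# streamable: true is the default; pass false for a stable output ordering.
df = lf.melt(id_vars: "a", value_vars: ["b", "c"], streamable: false).collect
```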
|
@@ -657,7 +657,7 @@ module Polars
|
|
657
657
|
# Default is ascending.
|
658
658
|
#
|
659
659
|
# @return [Expr]
|
660
|
-
def
|
660
|
+
def arg_sort_by(exprs, reverse: false)
|
661
661
|
if !exprs.is_a?(Array)
|
662
662
|
exprs = [exprs]
|
663
663
|
end
|
@@ -665,8 +665,9 @@ module Polars
        reverse = [reverse] * exprs.length
      end
      exprs = Utils.selection_to_rbexpr_list(exprs)
-      Utils.wrap_expr(RbExpr.
+      Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
    end
+    alias_method :argsort_by, :arg_sort_by

    # Create polars `Duration` from distinct time components.
    #
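So `arg_sort_by` becomes the primary name, with `argsort_by` kept as an alias. A sketch (data is made up for illustration):

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 3, 2], "b" => [9, 7, 8]})

# Row indices that would sort the frame by "a" descending.
df.select(Polars.arg_sort_by("a", reverse: true))
```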
data/lib/polars/list_expr.rb
CHANGED
@@ -426,7 +426,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 1]
    # #   [null, -8, -1]
    # # ]
    def diff(n: 1, null_behavior: "ignore")
@@ -447,7 +447,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 3]
    # #   [null, 10, 2]
    # # ]
    def shift(periods = 1)
data/lib/polars/list_name_space.rb
CHANGED
@@ -185,7 +185,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 1]
    # #   [null, -8, -1]
    # # ]
    def diff(n: 1, null_behavior: "ignore")
@@ -206,7 +206,7 @@ module Polars
    # # shape: (2,)
    # # Series: 'a' [list[i64]]
    # # [
-    # #   [null, 1,
+    # #   [null, 1, … 3]
    # #   [null, 10, 2]
    # # ]
    def shift(periods = 1)
data/lib/polars/series.rb
CHANGED
@@ -3531,6 +3531,13 @@ module Polars
      ListNameSpace.new(self)
    end

+    # Create an object namespace of all binary related methods.
+    #
+    # @return [BinaryNameSpace]
+    def bin
+      BinaryNameSpace.new(self)
+    end
+
    # Create an object namespace of all categorical related methods.
    #
    # @return [CatNameSpace]
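The namespace itself lives in the new `binary_name_space.rb` (+66 lines, not expanded on this page), so its method list is not shown here. Assuming it mirrors the Python Polars `.bin` namespace, a call might look like the following; `contains` is an assumption, not confirmed by this diff:

```ruby
require "polars-df"

s = Polars::Series.new("data", ["\x00\x01".b, "\xff".b])

# Hypothetical: byte-substring check via the new binary namespace.
s.bin.contains("\x01".b)
```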
@@ -3795,7 +3802,8 @@ module Polars
      UInt32 => RbSeries.method(:new_opt_u32),
      UInt64 => RbSeries.method(:new_opt_u64),
      Boolean => RbSeries.method(:new_opt_bool),
-      Utf8 => RbSeries.method(:new_str)
+      Utf8 => RbSeries.method(:new_str),
+      Binary => RbSeries.method(:new_binary)
    }

    SYM_TYPE_TO_CONSTRUCTOR = {
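With `Binary => RbSeries.method(:new_binary)` registered, a series of raw bytes can be built when the dtype is given explicitly. A sketch (assumes the `Binary` dtype constant introduced in this release):

```ruby
require "polars-df"

# Build a Series of raw byte strings.
s = Polars::Series.new("bytes", ["\x01\x02".b, "\x03".b], dtype: Polars::Binary)
```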
data/lib/polars/string_expr.rb
CHANGED
data/lib/polars/utils.rb
CHANGED
@@ -93,8 +93,12 @@ module Polars
      Polars.lit(value)
    end

-    def self.
-      File.expand_path(path)
+    def self.normalise_filepath(path, check_not_directory: true)
+      path = File.expand_path(path)
+      if check_not_directory && File.exist?(path) && Dir.exist?(path)
+        raise ArgumentError, "Expected a file path; #{path} is a directory"
+      end
+      path
    end

    # TODO fix
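The renamed helper now rejects directories up front instead of letting a reader fail later. A behaviour sketch (paths are placeholders; the second call assumes `/tmp` exists as a directory):

```ruby
require "polars-df"

Polars::Utils.normalise_filepath("~/data.csv")
# => expanded absolute path, e.g. "/home/user/data.csv"

Polars::Utils.normalise_filepath("/tmp")
# => raises ArgumentError: Expected a file path; /tmp is a directory
```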
@@ -216,5 +220,9 @@ module Polars
        val.is_a?(Array) && _is_iterable_of(val, String)
      end
    end
+
+    def self.local_file?(file)
+      Dir.glob(file).any?
+    end
  end
end
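`local_file?` is a one-liner over `Dir.glob`, so it treats its argument as a glob pattern and returns true only when something matches on the local filesystem:

```ruby
require "polars-df"

Polars::Utils.local_file?("data/*.parquet")          # => true if any local match
Polars::Utils.local_file?("s3://bucket/key.parquet") # => false (no local match)
```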
data/lib/polars/version.rb
CHANGED
data/lib/polars.rb
CHANGED
@@ -12,6 +12,8 @@ require "stringio"
|
|
12
12
|
# modules
|
13
13
|
require_relative "polars/expr_dispatch"
|
14
14
|
require_relative "polars/batched_csv_reader"
|
15
|
+
require_relative "polars/binary_expr"
|
16
|
+
require_relative "polars/binary_name_space"
|
15
17
|
require_relative "polars/cat_expr"
|
16
18
|
require_relative "polars/cat_name_space"
|
17
19
|
require_relative "polars/convert"
|