polars-df 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

@@ -46,271 +46,6 @@ module Polars
       df
     end
 
-    # @private
-    def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
-      rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
-      _from_rbdf(rbdf)
-    end
-
-    # @private
-    def self._from_hash(data, schema: nil, schema_overrides: nil)
-      _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
-    end
-
-    # def self._from_records
-    # end
-
-    # def self._from_numo
-    # end
-
-    # no self._from_arrow
-
-    # no self._from_pandas
-
-    # @private
-    def self._read_csv(
-      file,
-      has_header: true,
-      columns: nil,
-      sep: str = ",",
-      comment_char: nil,
-      quote_char: '"',
-      skip_rows: 0,
-      dtypes: nil,
-      null_values: nil,
-      ignore_errors: false,
-      parse_dates: false,
-      n_threads: nil,
-      infer_schema_length: 100,
-      batch_size: 8192,
-      n_rows: nil,
-      encoding: "utf8",
-      low_memory: false,
-      rechunk: true,
-      skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
-      sample_size: 1024,
-      eol_char: "\n",
-      truncate_ragged_lines: false
-    )
-      if Utils.pathlike?(file)
-        path = Utils.normalise_filepath(file)
-      else
-        path = nil
-        # if defined?(StringIO) && file.is_a?(StringIO)
-        #   file = file.string
-        # end
-      end
-
-      dtype_list = nil
-      dtype_slice = nil
-      if !dtypes.nil?
-        if dtypes.is_a?(Hash)
-          dtype_list = []
-          dtypes.each do |k, v|
-            dtype_list << [k, Utils.rb_type_to_dtype(v)]
-          end
-        elsif dtypes.is_a?(::Array)
-          dtype_slice = dtypes
-        else
-          raise ArgumentError, "dtype arg should be list or dict"
-        end
-      end
-
-      processed_null_values = Utils._process_null_values(null_values)
-
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-      if file.is_a?(::String) && file.include?("*")
-        dtypes_dict = nil
-        if !dtype_list.nil?
-          dtypes_dict = dtype_list.to_h
-        end
-        if !dtype_slice.nil?
-          raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
-        end
-        scan = Polars.scan_csv(
-          file,
-          has_header: has_header,
-          sep: sep,
-          comment_char: comment_char,
-          quote_char: quote_char,
-          skip_rows: skip_rows,
-          dtypes: dtypes_dict,
-          null_values: null_values,
-          ignore_errors: ignore_errors,
-          infer_schema_length: infer_schema_length,
-          n_rows: n_rows,
-          low_memory: low_memory,
-          rechunk: rechunk,
-          skip_rows_after_header: skip_rows_after_header,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
-          eol_char: eol_char,
-          truncate_ragged_lines: truncate_ragged_lines
-        )
-        if columns.nil?
-          return _from_rbdf(scan.collect._df)
-        elsif is_str_sequence(columns, allow_str: false)
-          return _from_rbdf(scan.select(columns).collect._df)
-        else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
-        end
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-
-      _from_rbdf(
-        RbDataFrame.read_csv(
-          file,
-          infer_schema_length,
-          batch_size,
-          has_header,
-          ignore_errors,
-          n_rows,
-          skip_rows,
-          projection,
-          sep,
-          rechunk,
-          columns,
-          encoding,
-          n_threads,
-          path,
-          dtype_list,
-          dtype_slice,
-          low_memory,
-          comment_char,
-          quote_char,
-          processed_null_values,
-          parse_dates,
-          skip_rows_after_header,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          sample_size,
-          eol_char,
-          truncate_ragged_lines
-        )
-      )
-    end
-
-    # @private
-    def self._read_parquet(
-      source,
-      columns: nil,
-      n_rows: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
-        scan =
-          Polars.scan_parquet(
-            source,
-            n_rows: n_rows,
-            rechunk: true,
-            parallel: parallel,
-            row_count_name: row_count_name,
-            row_count_offset: row_count_offset,
-            low_memory: low_memory
-          )
-
-        if columns.nil?
-          return self._from_rbdf(scan.collect._df)
-        elsif Utils.is_str_sequence(columns, allow_str: false)
-          return self._from_rbdf(scan.select(columns).collect._df)
-        else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
-        end
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(
-        RbDataFrame.read_parquet(
-          source,
-          columns,
-          projection,
-          n_rows,
-          parallel,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory,
-          use_statistics,
-          rechunk
-        )
-      )
-    end
-
-    # @private
-    def self._read_avro(file, columns: nil, n_rows: nil)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
-    end
-
-    # @private
-    def self._read_ipc(
-      file,
-      columns: nil,
-      n_rows: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
-      rechunk: true,
-      memory_map: true
-    )
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if file.is_a?(::String) && file.include?("*")
-        raise Todo
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(
-        RbDataFrame.read_ipc(
-          file,
-          columns,
-          projection,
-          n_rows,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          memory_map
-        )
-      )
-    end
-
-    # @private
-    def self._read_json(file)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-
-      _from_rbdf(RbDataFrame.read_json(file))
-    end
-
-    # @private
-    def self._read_ndjson(file)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-
-      _from_rbdf(RbDataFrame.read_ndjson(file))
-    end
-
     # Get the shape of the DataFrame.
     #
     # @return [Array]
@@ -419,6 +154,13 @@ module Polars
       _df.dtypes
     end
 
+    # Get flags that are set on the columns of this DataFrame.
+    #
+    # @return [Hash]
+    def flags
+      columns.to_h { |name| [name, self[name].flags] }
+    end
+
     # Get the schema.
     #
     # @return [Hash]
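
Note: the new `flags` method returns a hash of per-column flag hashes built from `Series#flags`. A minimal usage sketch; the exact flag keys ("SORTED_ASC"/"SORTED_DESC") come from the underlying Series and are shown here as an assumption:

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [3.0, 2.0, 1.0]})
    df.flags
    # => {"a" => {"SORTED_ASC" => false, "SORTED_DESC" => false},
    #     "b" => {"SORTED_ASC" => false, "SORTED_DESC" => false}}
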
@@ -845,7 +587,7 @@ module Polars
       row_oriented: false
     )
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
       to_string_io = !file.nil? && file.is_a?(StringIO)
       if file.nil? || to_string_io
@@ -884,7 +626,7 @@ module Polars
     #   # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
     def write_ndjson(file = nil)
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
       to_string_io = !file.nil? && file.is_a?(StringIO)
       if file.nil? || to_string_io
@@ -991,7 +733,7 @@ module Polars
       end
 
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
 
       _df.write_csv(
@@ -1029,7 +771,7 @@ module Polars
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
 
       _df.write_avro(file, compression)
@@ -1050,7 +792,7 @@ module Polars
         file.set_encoding(Encoding::BINARY)
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
 
       if compression.nil?
@@ -1061,6 +803,47 @@ module Polars
       return_bytes ? file.string : nil
     end
 
+    # Write to Arrow IPC record batch stream.
+    #
+    # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
+    #
+    # @param file [Object]
+    #   Path or writable file-like object to which the IPC record batch data will
+    #   be written. If set to `nil`, the output is returned as a StringIO object.
+    # @param compression ['uncompressed', 'lz4', 'zstd']
+    #   Compression method. Defaults to "uncompressed".
+    #
+    # @return [Object]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3, 4, 5],
+    #       "bar" => [6, 7, 8, 9, 10],
+    #       "ham" => ["a", "b", "c", "d", "e"]
+    #     }
+    #   )
+    #   df.write_ipc_stream("new_file.arrow")
+    def write_ipc_stream(
+      file,
+      compression: "uncompressed"
+    )
+      return_bytes = file.nil?
+      if return_bytes
+        file = StringIO.new
+        file.set_encoding(Encoding::BINARY)
+      elsif Utils.pathlike?(file)
+        file = Utils.normalize_filepath(file)
+      end
+
+      if compression.nil?
+        compression = "uncompressed"
+      end
+
+      _df.write_ipc_stream(file, compression)
+      return_bytes ? file.string : nil
+    end
+
     # Write to Apache Parquet file.
     #
     # @param file [String, Pathname, StringIO]
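
Note: per the `return_bytes` branch above, passing `nil` as `file` writes the stream into an in-memory StringIO and returns its binary contents. A minimal sketch:

    df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
    bytes = df.write_ipc_stream(nil)      # Arrow IPC stream as a binary String
    File.binwrite("frame.arrows", bytes)  # persist it however you like
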
@@ -1097,7 +880,7 @@ module Polars
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
 
       _df.write_parquet(
@@ -1773,10 +1556,7 @@ module Polars
     #   # │ 3   ┆ 8   ┆ c   │
     #   # └─────┴─────┴─────┘
     def drop_nulls(subset: nil)
-      if subset.is_a?(::String)
-        subset = [subset]
-      end
-      _from_rbdf(_df.drop_nulls(subset))
+      lazy.drop_nulls(subset: subset).collect(_eager: true)
     end
 
     # Offers a structured way to apply a sequence of user-defined functions (UDFs).
@@ -1838,16 +1618,16 @@ module Polars
     #   df.with_row_index
     #   # =>
     #   # shape: (3, 3)
-    #   # ┌────────┬─────┬─────┐
-    #   # │ row_nr ┆ a   ┆ b   │
-    #   # │ ---    ┆ --- ┆ --- │
-    #   # │ u32    ┆ i64 ┆ i64 │
-    #   # ╞════════╪═════╪═════╡
-    #   # │ 0      ┆ 1   ┆ 2   │
-    #   # │ 1      ┆ 3   ┆ 4   │
-    #   # │ 2      ┆ 5   ┆ 6   │
-    #   # └────────┴─────┴─────┘
-    def with_row_index(name: "row_nr", offset: 0)
+    #   # ┌───────┬─────┬─────┐
+    #   # │ index ┆ a   ┆ b   │
+    #   # │ ---   ┆ --- ┆ --- │
+    #   # │ u32   ┆ i64 ┆ i64 │
+    #   # ╞═══════╪═════╪═════╡
+    #   # │ 0     ┆ 1   ┆ 2   │
+    #   # │ 1     ┆ 3   ┆ 4   │
+    #   # │ 2     ┆ 5   ┆ 6   │
+    #   # └───────┴─────┴─────┘
+    def with_row_index(name: "index", offset: 0)
       _from_rbdf(_df.with_row_index(name, offset))
     end
     alias_method :with_row_count, :with_row_index
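
Note: the default name of the generated row-index column changes from "row_nr" to "index". Callers that relied on the old default can pass the name explicitly:

    df.with_row_index                  # adds a u32 "index" column (new default)
    df.with_row_index(name: "row_nr")  # keeps the 0.10.x column name
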
@@ -2136,16 +1916,16 @@ module Polars
     #   )
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
-    #   # │ time                ┆ time_count ┆ time_agg_list                     │
-    #   # │ ---                 ┆ ---        ┆ ---                               │
-    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
-    #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
-    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
-    #   # └─────────────────────┴────────────┴───────────────────────────────────┘
+    #   # ┌─────────────────────┬────────────┬─────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                   │
+    #   # │ ---                 ┆ ---        ┆ ---                             │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]              │
+    #   # ╞═════════════════════╪════════════╪═════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-… │
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-… │
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-… │
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]           │
+    #   # └─────────────────────┴────────────┴─────────────────────────────────┘
     #
     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
     #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2620,7 +2400,7 @@ module Polars
     #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
     #
     # @example Return a DataFrame by mapping each row to a tuple:
-    #   df.apply { |t| [t[0] * 2, t[1] * 3] }
+    #   df.map_rows { |t| [t[0] * 2, t[1] * 3] }
     #   # =>
     #   # shape: (3, 2)
     #   # ┌──────────┬──────────┐
@@ -2634,7 +2414,7 @@ module Polars
     #   # └──────────┴──────────┘
     #
     # @example Return a Series by mapping each row to a scalar:
-    #   df.apply { |t| t[0] * 2 + t[1] }
+    #   df.map_rows { |t| t[0] * 2 + t[1] }
     #   # =>
     #   # shape: (3, 1)
     #   # ┌───────┐
@@ -2646,14 +2426,15 @@ module Polars
     #   # │ 9     │
     #   # │ 14    │
     #   # └───────┘
-    def apply(return_dtype: nil, inference_size: 256, &f)
-      out, is_df = _df.apply(f, return_dtype, inference_size)
+    def map_rows(return_dtype: nil, inference_size: 256, &f)
+      out, is_df = _df.map_rows(f, return_dtype, inference_size)
       if is_df
         _from_rbdf(out)
       else
         _from_rbdf(Utils.wrap_s(out).to_frame._df)
       end
     end
+    alias_method :apply, :map_rows
 
     # Return a new DataFrame with the column added or replaced.
     #
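
Note: `apply` is renamed to `map_rows`, with the old name kept via `alias_method`, so both spellings below behave identically:

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
    df.map_rows { |t| t[0] * 2 + t[1] }  # new name
    df.apply { |t| t[0] * 2 + t[1] }     # still works through the alias
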
@@ -3774,7 +3555,7 @@ module Polars
     #   # ┌─────────┐
     #   # │ literal │
     #   # │ ---     │
-    #   # │ i64     │
+    #   # │ i32     │
     #   # ╞═════════╡
     #   # │ 0       │
     #   # │ 0       │
@@ -5255,7 +5036,7 @@ module Polars
       elsif data[0].is_a?(Hash)
         column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
+        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
@@ -5289,7 +5070,7 @@ module Polars
         if unpack_nested
           raise Todo
         else
-          rbdf = RbDataFrame.read_rows(
+          rbdf = RbDataFrame.from_rows(
             data,
             infer_schema_length,
             local_schema_override.any? ? local_schema_override : nil
@@ -215,6 +215,7 @@ module Polars
         offset = "0ns"
       end
 
+      every = Utils.parse_as_expression(every, str_as_lit: true)
       Utils.wrap_expr(
         _rbexpr.dt_round(
           Utils._timedelta_to_pl_duration(every),
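
Note: `every` now goes through `Utils.parse_as_expression` with `str_as_lit: true`, so plain duration strings keep working; presumably this also paves the way for expression arguments, as upstream polars allows. A hedged sketch of the public entry point, assuming the usual `dt` namespace on expressions:

    df.select(Polars.col("time").dt.round("1h"))  # round datetimes to the hour
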
@@ -66,6 +66,8 @@ module Polars
       if !out.nil?
         if s.dtype == Date
           return Utils._to_ruby_date(out.to_i)
+        elsif [Datetime, Duration, Time].include?(s.dtype)
+          return out
         else
           return Utils._to_ruby_datetime(out.to_i, s.time_unit)
         end
@@ -93,10 +95,12 @@ module Polars
     #   # => 2001-01-02 00:00:00 UTC
     def mean
       s = Utils.wrap_s(_s)
-      out = s.mean.to_i
+      out = s.mean
       if !out.nil?
         if s.dtype == Date
           return Utils._to_ruby_date(out.to_i)
+        elsif [Datetime, Duration, Time].include?(s.dtype)
+          return out
         else
           return Utils._to_ruby_datetime(out.to_i, s.time_unit)
         end
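
Note: with the added branch, `mean` (and the matching change in `median` above) on Datetime, Duration, and Time series returns the computed value directly instead of truncating it through `to_i` first, so fractional results are no longer floored. A sketch for a Datetime series, values assumed:

    s = Polars::Series.new([DateTime.new(2001, 1, 1), DateTime.new(2001, 1, 3)])
    s.dt.mean  # => 2001-01-02 00:00:00 UTC
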
@@ -32,7 +32,7 @@ module Polars
       @start_by = start_by
     end
 
-    def agg(aggs)
+    def agg(*aggs, **named_aggs)
       @df.lazy
         .group_by_dynamic(
           @time_column,
@@ -45,7 +45,7 @@ module Polars
           by: @by,
           start_by: @start_by
         )
-        .agg(aggs)
+        .agg(*aggs, **named_aggs)
         .collect(no_optimization: true, string_cache: false)
     end
   end
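
Note: `DynamicGroupBy#agg` now forwards both positional expressions and keyword arguments to the lazy `agg`, presumably enabling named aggregations as in the lazy API. A hedged sketch of the two call styles this change suggests:

    df.group_by_dynamic("time", every: "1h").agg(
      Polars.col("value").sum,              # positional expression
      value_mean: Polars.col("value").mean  # named aggregation (assumed to alias the output column)
    )
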
@@ -3,6 +3,10 @@ module Polars
   # Base class for all Polars errors.
   class Error < StandardError; end
 
+  # @private
+  # Exception raised when an operation is not allowed (or possible) against a given object or data structure.
+  class InvalidOperationError < Error; end
+
   # @private
   # Exception raised when an unsupported testing assert is made.
   class InvalidAssert < Error; end
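
Note: the new class lets callers rescue invalid-operation failures specifically while still catching them under the `Polars::Error` base class. A minimal sketch:

    begin
      df.some_rejected_operation  # hypothetical call the engine rejects
    rescue Polars::InvalidOperationError => e
      warn "invalid operation: #{e.message}"
    end
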