polars-df 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/polars/expr.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Expressions that can be used in various contexts.
2
3
  class Expr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def self._from_rbexpr(rbexpr)
6
9
  expr = Expr.allocate
7
10
  expr._rbexpr = rbexpr
@@ -80,6 +83,7 @@ module Polars
80
83
  # def to_physical
81
84
  # end
82
85
 
86
+ #
83
87
  def any
84
88
  wrap_expr(_rbexpr.any)
85
89
  end
@@ -104,7 +108,9 @@ module Polars
104
108
  wrap_expr(_rbexpr._alias(name))
105
109
  end
106
110
 
107
- # TODO support symbols
111
+ # TODO support symbols for exclude
112
+
113
+ #
108
114
  def exclude(columns)
109
115
  if columns.is_a?(String)
110
116
  columns = [columns]
@@ -140,6 +146,7 @@ module Polars
140
146
  # def map_alias
141
147
  # end
142
148
 
149
+ #
143
150
  def is_not
144
151
  wrap_expr(_rbexpr.is_not)
145
152
  end
@@ -293,7 +300,8 @@ module Polars
293
300
  # def take
294
301
  # end
295
302
 
296
- def shift(periods)
303
+ #
304
+ def shift(periods = 1)
297
305
  wrap_expr(_rbexpr.shift(periods))
298
306
  end
299
307
 
@@ -439,6 +447,7 @@ module Polars
439
447
  # def apply
440
448
  # end
441
449
 
450
+ #
442
451
  def flatten
443
452
  wrap_expr(_rbexpr.explode)
444
453
  end
@@ -471,6 +480,7 @@ module Polars
471
480
  # def is_in
472
481
  # end
473
482
 
483
+ #
474
484
  def repeat_by(by)
475
485
  by = Utils.expr_to_lit_or_expr(by, false)
476
486
  wrap_expr(_rbexpr.repeat_by(by._rbexpr))
@@ -482,6 +492,7 @@ module Polars
482
492
  # def _hash
483
493
  # end
484
494
 
495
+ #
485
496
  def reinterpret(signed: false)
486
497
  wrap_expr(_rbexpr.reinterpret(signed))
487
498
  end
@@ -489,6 +500,7 @@ module Polars
489
500
  # def _inspect
490
501
  # end
491
502
 
503
+ #
492
504
  def interpolate
493
505
  wrap_expr(_rbexpr.interpolate)
494
506
  end
@@ -520,6 +532,7 @@ module Polars
520
532
  # def rolling_apply
521
533
  # end
522
534
 
535
+ #
523
536
  def rolling_skew(window_size, bias: true)
524
537
  wrap_expr(_rbexpr.rolling_skew(window_size, bias))
525
538
  end
@@ -650,6 +663,7 @@ module Polars
650
663
  # def extend_constant
651
664
  # end
652
665
 
666
+ #
653
667
  def value_counts(multithreaded: false, sort: false)
654
668
  wrap_expr(_rbexpr.value_counts(multithreaded, sort))
655
669
  end
@@ -672,6 +686,7 @@ module Polars
672
686
  # def set_sorted
673
687
  # end
674
688
 
689
+ #
675
690
  def list
676
691
  wrap_expr(_rbexpr.list)
677
692
  end
data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
1
1
  module Polars
2
2
  module IO
3
- def read_csv(file, has_header: true)
3
+ def read_csv(
4
+ file,
5
+ has_header: true,
6
+ columns: nil,
7
+ new_columns: nil,
8
+ sep: ",",
9
+ comment_char: nil,
10
+ quote_char: '"',
11
+ skip_rows: 0,
12
+ dtypes: nil,
13
+ null_values: nil,
14
+ ignore_errors: false,
15
+ parse_dates: false,
16
+ n_threads: nil,
17
+ infer_schema_length: 100,
18
+ batch_size: 8192,
19
+ n_rows: nil,
20
+ encoding: "utf8",
21
+ low_memory: false,
22
+ rechunk: true,
23
+ storage_options: nil,
24
+ skip_rows_after_header: 0,
25
+ row_count_name: nil,
26
+ row_count_offset: 0,
27
+ sample_size: 1024,
28
+ eol_char: "\n"
29
+ )
30
+ _check_arg_is_1byte("sep", sep, false)
31
+ _check_arg_is_1byte("comment_char", comment_char, false)
32
+ _check_arg_is_1byte("quote_char", quote_char, true)
33
+ _check_arg_is_1byte("eol_char", eol_char, false)
34
+
35
+ projection, columns = Utils.handle_projection_columns(columns)
36
+
37
+ storage_options ||= {}
38
+
39
+ if columns && !has_header
40
+ columns.each do |column|
41
+ if !column.start_with?("column_")
42
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
43
+ end
44
+ end
45
+ end
46
+
47
+ if projection || new_columns
48
+ raise Todo
49
+ end
50
+
51
+ df = nil
4
52
  _prepare_file_arg(file) do |data|
5
- DataFrame._read_csv(data, has_header: has_header)
53
+ df = DataFrame._read_csv(
54
+ data,
55
+ has_header: has_header,
56
+ columns: columns || projection,
57
+ sep: sep,
58
+ comment_char: comment_char,
59
+ quote_char: quote_char,
60
+ skip_rows: skip_rows,
61
+ dtypes: dtypes,
62
+ null_values: null_values,
63
+ ignore_errors: ignore_errors,
64
+ parse_dates: parse_dates,
65
+ n_threads: n_threads,
66
+ infer_schema_length: infer_schema_length,
67
+ batch_size: batch_size,
68
+ n_rows: n_rows,
69
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
70
+ low_memory: low_memory,
71
+ rechunk: rechunk,
72
+ skip_rows_after_header: skip_rows_after_header,
73
+ row_count_name: row_count_name,
74
+ row_count_offset: row_count_offset,
75
+ sample_size: sample_size,
76
+ eol_char: eol_char
77
+ )
78
+ end
79
+
80
+ if new_columns
81
+ Utils._update_columns(df, new_columns)
82
+ else
83
+ df
84
+ end
85
+ end
86
+
87
+ def scan_csv(
88
+ file,
89
+ has_header: true,
90
+ sep: ",",
91
+ comment_char: nil,
92
+ quote_char: '"',
93
+ skip_rows: 0,
94
+ dtypes: nil,
95
+ null_values: nil,
96
+ ignore_errors: false,
97
+ cache: true,
98
+ with_column_names: nil,
99
+ infer_schema_length: 100,
100
+ n_rows: nil,
101
+ encoding: "utf8",
102
+ low_memory: false,
103
+ rechunk: true,
104
+ skip_rows_after_header: 0,
105
+ row_count_name: nil,
106
+ row_count_offset: 0,
107
+ parse_dates: false,
108
+ eol_char: "\n"
109
+ )
110
+ _check_arg_is_1byte("sep", sep, false)
111
+ _check_arg_is_1byte("comment_char", comment_char, false)
112
+ _check_arg_is_1byte("quote_char", quote_char, true)
113
+
114
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
115
+ file = Utils.format_path(file)
116
+ end
117
+
118
+ LazyFrame._scan_csv(
119
+ file,
120
+ has_header: has_header,
121
+ sep: sep,
122
+ comment_char: comment_char,
123
+ quote_char: quote_char,
124
+ skip_rows: skip_rows,
125
+ dtypes: dtypes,
126
+ null_values: null_values,
127
+ ignore_errors: ignore_errors,
128
+ cache: cache,
129
+ with_column_names: with_column_names,
130
+ infer_schema_length: infer_schema_length,
131
+ n_rows: n_rows,
132
+ low_memory: low_memory,
133
+ rechunk: rechunk,
134
+ skip_rows_after_header: skip_rows_after_header,
135
+ encoding: encoding,
136
+ row_count_name: row_count_name,
137
+ row_count_offset: row_count_offset,
138
+ parse_dates: parse_dates,
139
+ eol_char: eol_char,
140
+ )
141
+ end
142
+
143
+ def scan_ipc(
144
+ file,
145
+ n_rows: nil,
146
+ cache: true,
147
+ rechunk: true,
148
+ row_count_name: nil,
149
+ row_count_offset: 0,
150
+ storage_options: nil,
151
+ memory_map: true
152
+ )
153
+ LazyFrame._scan_ipc(
154
+ file,
155
+ n_rows: n_rows,
156
+ cache: cache,
157
+ rechunk: rechunk,
158
+ row_count_name: row_count_name,
159
+ row_count_offset: row_count_offset,
160
+ storage_options: storage_options,
161
+ memory_map: memory_map
162
+ )
163
+ end
164
+
165
+ def scan_parquet(
166
+ file,
167
+ n_rows: nil,
168
+ cache: true,
169
+ parallel: "auto",
170
+ rechunk: true,
171
+ row_count_name: nil,
172
+ row_count_offset: 0,
173
+ storage_options: nil,
174
+ low_memory: false
175
+ )
176
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
177
+ file = Utils.format_path(file)
178
+ end
179
+
180
+ LazyFrame._scan_parquet(
181
+ file,
182
+ n_rows: n_rows,
183
+ cache: cache,
184
+ parallel: parallel,
185
+ rechunk: rechunk,
186
+ row_count_name: row_count_name,
187
+ row_count_offset: row_count_offset,
188
+ storage_options: storage_options,
189
+ low_memory: low_memory
190
+ )
191
+ end
192
+
193
+ def scan_ndjson(
194
+ file,
195
+ infer_schema_length: 100,
196
+ batch_size: 1024,
197
+ n_rows: nil,
198
+ low_memory: false,
199
+ rechunk: true,
200
+ row_count_name: nil,
201
+ row_count_offset: 0
202
+ )
203
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
204
+ file = Utils.format_path(file)
205
+ end
206
+
207
+ LazyFrame._scan_ndjson(
208
+ file,
209
+ infer_schema_length: infer_schema_length,
210
+ batch_size: batch_size,
211
+ n_rows: n_rows,
212
+ low_memory: low_memory,
213
+ rechunk: rechunk,
214
+ row_count_name: row_count_name,
215
+ row_count_offset: row_count_offset,
216
+ )
217
+ end
218
+
219
+ # def read_avro
220
+ # end
221
+
222
+ def read_ipc(
223
+ file,
224
+ columns: nil,
225
+ n_rows: nil,
226
+ memory_map: true,
227
+ storage_options: nil,
228
+ row_count_name: nil,
229
+ row_count_offset: 0,
230
+ rechunk: true
231
+ )
232
+ storage_options ||= {}
233
+ _prepare_file_arg(file, **storage_options) do |data|
234
+ DataFrame._read_ipc(
235
+ data,
236
+ columns: columns,
237
+ n_rows: n_rows,
238
+ row_count_name: row_count_name,
239
+ row_count_offset: row_count_offset,
240
+ rechunk: rechunk,
241
+ memory_map: memory_map
242
+ )
6
243
  end
7
244
  end
8
245
 
@@ -20,6 +257,96 @@ module Polars
20
257
  DataFrame._read_ndjson(file)
21
258
  end
22
259
 
260
+ # def read_sql
261
+ # end
262
+
263
+ # def read_excel
264
+ # end
265
+
266
+ def read_csv_batched(
267
+ file,
268
+ has_header: true,
269
+ columns: nil,
270
+ new_columns: nil,
271
+ sep: ",",
272
+ comment_char: nil,
273
+ quote_char: '"',
274
+ skip_rows: 0,
275
+ dtypes: nil,
276
+ null_values: nil,
277
+ ignore_errors: false,
278
+ parse_dates: false,
279
+ n_threads: nil,
280
+ infer_schema_length: 100,
281
+ batch_size: 50_000,
282
+ n_rows: nil,
283
+ encoding: "utf8",
284
+ low_memory: false,
285
+ rechunk: true,
286
+ skip_rows_after_header: 0,
287
+ row_count_name: nil,
288
+ row_count_offset: 0,
289
+ sample_size: 1024,
290
+ eol_char: "\n"
291
+ )
292
+ projection, columns = Utils.handle_projection_columns(columns)
293
+
294
+ if columns && !has_header
295
+ columns.each do |column|
296
+ if !column.start_with?("column_")
297
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
298
+ end
299
+ end
300
+ end
301
+
302
+ if projection || new_columns
303
+ raise Todo
304
+ end
305
+
306
+ BatchedCsvReader.new(
307
+ file,
308
+ has_header: has_header,
309
+ columns: columns || projection,
310
+ sep: sep,
311
+ comment_char: comment_char,
312
+ quote_char: quote_char,
313
+ skip_rows: skip_rows,
314
+ dtypes: dtypes,
315
+ null_values: null_values,
316
+ ignore_errors: ignore_errors,
317
+ parse_dates: parse_dates,
318
+ n_threads: n_threads,
319
+ infer_schema_length: infer_schema_length,
320
+ batch_size: batch_size,
321
+ n_rows: n_rows,
322
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
323
+ low_memory: low_memory,
324
+ rechunk: rechunk,
325
+ skip_rows_after_header: skip_rows_after_header,
326
+ row_count_name: row_count_name,
327
+ row_count_offset: row_count_offset,
328
+ sample_size: sample_size,
329
+ eol_char: eol_char,
330
+ new_columns: new_columns
331
+ )
332
+ end
333
+
334
+ def read_ipc_schema(file)
335
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
336
+ file = Utils.format_path(file)
337
+ end
338
+
339
+ _ipc_schema(file)
340
+ end
341
+
342
+ def read_parquet_schema(file)
343
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
344
+ file = Utils.format_path(file)
345
+ end
346
+
347
+ _parquet_schema(file)
348
+ end
349
+
23
350
  private
24
351
 
25
352
  def _prepare_file_arg(file)
@@ -35,5 +362,18 @@ module Polars
35
362
 
36
363
  yield file
37
364
  end
365
+
366
+ def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
367
+ if arg.is_a?(String)
368
+ arg_byte_length = arg.bytesize
369
+ if can_be_empty
370
+ if arg_byte_length > 1
371
+ raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
372
+ end
373
+ elsif arg_byte_length != 1
374
+ raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
375
+ end
376
+ end
377
+ end
38
378
  end
39
379
  end
@@ -1,13 +1,157 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # def self.read_json
153
+ # end
154
+
11
155
  # def columns
12
156
  # end
13
157
 
@@ -53,6 +197,7 @@ module Polars
53
197
  # def profile
54
198
  # end
55
199
 
200
+ #
56
201
  def collect(
57
202
  type_coercion: true,
58
203
  predicate_pushdown: true,
@@ -90,6 +235,7 @@ module Polars
90
235
  # def fetch
91
236
  # end
92
237
 
238
+ #
93
239
  def lazy
94
240
  self
95
241
  end
@@ -100,6 +246,7 @@ module Polars
100
246
  # def cleared
101
247
  # end
102
248
 
249
+ #
103
250
  def filter(predicate)
104
251
  _from_rbldf(
105
252
  _ldf.filter(
@@ -128,6 +275,7 @@ module Polars
128
275
  # def join_asof
129
276
  # end
130
277
 
278
+ #
131
279
  def join(
132
280
  other,
133
281
  left_on: nil,
@@ -202,6 +350,7 @@ module Polars
202
350
  # def with_context
203
351
  # end
204
352
 
353
+ #
205
354
  def with_column(column)
206
355
  with_columns([column])
207
356
  end
@@ -209,6 +358,7 @@ module Polars
209
358
  # def drop
210
359
  # end
211
360
 
361
+ #
212
362
  def rename(mapping)
213
363
  existing = mapping.keys
214
364
  _new = mapping.values
@@ -251,6 +401,7 @@ module Polars
251
401
  # def fill_null
252
402
  # end
253
403
 
404
+ #
254
405
  def fill_nan(fill_value)
255
406
  if !fill_value.is_a?(Expr)
256
407
  fill_value = Utils.lit(fill_value)
@@ -282,8 +433,11 @@ module Polars
282
433
  # def quantile
283
434
  # end
284
435
 
285
- # def explode
286
- # end
436
+ #
437
+ def explode(columns)
438
+ columns = Utils.selection_to_rbexpr_list(columns)
439
+ _from_rbldf(_ldf.explode(columns))
440
+ end
287
441
 
288
442
  # def unique
289
443
  # end