RubyGems - polars-df - Versions diffs - 0.16.0 → 0.17.1 - Mend

polars-df 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

checksums.yaml +4 -4
data/CHANGELOG.md +18 -0
data/Cargo.lock +225 -238
data/LICENSE.txt +1 -1
data/README.md +4 -4
data/ext/polars/Cargo.toml +10 -7
data/ext/polars/src/conversion/any_value.rs +2 -1
data/ext/polars/src/conversion/mod.rs +22 -0
data/ext/polars/src/dataframe/general.rs +5 -5
data/ext/polars/src/dataframe/io.rs +8 -14
data/ext/polars/src/expr/general.rs +4 -0
data/ext/polars/src/functions/io.rs +2 -2
data/ext/polars/src/functions/lazy.rs +15 -0
data/ext/polars/src/interop/numo/mod.rs +1 -0
data/ext/polars/src/interop/numo/numo_rs.rs +52 -0
data/ext/polars/src/interop/numo/to_numo_series.rs +69 -48
data/ext/polars/src/lazyframe/general.rs +25 -22
data/ext/polars/src/lib.rs +9 -4
data/ext/polars/src/map/mod.rs +1 -1
data/ext/polars/src/series/export.rs +1 -0
data/ext/polars/src/series/import.rs +2 -2
data/ext/polars/src/series/scatter.rs +1 -1
data/lib/polars/data_frame.rb +321 -23
data/lib/polars/data_types.rb +4 -0
data/lib/polars/expr.rb +31 -0
data/lib/polars/functions/eager.rb +145 -16
data/lib/polars/io/database.rb +17 -0
data/lib/polars/lazy_frame.rb +74 -9
data/lib/polars/schema.rb +29 -0
data/lib/polars/series.rb +31 -24
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +1 -0
metadata +4 -2

data/lib/polars/data_frame.rb CHANGED

@@ -495,6 +495,45 @@ module Polars
       _df.arrow_c_stream
     end
+    # Get an ordered mapping of column names to their data type.
+    #
+    # @return [Schema]
+    #
+    # @note
+    #   This method is included to facilitate writing code that is generic for both
+    #   DataFrame and LazyFrame.
+    #
+    # @example Determine the schema.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6.0, 7.0, 8.0],
+    #       "ham" => ["a", "b", "c"]
+    #     }
+    #   )
+    #   df.collect_schema
+    #   # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
+    #
+    # @example Access various properties of the schema using the `Schema` object.
+    #   schema = df.collect_schema
+    #   schema["bar"]
+    #   # => Polars::Float64
+    #
+    # @example
+    #   schema.names
+    #   # => ["foo", "bar", "ham"]
+    #
+    # @example
+    #   schema.dtypes
+    #   # => [Polars::Int64, Polars::Float64, Polars::String]
+    #
+    # @example
+    #   schema.length
+    #   # => 3
+    def collect_schema
+      Schema.new(columns.zip(dtypes), check_dtypes: false)
+    end
     # Return the dataframe as a scalar.
     #
     # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -604,10 +643,6 @@ module Polars
     #
     # @param file [String]
     #   File path to which the result should be written.
-    # @param pretty [Boolean]
-    #   Pretty serialize json.
-    # @param row_oriented [Boolean]
-    #   Write to row oriented json. This is slower, but more common.
     #
     # @return [nil]
     #
@@ -619,16 +654,8 @@ module Polars
     #     }
     #   )
     #   df.write_json
-    #   # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
-    #
-    # @example
-    #   df.write_json(row_oriented: true)
     #   # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
-    def write_json(
-      file = nil,
-      pretty: false,
-      row_oriented: false
-    )
+    def write_json(file = nil)
       if Utils.pathlike?(file)
         file = Utils.normalize_filepath(file)
       end
@@ -636,7 +663,7 @@ module Polars
       if file.nil? || to_string_io
         buf = StringIO.new
         buf.set_encoding(Encoding::BINARY)
-        _df.write_json(buf, pretty, row_oriented)
+        _df.write_json(buf)
         json_bytes = buf.string
         json_str = json_bytes.force_encoding(Encoding::UTF_8)
@@ -646,7 +673,7 @@ module Polars
           return json_str
         end
       else
-        _df.write_json(file, pretty, row_oriented)
+        _df.write_json(file)
       end
       nil
     end
@@ -973,6 +1000,139 @@ module Polars
       )
     end
+    # Write the data in a Polars DataFrame to a database.
+    #
+    # @param table_name [String]
+    #   Schema-qualified name of the table to create or append to in the target
+    #   SQL database.
+    # @param connection [Object]
+    #   An existing Active Record connection against the target database.
+    # @param if_table_exists ['append', 'replace', 'fail']
+    #   The insert mode:
+    #
+    #   * 'replace' will create a new database table, overwriting an existing one.
+    #   * 'append' will append to an existing table.
+    #   * 'fail' will fail if table already exists.
+    #
+    # @return [Integer]
+    #
+    # @note
+    #   This functionality is experimental. It may be changed at any point without it being considered a breaking change.
+    def write_database(table_name, connection = nil, if_table_exists: "fail")
+      if !defined?(ActiveRecord)
+        raise Error, "Active Record not available"
+      elsif ActiveRecord::VERSION::MAJOR < 7
+        raise Error, "Requires Active Record 7+"
+      end
+      valid_write_modes = ["append", "replace", "fail"]
+      if !valid_write_modes.include?(if_table_exists)
+        msg = "write_database `if_table_exists` must be one of #{valid_write_modes.inspect}, got #{if_table_exists.inspect}"
+        raise ArgumentError, msg
+      end
+      with_connection(connection) do |connection|
+        table_exists = connection.table_exists?(table_name)
+        if table_exists && if_table_exists == "fail"
+          raise ArgumentError, "Table already exists"
+        end
+        create_table = !table_exists || if_table_exists == "replace"
+        maybe_transaction(connection, create_table) do
+          if create_table
+            mysql = connection.adapter_name.match?(/mysql|trilogy/i)
+            force = if_table_exists == "replace"
+            connection.create_table(table_name, id: false, force: force) do |t|
+              schema.each do |c, dtype|
+                options = {}
+                column_type =
+                  case dtype
+                  when Binary
+                    :binary
+                  when Boolean
+                    :boolean
+                  when Date
+                    :date
+                  when Datetime
+                    :datetime
+                  when Decimal
+                    if mysql
+                      options[:precision] = dtype.precision || 65
+                      options[:scale] = dtype.scale || 30
+                    end
+                    :decimal
+                  when Float32
+                    options[:limit] = 24
+                    :float
+                  when Float64
+                    options[:limit] = 53
+                    :float
+                  when Int8
+                    options[:limit] = 1
+                    :integer
+                  when Int16
+                    options[:limit] = 2
+                    :integer
+                  when Int32
+                    options[:limit] = 4
+                    :integer
+                  when Int64
+                    options[:limit] = 8
+                    :integer
+                  when UInt8
+                    if mysql
+                      options[:limit] = 1
+                      options[:unsigned] = true
+                    else
+                      options[:limit] = 2
+                    end
+                    :integer
+                  when UInt16
+                    if mysql
+                      options[:limit] = 2
+                      options[:unsigned] = true
+                    else
+                      options[:limit] = 4
+                    end
+                    :integer
+                  when UInt32
+                    if mysql
+                      options[:limit] = 4
+                      options[:unsigned] = true
+                    else
+                      options[:limit] = 8
+                    end
+                    :integer
+                  when UInt64
+                    if mysql
+                      options[:limit] = 8
+                      options[:unsigned] = true
+                      :integer
+                    else
+                      options[:precision] = 20
+                      options[:scale] = 0
+                      :decimal
+                    end
+                  when String
+                    :text
+                  when Time
+                    :time
+                  else
+                    raise ArgumentError, "column type not supported yet: #{dtype}"
+                  end
+                t.column c, column_type, **options
+              end
+            end
+          end
+          quoted_table = connection.quote_table_name(table_name)
+          quoted_columns = columns.map { |c| connection.quote_column_name(c) }
+          rows = cast({Polars::UInt64 => Polars::String}).rows(named: false).map { |row| "(#{row.map { |v| connection.quote(v) }.join(", ")})" }
+          connection.exec_update("INSERT INTO #{quoted_table} (#{quoted_columns.join(", ")}) VALUES #{rows.join(", ")}")
+        end
+      end
+    end
     # Write DataFrame as delta table.
     #
     # @param target [Object]
@@ -2294,6 +2454,14 @@ module Polars
     #   keys are within this distance. If an asof join is done on columns of dtype
     #   "Date", "Datetime", "Duration" or "Time" you use the following string
     #   language:
+    # @param allow_exact_matches [Boolean]
+    #   Whether exact matches are valid join predicates.
+    #     - If true, allow matching with the same `on` value (i.e. less-than-or-equal-to / greater-than-or-equal-to).
+    #     - If false, don't match the same `on` value (i.e., strictly less-than / strictly greater-than).
+    # @param check_sortedness [Boolean]
+    #   Check the sortedness of the asof keys. If the keys are not sorted Polars
+    #   will error, or in case of 'by' argument raise a warning. This might become
+    #   a hard error in the future.
     #
     #    - 1ns   (1 nanosecond)
     #    - 1us   (1 microsecond)
@@ -2375,7 +2543,9 @@ module Polars
       tolerance: nil,
       allow_parallel: true,
       force_parallel: false,
-      coalesce: true
+      coalesce: true,
+      allow_exact_matches: true,
+      check_sortedness: true
     )
       lazy
         .join_asof(
@@ -2391,7 +2561,9 @@ module Polars
           tolerance: tolerance,
           allow_parallel: allow_parallel,
           force_parallel: force_parallel,
-          coalesce: coalesce
+          coalesce: coalesce,
+          allow_exact_matches: allow_exact_matches,
+          check_sortedness: check_sortedness
         )
         .collect(no_optimization: true)
     end
@@ -2424,6 +2596,24 @@ module Polars
     #     - true: -> Always coalesce join columns.
     #     - false: -> Never coalesce join columns.
     #   Note that joining on any other expressions than `col` will turn off coalescing.
+    # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
+    #   Which DataFrame row order to preserve, if any.
+    #   Do not rely on any observed ordering without explicitly
+    #   setting this parameter, as your code may break in a future release.
+    #   Not specifying any ordering can improve performance
+    #   Supported for inner, left, right and full joins
+    #
+    #   * *none*
+    #       No specific ordering is desired. The ordering might differ across
+    #       Polars versions or even between different runs.
+    #   * *left*
+    #       Preserves the order of the left DataFrame.
+    #   * *right*
+    #       Preserves the order of the right DataFrame.
+    #   * *left_right*
+    #       First preserves the order of the left DataFrame, then the right.
+    #   * *right_left*
+    #       First preserves the order of the right DataFrame, then the left.
     #
     # @return [DataFrame]
     #
@@ -2506,7 +2696,8 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8.0 ┆ c   │
     #   # └─────┴─────┴─────┘
-    def join(other,
+    def join(
+      other,
       left_on: nil,
       right_on: nil,
       on: nil,
@@ -2514,7 +2705,8 @@ module Polars
       suffix: "_right",
       validate: "m:m",
       join_nulls: false,
-      coalesce: nil
+      coalesce: nil,
+      maintain_order: nil
     )
       lazy
         .join(
@@ -2526,7 +2718,8 @@ module Polars
           suffix: suffix,
           validate: validate,
           join_nulls: join_nulls,
-          coalesce: coalesce
+          coalesce: coalesce,
+          maintain_order: maintain_order
         )
         .collect(no_optimization: true)
     end
@@ -4865,6 +5058,90 @@ module Polars
       iter_rows(named: named, buffer_size: buffer_size, &block)
     end
+    # Returns an iterator over the columns of this DataFrame.
+    #
+    # @return [Object]
+    #
+    # @note
+    #   Consider whether you can use `all` instead.
+    #   If you can, it will be more efficient.
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 3, 5],
+    #       "b" => [2, 4, 6]
+    #     }
+    #   )
+    #   df.iter_columns.map { |s| s.name }
+    #   # => ["a", "b"]
+    #
+    # @example If you're using this to modify a dataframe's columns, e.g.
+    #   # Do NOT do this
+    #   Polars::DataFrame.new(df.iter_columns.map { |column| column * 2 })
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌─────┬─────┐
+    #   # │ a   ┆ b   │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ i64 │
+    #   # ╞═════╪═════╡
+    #   # │ 2   ┆ 4   │
+    #   # │ 6   ┆ 8   │
+    #   # │ 10  ┆ 12  │
+    #   # └─────┴─────┘
+    #
+    # @example then consider whether you can use `all` instead:
+    #   df.select(Polars.all * 2)
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌─────┬─────┐
+    #   # │ a   ┆ b   │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ i64 │
+    #   # ╞═════╪═════╡
+    #   # │ 2   ┆ 4   │
+    #   # │ 6   ┆ 8   │
+    #   # │ 10  ┆ 12  │
+    #   # └─────┴─────┘
+    def iter_columns
+      return to_enum(:iter_columns) unless block_given?
+      _df.get_columns.each do |s|
+        yield Utils.wrap_s(s)
+      end
+    end
+    # Returns a non-copying iterator of slices over the underlying DataFrame.
+    #
+    # @param n_rows [Integer]
+    #   Determines the number of rows contained in each DataFrame slice.
+    #
+    # @return [Object]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => 0...17_500,
+    #       "b" => Date.new(2023, 1, 1),
+    #       "c" => "klmnoopqrstuvwxyz"
+    #     },
+    #     schema_overrides: {"a" => Polars::Int32}
+    #   )
+    #   df.iter_slices.map.with_index do |frame, idx|
+    #     "#{frame.class.name}:[#{idx}]:#{frame.length}"
+    #   end
+    #   # => ["Polars::DataFrame:[0]:10000", "Polars::DataFrame:[1]:7500"]
+    def iter_slices(n_rows: 10_000)
+      return to_enum(:iter_slices, n_rows: n_rows) unless block_given?
+      offset = 0
+      while offset < height
+        yield slice(offset, n_rows)
+        offset += n_rows
+      end
+    end
     # Shrink DataFrame memory usage.
     #
     # Shrinks to fit the exact capacity needed to hold the data.
@@ -5101,12 +5378,17 @@ module Polars
       lazy.merge_sorted(other.lazy, key).collect(_eager: true)
     end
-    # Indicate that one or multiple columns are sorted.
+    # Flag a column as sorted.
+    #
+    # This can speed up future operations.
+    #
+    # @note
+    #   This can lead to incorrect results if the data is NOT sorted! Use with care!
     #
     # @param column [Object]
-    #   Columns that are sorted
+    #   Column that is sorted.
     # @param descending [Boolean]
-    #   Whether the columns are sorted in descending order.
+    #   Whether the column is sorted in descending order.
     #
     # @return [DataFrame]
     def set_sorted(
@@ -5538,5 +5820,21 @@ module Polars
       end
       other
     end
+    def with_connection(connection, &block)
+      if !connection.nil?
+        yield connection
+      else
+        ActiveRecord::Base.connection_pool.with_connection(&block)
+      end
+    end
+    def maybe_transaction(connection, create_table, &block)
+      if create_table && connection.adapter_name.match?(/postg|sqlite/i) && connection.open_transactions == 0
+        connection.transaction(&block)
+      else
+        yield
+      end
+    end
   end
 end

data/lib/polars/data_types.rb CHANGED

@@ -167,6 +167,10 @@ module Polars
   class Int64 < SignedIntegerType
   end
+  # 128-bit signed integer type.
+  class Int128 < SignedIntegerType
+  end
   # 8-bit unsigned integer type.
   class UInt8 < UnsignedIntegerType
   end

data/lib/polars/expr.rb CHANGED

@@ -3994,6 +3994,37 @@ module Polars
       _from_rbexpr(_rbexpr.interpolate(method))
     end
+    # Fill null values using interpolation based on another column.
+    #
+    # @param by [Expr] Column to interpolate values based on.
+    #
+    # @return [Expr]
+    #
+    # @example Fill null values using linear interpolation.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, nil, nil, 3],
+    #       "b" => [1, 2, 7, 8]
+    #     }
+    #   )
+    #   df.with_columns(a_interpolated: Polars.col("a").interpolate_by("b"))
+    #   # =>
+    #   # shape: (4, 3)
+    #   # ┌──────┬─────┬────────────────┐
+    #   # │ a    ┆ b   ┆ a_interpolated │
+    #   # │ ---  ┆ --- ┆ ---            │
+    #   # │ i64  ┆ i64 ┆ f64            │
+    #   # ╞══════╪═════╪════════════════╡
+    #   # │ 1    ┆ 1   ┆ 1.0            │
+    #   # │ null ┆ 2   ┆ 1.285714       │
+    #   # │ null ┆ 7   ┆ 2.714286       │
+    #   # │ 3    ┆ 8   ┆ 3.0            │
+    #   # └──────┴─────┴────────────────┘
+    def interpolate_by(by)
+      by = Utils.parse_into_expression(by)
+      _from_rbexpr(_rbexpr.interpolate_by(by))
+    end
     # Apply a rolling min based on another column.
     #
     # @param by [String]