polars-df 0.17.0-x86_64-linux → 0.18.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Ruby Polars
2
2
 
3
- :fire: Blazingly fast DataFrames for Ruby, powered by [Polars](https://github.com/pola-rs/polars)
3
+ 🔥 Blazingly fast DataFrames for Ruby, powered by [Polars](https://github.com/pola-rs/polars)
4
4
 
5
5
  [![Build Status](https://github.com/ankane/ruby-polars/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/ruby-polars/actions)
6
6
 
@@ -448,7 +448,7 @@ df.group_by("c").plot("a", "b", stacked: true)
448
448
 
449
449
  ## History
450
450
 
451
- View the [changelog](CHANGELOG.md)
451
+ View the [changelog](https://github.com/ankane/ruby-polars/blob/master/CHANGELOG.md)
452
452
 
453
453
  ## Contributing
454
454
 
Binary file
Binary file
Binary file
@@ -495,6 +495,45 @@ module Polars
495
495
  _df.arrow_c_stream
496
496
  end
497
497
 
498
+ # Get an ordered mapping of column names to their data type.
499
+ #
500
+ # @return [Schema]
501
+ #
502
+ # @note
503
+ # This method is included to facilitate writing code that is generic for both
504
+ # DataFrame and LazyFrame.
505
+ #
506
+ # @example Determine the schema.
507
+ # df = Polars::DataFrame.new(
508
+ # {
509
+ # "foo" => [1, 2, 3],
510
+ # "bar" => [6.0, 7.0, 8.0],
511
+ # "ham" => ["a", "b", "c"]
512
+ # }
513
+ # )
514
+ # df.collect_schema
515
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
516
+ #
517
+ # @example Access various properties of the schema using the `Schema` object.
518
+ # schema = df.collect_schema
519
+ # schema["bar"]
520
+ # # => Polars::Float64
521
+ #
522
+ # @example
523
+ # schema.names
524
+ # # => ["foo", "bar", "ham"]
525
+ #
526
+ # @example
527
+ # schema.dtypes
528
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
529
+ #
530
+ # @example
531
+ # schema.length
532
+ # # => 3
533
+ def collect_schema
534
+ Schema.new(columns.zip(dtypes), check_dtypes: false)
535
+ end
536
+
498
537
  # Return the dataframe as a scalar.
499
538
  #
500
539
  # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -961,6 +1000,139 @@ module Polars
961
1000
  )
962
1001
  end
963
1002
 
1003
+ # Write the data in a Polars DataFrame to a database.
1004
+ #
1005
+ # @param table_name [String]
1006
+ # Schema-qualified name of the table to create or append to in the target
1007
+ # SQL database.
1008
+ # @param connection [Object]
1009
+ # An existing Active Record connection against the target database.
1010
+ # @param if_table_exists ['append', 'replace', 'fail']
1011
+ # The insert mode:
1012
+ #
1013
+ # * 'replace' will create a new database table, overwriting an existing one.
1014
+ # * 'append' will append to an existing table.
1015
+ # * 'fail' will fail if table already exists.
1016
+ #
1017
+ # @return [Integer]
1018
+ #
1019
+ # @note
1020
+ # This functionality is experimental. It may be changed at any point without it being considered a breaking change.
1021
+ def write_database(table_name, connection = nil, if_table_exists: "fail")
1022
+ if !defined?(ActiveRecord)
1023
+ raise Error, "Active Record not available"
1024
+ elsif ActiveRecord::VERSION::MAJOR < 7
1025
+ raise Error, "Requires Active Record 7+"
1026
+ end
1027
+
1028
+ valid_write_modes = ["append", "replace", "fail"]
1029
+ if !valid_write_modes.include?(if_table_exists)
1030
+ msg = "write_database `if_table_exists` must be one of #{valid_write_modes.inspect}, got #{if_table_exists.inspect}"
1031
+ raise ArgumentError, msg
1032
+ end
1033
+
1034
+ with_connection(connection) do |connection|
1035
+ table_exists = connection.table_exists?(table_name)
1036
+ if table_exists && if_table_exists == "fail"
1037
+ raise ArgumentError, "Table already exists"
1038
+ end
1039
+
1040
+ create_table = !table_exists || if_table_exists == "replace"
1041
+ maybe_transaction(connection, create_table) do
1042
+ if create_table
1043
+ mysql = connection.adapter_name.match?(/mysql|trilogy/i)
1044
+ force = if_table_exists == "replace"
1045
+ connection.create_table(table_name, id: false, force: force) do |t|
1046
+ schema.each do |c, dtype|
1047
+ options = {}
1048
+ column_type =
1049
+ case dtype
1050
+ when Binary
1051
+ :binary
1052
+ when Boolean
1053
+ :boolean
1054
+ when Date
1055
+ :date
1056
+ when Datetime
1057
+ :datetime
1058
+ when Decimal
1059
+ if mysql
1060
+ options[:precision] = dtype.precision || 65
1061
+ options[:scale] = dtype.scale || 30
1062
+ end
1063
+ :decimal
1064
+ when Float32
1065
+ options[:limit] = 24
1066
+ :float
1067
+ when Float64
1068
+ options[:limit] = 53
1069
+ :float
1070
+ when Int8
1071
+ options[:limit] = 1
1072
+ :integer
1073
+ when Int16
1074
+ options[:limit] = 2
1075
+ :integer
1076
+ when Int32
1077
+ options[:limit] = 4
1078
+ :integer
1079
+ when Int64
1080
+ options[:limit] = 8
1081
+ :integer
1082
+ when UInt8
1083
+ if mysql
1084
+ options[:limit] = 1
1085
+ options[:unsigned] = true
1086
+ else
1087
+ options[:limit] = 2
1088
+ end
1089
+ :integer
1090
+ when UInt16
1091
+ if mysql
1092
+ options[:limit] = 2
1093
+ options[:unsigned] = true
1094
+ else
1095
+ options[:limit] = 4
1096
+ end
1097
+ :integer
1098
+ when UInt32
1099
+ if mysql
1100
+ options[:limit] = 4
1101
+ options[:unsigned] = true
1102
+ else
1103
+ options[:limit] = 8
1104
+ end
1105
+ :integer
1106
+ when UInt64
1107
+ if mysql
1108
+ options[:limit] = 8
1109
+ options[:unsigned] = true
1110
+ :integer
1111
+ else
1112
+ options[:precision] = 20
1113
+ options[:scale] = 0
1114
+ :decimal
1115
+ end
1116
+ when String
1117
+ :text
1118
+ when Time
1119
+ :time
1120
+ else
1121
+ raise ArgumentError, "column type not supported yet: #{dtype}"
1122
+ end
1123
+ t.column c, column_type, **options
1124
+ end
1125
+ end
1126
+ end
1127
+
1128
+ quoted_table = connection.quote_table_name(table_name)
1129
+ quoted_columns = columns.map { |c| connection.quote_column_name(c) }
1130
+ rows = cast({Polars::UInt64 => Polars::String}).rows(named: false).map { |row| "(#{row.map { |v| connection.quote(v) }.join(", ")})" }
1131
+ connection.exec_update("INSERT INTO #{quoted_table} (#{quoted_columns.join(", ")}) VALUES #{rows.join(", ")}")
1132
+ end
1133
+ end
1134
+ end
1135
+
964
1136
  # Write DataFrame as delta table.
965
1137
  #
966
1138
  # @param target [Object]
@@ -2424,6 +2596,24 @@ module Polars
2424
2596
  # - true: -> Always coalesce join columns.
2425
2597
  # - false: -> Never coalesce join columns.
2426
2598
  # Note that joining on any other expressions than `col` will turn off coalescing.
2599
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
2600
+ # Which DataFrame row order to preserve, if any.
2601
+ # Do not rely on any observed ordering without explicitly
2602
+ # setting this parameter, as your code may break in a future release.
2603
+ # Not specifying any ordering can improve performance.
2604
+ # Supported for inner, left, right and full joins.
2605
+ #
2606
+ # * *none*
2607
+ # No specific ordering is desired. The ordering might differ across
2608
+ # Polars versions or even between different runs.
2609
+ # * *left*
2610
+ # Preserves the order of the left DataFrame.
2611
+ # * *right*
2612
+ # Preserves the order of the right DataFrame.
2613
+ # * *left_right*
2614
+ # First preserves the order of the left DataFrame, then the right.
2615
+ # * *right_left*
2616
+ # First preserves the order of the right DataFrame, then the left.
2427
2617
  #
2428
2618
  # @return [DataFrame]
2429
2619
  #
@@ -2506,7 +2696,8 @@ module Polars
2506
2696
  # # ╞═════╪═════╪═════╡
2507
2697
  # # │ 3 ┆ 8.0 ┆ c │
2508
2698
  # # └─────┴─────┴─────┘
2509
- def join(other,
2699
+ def join(
2700
+ other,
2510
2701
  left_on: nil,
2511
2702
  right_on: nil,
2512
2703
  on: nil,
@@ -2514,7 +2705,8 @@ module Polars
2514
2705
  suffix: "_right",
2515
2706
  validate: "m:m",
2516
2707
  join_nulls: false,
2517
- coalesce: nil
2708
+ coalesce: nil,
2709
+ maintain_order: nil
2518
2710
  )
2519
2711
  lazy
2520
2712
  .join(
@@ -2526,7 +2718,8 @@ module Polars
2526
2718
  suffix: suffix,
2527
2719
  validate: validate,
2528
2720
  join_nulls: join_nulls,
2529
- coalesce: coalesce
2721
+ coalesce: coalesce,
2722
+ maintain_order: maintain_order
2530
2723
  )
2531
2724
  .collect(no_optimization: true)
2532
2725
  end
@@ -4865,6 +5058,90 @@ module Polars
4865
5058
  iter_rows(named: named, buffer_size: buffer_size, &block)
4866
5059
  end
4867
5060
 
5061
+ # Returns an iterator over the columns of this DataFrame.
5062
+ #
5063
+ # @return [Object]
5064
+ #
5065
+ # @note
5066
+ # Consider whether you can use `all` instead.
5067
+ # If you can, it will be more efficient.
5068
+ #
5069
+ # @example
5070
+ # df = Polars::DataFrame.new(
5071
+ # {
5072
+ # "a" => [1, 3, 5],
5073
+ # "b" => [2, 4, 6]
5074
+ # }
5075
+ # )
5076
+ # df.iter_columns.map { |s| s.name }
5077
+ # # => ["a", "b"]
5078
+ #
5079
+ # @example If you're using this to modify a dataframe's columns, e.g.
5080
+ # # Do NOT do this
5081
+ # Polars::DataFrame.new(df.iter_columns.map { |column| column * 2 })
5082
+ # # =>
5083
+ # # shape: (3, 2)
5084
+ # # ┌─────┬─────┐
5085
+ # # │ a ┆ b │
5086
+ # # │ --- ┆ --- │
5087
+ # # │ i64 ┆ i64 │
5088
+ # # ╞═════╪═════╡
5089
+ # # │ 2 ┆ 4 │
5090
+ # # │ 6 ┆ 8 │
5091
+ # # │ 10 ┆ 12 │
5092
+ # # └─────┴─────┘
5093
+ #
5094
+ # @example then consider whether you can use `all` instead:
5095
+ # df.select(Polars.all * 2)
5096
+ # # =>
5097
+ # # shape: (3, 2)
5098
+ # # ┌─────┬─────┐
5099
+ # # │ a ┆ b │
5100
+ # # │ --- ┆ --- │
5101
+ # # │ i64 ┆ i64 │
5102
+ # # ╞═════╪═════╡
5103
+ # # │ 2 ┆ 4 │
5104
+ # # │ 6 ┆ 8 │
5105
+ # # │ 10 ┆ 12 │
5106
+ # # └─────┴─────┘
5107
+ def iter_columns
5108
+ return to_enum(:iter_columns) unless block_given?
5109
+
5110
+ _df.get_columns.each do |s|
5111
+ yield Utils.wrap_s(s)
5112
+ end
5113
+ end
5114
+
5115
+ # Returns a non-copying iterator of slices over the underlying DataFrame.
5116
+ #
5117
+ # @param n_rows [Integer]
5118
+ # Determines the number of rows contained in each DataFrame slice.
5119
+ #
5120
+ # @return [Object]
5121
+ #
5122
+ # @example
5123
+ # df = Polars::DataFrame.new(
5124
+ # {
5125
+ # "a" => 0...17_500,
5126
+ # "b" => Date.new(2023, 1, 1),
5127
+ # "c" => "klmnoopqrstuvwxyz"
5128
+ # },
5129
+ # schema_overrides: {"a" => Polars::Int32}
5130
+ # )
5131
+ # df.iter_slices.map.with_index do |frame, idx|
5132
+ # "#{frame.class.name}:[#{idx}]:#{frame.length}"
5133
+ # end
5134
+ # # => ["Polars::DataFrame:[0]:10000", "Polars::DataFrame:[1]:7500"]
5135
+ def iter_slices(n_rows: 10_000)
5136
+ return to_enum(:iter_slices, n_rows: n_rows) unless block_given?
5137
+
5138
+ offset = 0
5139
+ while offset < height
5140
+ yield slice(offset, n_rows)
5141
+ offset += n_rows
5142
+ end
5143
+ end
5144
+
4868
5145
  # Shrink DataFrame memory usage.
4869
5146
  #
4870
5147
  # Shrinks to fit the exact capacity needed to hold the data.
@@ -5101,12 +5378,17 @@ module Polars
5101
5378
  lazy.merge_sorted(other.lazy, key).collect(_eager: true)
5102
5379
  end
5103
5380
 
5104
- # Indicate that one or multiple columns are sorted.
5381
+ # Flag a column as sorted.
5382
+ #
5383
+ # This can speed up future operations.
5384
+ #
5385
+ # @note
5386
+ # This can lead to incorrect results if the data is NOT sorted! Use with care!
5105
5387
  #
5106
5388
  # @param column [Object]
5107
- # Columns that are sorted
5389
+ # Column that is sorted.
5108
5390
  # @param descending [Boolean]
5109
- # Whether the columns are sorted in descending order.
5391
+ # Whether the column is sorted in descending order.
5110
5392
  #
5111
5393
  # @return [DataFrame]
5112
5394
  def set_sorted(
@@ -5538,5 +5820,21 @@ module Polars
5538
5820
  end
5539
5821
  other
5540
5822
  end
5823
+
5824
+ def with_connection(connection, &block)
5825
+ if !connection.nil?
5826
+ yield connection
5827
+ else
5828
+ ActiveRecord::Base.connection_pool.with_connection(&block)
5829
+ end
5830
+ end
5831
+
5832
+ def maybe_transaction(connection, create_table, &block)
5833
+ if create_table && connection.adapter_name.match?(/postg|sqlite/i) && connection.open_transactions == 0
5834
+ connection.transaction(&block)
5835
+ else
5836
+ yield
5837
+ end
5838
+ end
5541
5839
  end
5542
5840
  end
data/lib/polars/expr.rb CHANGED
@@ -1176,8 +1176,8 @@ module Polars
1176
1176
  # # │ 1.0 │
1177
1177
  # # │ 1.2 │
1178
1178
  # # └─────┘
1179
- def round(decimals = 0)
1180
- _from_rbexpr(_rbexpr.round(decimals))
1179
+ def round(decimals = 0, mode: "half_to_even")
1180
+ _from_rbexpr(_rbexpr.round(decimals, mode))
1181
1181
  end
1182
1182
 
1183
1183
  # Compute the dot/inner product between two Expressions.
@@ -1867,7 +1867,7 @@ module Polars
1867
1867
  # # │ 2 ┆ 6 │
1868
1868
  # # └─────┴─────┘
1869
1869
  def forward_fill(limit: nil)
1870
- _from_rbexpr(_rbexpr.forward_fill(limit))
1870
+ fill_null(strategy: "forward", limit: limit)
1871
1871
  end
1872
1872
 
1873
1873
  # Fill missing values with the next to be seen values.
@@ -1897,7 +1897,7 @@ module Polars
1897
1897
  # # │ null ┆ 6 │
1898
1898
  # # └──────┴─────┘
1899
1899
  def backward_fill(limit: nil)
1900
- _from_rbexpr(_rbexpr.backward_fill(limit))
1900
+ fill_null(strategy: "backward", limit: limit)
1901
1901
  end
1902
1902
 
1903
1903
  # Reverse the selection.
@@ -3712,6 +3712,8 @@ module Polars
3712
3712
  #
3713
3713
  # @param other [Object]
3714
3714
  # Series or sequence of primitive type.
3715
+ # @param nulls_equal [Boolean]
3716
+ # If true, treat null as a distinct value. Null values will not propagate.
3715
3717
  #
3716
3718
  # @return [Expr]
3717
3719
  #
@@ -3719,19 +3721,19 @@ module Polars
3719
3721
  # df = Polars::DataFrame.new(
3720
3722
  # {"sets" => [[1, 2, 3], [1, 2], [9, 10]], "optional_members" => [1, 2, 3]}
3721
3723
  # )
3722
- # df.select([Polars.col("optional_members").is_in("sets").alias("contains")])
3724
+ # df.with_columns(contains: Polars.col("optional_members").is_in("sets"))
3723
3725
  # # =>
3724
- # # shape: (3, 1)
3725
- # # ┌──────────┐
3726
- # # │ contains │
3727
- # # │ --- │
3728
- # # │ bool │
3729
- # # ╞══════════╡
3730
- # # │ true │
3731
- # # │ true │
3732
- # # │ false │
3733
- # # └──────────┘
3734
- def is_in(other)
3726
+ # # shape: (3, 3)
3727
+ # # ┌───────────┬──────────────────┬──────────┐
3728
+ # # │ sets ┆ optional_members ┆ contains │
3729
+ # # │ --- ┆ --- ┆ --- │
3730
+ # # │ list[i64] ┆ i64 ┆ bool │
3731
+ # # ╞═══════════╪══════════════════╪══════════╡
3732
+ # # │ [1, 2, 3] ┆ 1 ┆ true │
3733
+ # # │ [1, 2] ┆ 2 ┆ true │
3734
+ # # │ [9, 10] ┆ 3 ┆ false │
3735
+ # # └───────────┴──────────────────┴──────────┘
3736
+ def is_in(other, nulls_equal: false)
3735
3737
  if other.is_a?(::Array)
3736
3738
  if other.length == 0
3737
3739
  other = Polars.lit(nil)._rbexpr
@@ -3741,7 +3743,7 @@ module Polars
3741
3743
  else
3742
3744
  other = Utils.parse_into_expression(other, str_as_lit: false)
3743
3745
  end
3744
- _from_rbexpr(_rbexpr.is_in(other))
3746
+ _from_rbexpr(_rbexpr.is_in(other, nulls_equal))
3745
3747
  end
3746
3748
  alias_method :in?, :is_in
3747
3749
 
@@ -3994,6 +3996,37 @@ module Polars
3994
3996
  _from_rbexpr(_rbexpr.interpolate(method))
3995
3997
  end
3996
3998
 
3999
+ # Fill null values using interpolation based on another column.
4000
+ #
4001
+ # @param by [Expr] Column to interpolate values based on.
4002
+ #
4003
+ # @return [Expr]
4004
+ #
4005
+ # @example Fill null values using linear interpolation.
4006
+ # df = Polars::DataFrame.new(
4007
+ # {
4008
+ # "a" => [1, nil, nil, 3],
4009
+ # "b" => [1, 2, 7, 8]
4010
+ # }
4011
+ # )
4012
+ # df.with_columns(a_interpolated: Polars.col("a").interpolate_by("b"))
4013
+ # # =>
4014
+ # # shape: (4, 3)
4015
+ # # ┌──────┬─────┬────────────────┐
4016
+ # # │ a ┆ b ┆ a_interpolated │
4017
+ # # │ --- ┆ --- ┆ --- │
4018
+ # # │ i64 ┆ i64 ┆ f64 │
4019
+ # # ╞══════╪═════╪════════════════╡
4020
+ # # │ 1 ┆ 1 ┆ 1.0 │
4021
+ # # │ null ┆ 2 ┆ 1.285714 │
4022
+ # # │ null ┆ 7 ┆ 2.714286 │
4023
+ # # │ 3 ┆ 8 ┆ 3.0 │
4024
+ # # └──────┴─────┴────────────────┘
4025
+ def interpolate_by(by)
4026
+ by = Utils.parse_into_expression(by)
4027
+ _from_rbexpr(_rbexpr.interpolate_by(by))
4028
+ end
4029
+
3997
4030
  # Apply a rolling min based on another column.
3998
4031
  #
3999
4032
  # @param by [String]
@@ -5684,6 +5717,11 @@ module Polars
5684
5717
  # Integer size of the rolling window.
5685
5718
  # @param bias [Boolean]
5686
5719
  # If false, the calculations are corrected for statistical bias.
5720
+ # @param min_samples [Integer]
5721
+ # The number of values in the window that should be non-null before computing
5722
+ # a result. If set to `nil` (default), it will be set equal to `window_size`.
5723
+ # @param center [Boolean]
5724
+ # Set the labels at the center of the window.
5687
5725
  #
5688
5726
  # @return [Expr]
5689
5727
  #
@@ -5702,8 +5740,8 @@ module Polars
5702
5740
  # # │ 0.381802 │
5703
5741
  # # │ 0.47033 │
5704
5742
  # # └──────────┘
5705
- def rolling_skew(window_size, bias: true)
5706
- _from_rbexpr(_rbexpr.rolling_skew(window_size, bias))
5743
+ def rolling_skew(window_size, bias: true, min_samples: nil, center: false)
5744
+ _from_rbexpr(_rbexpr.rolling_skew(window_size, bias, min_samples, center))
5707
5745
  end
5708
5746
 
5709
5747
  # Compute absolute values.
@@ -5858,6 +5896,7 @@ module Polars
5858
5896
  # # │ 20 │
5859
5897
  # # └──────┘
5860
5898
  def diff(n: 1, null_behavior: "ignore")
5899
+ n = Utils.parse_into_expression(n)
5861
5900
  _from_rbexpr(_rbexpr.diff(n, null_behavior))
5862
5901
  end
5863
5902