polars-df 0.21.0-x86_64-linux-musl → 0.22.0-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +55 -48
  4. data/Cargo.toml +3 -0
  5. data/LICENSE-THIRD-PARTY.txt +23 -49
  6. data/README.md +12 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/3.4/polars.so +0 -0
  10. data/lib/polars/array_expr.rb +382 -3
  11. data/lib/polars/array_name_space.rb +281 -0
  12. data/lib/polars/binary_expr.rb +67 -0
  13. data/lib/polars/binary_name_space.rb +43 -0
  14. data/lib/polars/cat_expr.rb +224 -0
  15. data/lib/polars/cat_name_space.rb +138 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/convert.rb +6 -6
  18. data/lib/polars/data_frame.rb +794 -27
  19. data/lib/polars/data_type_expr.rb +52 -0
  20. data/lib/polars/data_types.rb +26 -5
  21. data/lib/polars/date_time_expr.rb +252 -1
  22. data/lib/polars/date_time_name_space.rb +299 -0
  23. data/lib/polars/expr.rb +1248 -206
  24. data/lib/polars/functions/business.rb +95 -0
  25. data/lib/polars/functions/datatype.rb +21 -0
  26. data/lib/polars/functions/lazy.rb +14 -1
  27. data/lib/polars/io/csv.rb +1 -1
  28. data/lib/polars/io/iceberg.rb +27 -0
  29. data/lib/polars/io/json.rb +4 -4
  30. data/lib/polars/io/ndjson.rb +4 -4
  31. data/lib/polars/io/parquet.rb +32 -7
  32. data/lib/polars/io/scan_options.rb +4 -1
  33. data/lib/polars/lazy_frame.rb +1028 -28
  34. data/lib/polars/list_expr.rb +217 -17
  35. data/lib/polars/list_name_space.rb +231 -22
  36. data/lib/polars/meta_expr.rb +89 -0
  37. data/lib/polars/name_expr.rb +36 -0
  38. data/lib/polars/query_opt_flags.rb +50 -0
  39. data/lib/polars/scan_cast_options.rb +20 -1
  40. data/lib/polars/schema.rb +79 -3
  41. data/lib/polars/selector.rb +72 -0
  42. data/lib/polars/selectors.rb +3 -3
  43. data/lib/polars/series.rb +1053 -54
  44. data/lib/polars/string_expr.rb +436 -32
  45. data/lib/polars/string_name_space.rb +736 -50
  46. data/lib/polars/struct_expr.rb +103 -0
  47. data/lib/polars/struct_name_space.rb +19 -1
  48. data/lib/polars/utils/serde.rb +17 -0
  49. data/lib/polars/utils/various.rb +22 -1
  50. data/lib/polars/utils.rb +5 -1
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +6 -0
  53. metadata +8 -2
@@ -15,11 +15,11 @@ module Polars
15
15
  # The schema of the resulting DataFrame. The schema may be declared in several
16
16
  # ways:
17
17
  #
18
- # * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
18
+ # * As a hash of {name:type} pairs; if type is nil, it will be auto-inferred.
19
19
  # * As an array of column names; in this case types are automatically inferred.
20
- # * As an array of (name,type) pairs; this is equivalent to the dictionary form.
20
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
21
21
  #
22
- # If you supply a list of column names that does not match the names in the
22
+ # If you supply an array of column names that does not match the names in the
23
23
  # underlying data, the names given here will overwrite them. The number
24
24
  # of names given in the schema should match the underlying data dimensions.
25
25
  #
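To make the schema forms described in the hunk above concrete, a minimal sketch (assuming the gem is loaded with `require "polars-df"`; the column names and dtypes are arbitrary):

```ruby
require "polars-df"

data = {"a" => [1, 2, 3], "b" => [4.0, 5.0, 6.0]}

# Hash form: name => dtype; a nil dtype is auto-inferred from the data.
df1 = Polars::DataFrame.new(data, schema: {"a" => Polars::Int64, "b" => nil})

# Array of [name, dtype] pairs: equivalent to the hash form above.
df2 = Polars::DataFrame.new(data, schema: [["a", Polars::Int64], ["b", Polars::Float64]])

p df1.schema
p df2.schema
```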
@@ -72,6 +72,43 @@ module Polars
72
72
  end
73
73
  end
74
74
 
75
+ # Read a serialized DataFrame from a file.
76
+ #
77
+ # @param source [Object]
78
+ # Path to a file or a file-like object (by file-like object, we refer to
79
+ # objects that have a `read` method, such as a file handler or `StringIO`).
80
+ #
81
+ # @return [DataFrame]
82
+ #
83
+ # @note
84
+ # Serialization is not stable across Polars versions: a DataFrame serialized
85
+ # in one Polars version may not be deserializable in another Polars version.
86
+ #
87
+ # @example
88
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4.0, 5.0, 6.0]})
89
+ # bytes = df.serialize
90
+ # Polars::DataFrame.deserialize(StringIO.new(bytes))
91
+ # # =>
92
+ # # shape: (3, 2)
93
+ # # ┌─────┬─────┐
94
+ # # │ a ┆ b │
95
+ # # │ --- ┆ --- │
96
+ # # │ i64 ┆ f64 │
97
+ # # ╞═════╪═════╡
98
+ # # │ 1 ┆ 4.0 │
99
+ # # │ 2 ┆ 5.0 │
100
+ # # │ 3 ┆ 6.0 │
101
+ # # └─────┴─────┘
102
+ def self.deserialize(source)
103
+ if Utils.pathlike?(source)
104
+ source = Utils.normalize_filepath(source)
105
+ end
106
+
107
+ deserializer = RbDataFrame.method(:deserialize_binary)
108
+
109
+ _from_rbdf(deserializer.(source))
110
+ end
111
+
75
112
  # @private
76
113
  def self._from_rbdf(rb_df)
77
114
  df = DataFrame.allocate
@@ -560,9 +597,7 @@ module Polars
560
597
  end
561
598
  end
562
599
 
563
- # Convert every row to a dictionary.
564
- #
565
- # Note that this is slow.
600
+ # Convert every row to a hash.
566
601
  #
567
602
  # @return [Array]
568
603
  #
@@ -572,12 +607,7 @@ module Polars
572
607
  # # =>
573
608
  # # [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
574
609
  def to_hashes
575
- rbdf = _df
576
- names = columns
577
-
578
- height.times.map do |i|
579
- names.zip(rbdf.row_tuple(i)).to_h
580
- end
610
+ rows(named: true)
581
611
  end
582
612
 
583
613
  # Convert DataFrame to a 2D Numo array.
@@ -634,6 +664,44 @@ module Polars
634
664
  Utils.wrap_s(_df.select_at_idx(index))
635
665
  end
636
666
 
667
+ # Serialize this DataFrame to a file or string.
668
+ #
669
+ # @param file [Object]
670
+ # File path or writable file-like object to which the result will be written.
671
+ # If set to `nil` (default), the output is returned as a string instead.
672
+ #
673
+ # @return [Object]
674
+ #
675
+ # @note
676
+ # Serialization is not stable across Polars versions: a DataFrame serialized
677
+ # in one Polars version may not be deserializable in another Polars version.
678
+ #
679
+ # @example
680
+ # df = Polars::DataFrame.new(
681
+ # {
682
+ # "foo" => [1, 2, 3],
683
+ # "bar" => [6, 7, 8]
684
+ # }
685
+ # )
686
+ # bytes = df.serialize
687
+ # Polars::DataFrame.deserialize(StringIO.new(bytes))
688
+ # # =>
689
+ # # shape: (3, 2)
690
+ # # ┌─────┬─────┐
691
+ # # │ foo ┆ bar │
692
+ # # │ --- ┆ --- │
693
+ # # │ i64 ┆ i64 │
694
+ # # ╞═════╪═════╡
695
+ # # │ 1 ┆ 6 │
696
+ # # │ 2 ┆ 7 │
697
+ # # │ 3 ┆ 8 │
698
+ # # └─────┴─────┘
699
+ def serialize(file = nil)
700
+ serializer = _df.method(:serialize_binary)
701
+
702
+ Utils.serialize_polars_object(serializer, file)
703
+ end
704
+
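Beyond the `StringIO` round trip shown in the example above, `serialize` and `deserialize` also accept a plain file path. A minimal sketch, with the filename `frame.bin` chosen arbitrarily; per the note above, the binary format is not stable across Polars versions:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4.0, 5.0, 6.0]})

# Write the binary-serialized frame to disk...
df.serialize("frame.bin")

# ...and read it back with the same polars-df version.
restored = Polars::DataFrame.deserialize("frame.bin")
p restored.shape # => [3, 2]
```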
637
705
  # Serialize to JSON representation.
638
706
  #
639
707
  # @param file [String]
@@ -910,7 +978,7 @@ module Polars
910
978
  #
911
979
  # @param file [Object]
912
980
  # Path or writable file-like object to which the IPC record batch data will
913
- # be written. If set to `None`, the output is returned as a BytesIO object.
981
+ # be written. If set to `nil`, the output is returned in memory instead.
914
982
  # @param compression ['uncompressed', 'lz4', 'zstd']
915
983
  # Compression method. Defaults to "uncompressed".
916
984
  # @param compat_level [Object]
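For context on the `write_ipc` parameters touched in this hunk, a small sketch assuming a local path and `zstd` compression; `Polars.read_ipc` is used only to show the round trip:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"id" => [1, 2, 3], "value" => [0.1, 0.2, 0.3]})

# Write Arrow IPC data to a path; per the docs above, passing nil instead
# of a path returns the encoded output in memory.
df.write_ipc("frame.arrow", compression: "zstd")

p Polars.read_ipc("frame.arrow")
```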
@@ -1148,6 +1216,40 @@ module Polars
1148
1216
  end
1149
1217
  end
1150
1218
 
1219
+ # Write DataFrame to an Iceberg table.
1220
+ #
1221
+ # @note
1222
+ # This functionality is currently considered **unstable**. It may be
1223
+ # changed at any point without it being considered a breaking change.
1224
+ #
1225
+ # @param target [Object]
1226
+ # Name of the table or the Table object representing an Iceberg table.
1227
+ # @param mode ['append', 'overwrite']
1228
+ # How to handle existing data.
1229
+ #
1230
+ # - If 'append', will add new data.
1231
+ # - If 'overwrite', will replace table with new data.
1232
+ #
1233
+ # @return [nil]
1234
+ def write_iceberg(target, mode:)
1235
+ require "iceberg"
1236
+
1237
+ table =
1238
+ if target.is_a?(Iceberg::Table)
1239
+ target
1240
+ else
1241
+ raise Todo
1242
+ end
1243
+
1244
+ data = self
1245
+
1246
+ if mode == "append"
1247
+ table.append(data)
1248
+ else
1249
+ raise Todo
1250
+ end
1251
+ end
1252
+
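A hedged sketch of the new `write_iceberg`. Only an existing `Iceberg::Table` with `mode: "append"` is handled by the body above; `load_events_table` is a hypothetical placeholder, since obtaining a table handle depends on the `iceberg` gem and your catalog:

```ruby
require "polars-df"
require "iceberg"

df = Polars::DataFrame.new({"id" => [1, 2, 3], "event" => ["a", "b", "c"]})

# Hypothetical helper: obtain an Iceberg::Table from your catalog however
# the iceberg gem supports it.
table = load_events_table("analytics.events")

# Only mode: "append" is implemented above; other targets/modes raise Todo.
df.write_iceberg(table, mode: "append")
```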
1151
1253
  # Write DataFrame as delta table.
1152
1254
  #
1153
1255
  # @param target [Object]
@@ -1463,6 +1565,126 @@ module Polars
1463
1565
  lazy.filter(predicate).collect
1464
1566
  end
1465
1567
 
1568
+ # Remove rows, dropping those that match the given predicate expression(s).
1569
+ #
1570
+ # The original order of the remaining rows is preserved.
1571
+ #
1572
+ # Rows where the filter predicate does not evaluate to `true` are retained
1573
+ # (this includes rows where the predicate evaluates as `null`).
1574
+ #
1575
+ # @param predicates [Array]
1576
+ # Expression(s) that evaluate to a boolean Series.
1577
+ # @param constraints [Hash]
1578
+ # Column filters; use `name: value` to filter columns using the supplied
1579
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1580
+ # and is implicitly joined with the other filter conditions using `&`.
1581
+ #
1582
+ # @return [DataFrame]
1583
+ #
1584
+ # @example Remove rows matching a condition:
1585
+ # df = Polars::DataFrame.new(
1586
+ # {
1587
+ # "foo" => [2, 3, nil, 4, 0],
1588
+ # "bar" => [5, 6, nil, nil, 0],
1589
+ # "ham" => ["a", "b", nil, "c", "d"]
1590
+ # }
1591
+ # )
1592
+ # df.remove(Polars.col("bar") >= 5)
1593
+ # # =>
1594
+ # # shape: (3, 3)
1595
+ # # ┌──────┬──────┬──────┐
1596
+ # # │ foo ┆ bar ┆ ham │
1597
+ # # │ --- ┆ --- ┆ --- │
1598
+ # # │ i64 ┆ i64 ┆ str │
1599
+ # # ╞══════╪══════╪══════╡
1600
+ # # │ null ┆ null ┆ null │
1601
+ # # │ 4 ┆ null ┆ c │
1602
+ # # │ 0 ┆ 0 ┆ d │
1603
+ # # └──────┴──────┴──────┘
1604
+ #
1605
+ # @example Discard rows based on multiple conditions, combined with and/or operators:
1606
+ # df.remove(
1607
+ # (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0),
1608
+ # )
1609
+ # # =>
1610
+ # # shape: (2, 3)
1611
+ # # ┌──────┬──────┬──────┐
1612
+ # # │ foo ┆ bar ┆ ham │
1613
+ # # │ --- ┆ --- ┆ --- │
1614
+ # # │ i64 ┆ i64 ┆ str │
1615
+ # # ╞══════╪══════╪══════╡
1616
+ # # │ null ┆ null ┆ null │
1617
+ # # │ 4 ┆ null ┆ c │
1618
+ # # └──────┴──────┴──────┘
1619
+ #
1620
+ # @example
1621
+ # df.remove(
1622
+ # (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0),
1623
+ # )
1624
+ # # =>
1625
+ # # shape: (1, 3)
1626
+ # # ┌──────┬──────┬──────┐
1627
+ # # │ foo ┆ bar ┆ ham │
1628
+ # # │ --- ┆ --- ┆ --- │
1629
+ # # │ i64 ┆ i64 ┆ str │
1630
+ # # ╞══════╪══════╪══════╡
1631
+ # # │ null ┆ null ┆ null │
1632
+ # # └──────┴──────┴──────┘
1633
+ #
1634
+ # @example Provide multiple predicates as positional arguments:
1635
+ # df.remove(
1636
+ # Polars.col("ham").is_not_null,
1637
+ # Polars.col("bar") >= 0
1638
+ # )
1639
+ # # =>
1640
+ # # shape: (2, 3)
1641
+ # # ┌──────┬──────┬──────┐
1642
+ # # │ foo ┆ bar ┆ ham │
1643
+ # # │ --- ┆ --- ┆ --- │
1644
+ # # │ i64 ┆ i64 ┆ str │
1645
+ # # ╞══════╪══════╪══════╡
1646
+ # # │ null ┆ null ┆ null │
1647
+ # # │ 4 ┆ null ┆ c │
1648
+ # # └──────┴──────┴──────┘
1649
+ #
1650
+ # @example Provide constraint(s) as keyword arguments:
1651
+ # df.remove(foo: 0, bar: 0)
1652
+ # # =>
1653
+ # # shape: (4, 3)
1654
+ # # ┌──────┬──────┬──────┐
1655
+ # # │ foo ┆ bar ┆ ham │
1656
+ # # │ --- ┆ --- ┆ --- │
1657
+ # # │ i64 ┆ i64 ┆ str │
1658
+ # # ╞══════╪══════╪══════╡
1659
+ # # │ 2 ┆ 5 ┆ a │
1660
+ # # │ 3 ┆ 6 ┆ b │
1661
+ # # │ null ┆ null ┆ null │
1662
+ # # │ 4 ┆ null ┆ c │
1663
+ # # └──────┴──────┴──────┘
1664
+ #
1665
+ # @example Remove rows by comparing two columns against each other:
1666
+ # df.remove(
1667
+ # Polars.col("foo").ne_missing(Polars.col("bar"))
1668
+ # )
1669
+ # # =>
1670
+ # # shape: (2, 3)
1671
+ # # ┌──────┬──────┬──────┐
1672
+ # # │ foo ┆ bar ┆ ham │
1673
+ # # │ --- ┆ --- ┆ --- │
1674
+ # # │ i64 ┆ i64 ┆ str │
1675
+ # # ╞══════╪══════╪══════╡
1676
+ # # │ null ┆ null ┆ null │
1677
+ # # │ 0 ┆ 0 ┆ d │
1678
+ # # └──────┴──────┴──────┘
1679
+ def remove(
1680
+ *predicates,
1681
+ **constraints
1682
+ )
1683
+ lazy
1684
+ .remove(*predicates, **constraints)
1685
+ .collect(_eager: true)
1686
+ end
1687
+
1466
1688
  # Summary statistics for a DataFrame.
1467
1689
  #
1468
1690
  # @return [DataFrame]
@@ -1658,6 +1880,223 @@ module Polars
1658
1880
  self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
1659
1881
  end
1660
1882
 
1883
+ # Execute a SQL query against the DataFrame.
1884
+ #
1885
+ # @note
1886
+ # This functionality is considered **unstable**, although it is close to
1887
+ # being considered stable. It may be changed at any point without it being
1888
+ # considered a breaking change.
1889
+ #
1890
+ # @param query [String]
1891
+ # SQL query to execute.
1892
+ # @param table_name [String]
1893
+ # Optionally provide an explicit name for the table that represents the
1894
+ # calling frame (defaults to "self").
1895
+ #
1896
+ # @return [DataFrame]
1897
+ #
1898
+ # @note
1899
+ # * The calling frame is automatically registered as a table in the SQL context
1900
+ # under the name "self". If you want access to the DataFrames and LazyFrames
1901
+ # found in the current globals, use the top-level :meth:`pl.sql <polars.sql>`.
1902
+ # * More control over registration and execution behaviour is available by
1903
+ # using the :class:`SQLContext` object.
1904
+ # * The SQL query executes in lazy mode before being collected and returned
1905
+ # as a DataFrame.
1906
+ #
1907
+ # @example Query the DataFrame using SQL:
1908
+ # df1 = Polars::DataFrame.new(
1909
+ # {
1910
+ # "a" => [1, 2, 3],
1911
+ # "b" => ["zz", "yy", "xx"],
1912
+ # "c" => [Date.new(1999, 12, 31), Date.new(2010, 10, 10), Date.new(2077, 8, 8)]
1913
+ # }
1914
+ # )
1915
+ # df1.sql("SELECT c, b FROM self WHERE a > 1")
1916
+ # # =>
1917
+ # # shape: (2, 2)
1918
+ # # ┌────────────┬─────┐
1919
+ # # │ c ┆ b │
1920
+ # # │ --- ┆ --- │
1921
+ # # │ date ┆ str │
1922
+ # # ╞════════════╪═════╡
1923
+ # # │ 2010-10-10 ┆ yy │
1924
+ # # │ 2077-08-08 ┆ xx │
1925
+ # # └────────────┴─────┘
1926
+ #
1927
+ # @example Apply transformations to a DataFrame using SQL, aliasing "self" to "frame".
1928
+ # df1.sql(
1929
+ # "
1930
+ # SELECT
1931
+ # a,
1932
+ # (a % 2 == 0) AS a_is_even,
1933
+ # CONCAT_WS(':', b, b) AS b_b,
1934
+ # EXTRACT(year FROM c) AS year,
1935
+ # 0::float4 AS \"zero\",
1936
+ # FROM frame
1937
+ # ",
1938
+ # table_name: "frame"
1939
+ # )
1940
+ # # =>
1941
+ # # shape: (3, 5)
1942
+ # # ┌─────┬───────────┬───────┬──────┬──────┐
1943
+ # # │ a ┆ a_is_even ┆ b_b ┆ year ┆ zero │
1944
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1945
+ # # │ i64 ┆ bool ┆ str ┆ i32 ┆ f32 │
1946
+ # # ╞═════╪═══════════╪═══════╪══════╪══════╡
1947
+ # # │ 1 ┆ false ┆ zz:zz ┆ 1999 ┆ 0.0 │
1948
+ # # │ 2 ┆ true ┆ yy:yy ┆ 2010 ┆ 0.0 │
1949
+ # # │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
1950
+ # # └─────┴───────────┴───────┴──────┴──────┘
1951
+ def sql(query, table_name: "self")
1952
+ ctx = SQLContext.new(eager_execution: true)
1953
+ name = table_name || "self"
1954
+ ctx.register(name, self)
1955
+ ctx.execute(query)
1956
+ end
1957
+
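The note above points to `SQLContext` for finer control over table registration. A sketch built only from the calls visible in the `sql` implementation above (`SQLContext.new(eager_execution: true)`, `register`, `execute`); the table names and query are illustrative:

```ruby
require "polars-df"

users = Polars::DataFrame.new({"id" => [1, 2, 3], "name" => ["ann", "bob", "cat"]})
orders = Polars::DataFrame.new({"user_id" => [1, 1, 3], "total" => [9.5, 3.0, 12.0]})

# Register several frames under explicit names, then query across them.
ctx = Polars::SQLContext.new(eager_execution: true)
ctx.register("users", users)
ctx.register("orders", orders)

ctx.execute(
  "SELECT users.name, SUM(orders.total) AS total
   FROM users JOIN orders ON users.id = orders.user_id
   GROUP BY users.name"
)
```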
1958
+ # Return the `k` largest rows.
1959
+ #
1960
+ # Non-null elements are always preferred over null elements, regardless of
1961
+ # the value of `reverse`. The output is not guaranteed to be in any
1962
+ # particular order; call `sort` after this function if you wish the
1963
+ # output to be sorted.
1964
+ #
1965
+ # @param k [Integer]
1966
+ # Number of rows to return.
1967
+ # @param by [Object]
1968
+ # Column(s) used to determine the top rows.
1969
+ # Accepts expression input. Strings are parsed as column names.
1970
+ # @param reverse [Object]
1971
+ # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
1972
+ # largest). This can be specified per column by passing an array of
1973
+ # booleans.
1974
+ #
1975
+ # @return [DataFrame]
1976
+ #
1977
+ # @example Get the rows which contain the 4 largest values in column b.
1978
+ # df = Polars::DataFrame.new(
1979
+ # {
1980
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1981
+ # "b" => [2, 1, 1, 3, 2, 1]
1982
+ # }
1983
+ # )
1984
+ # df.top_k(4, by: "b")
1985
+ # # =>
1986
+ # # shape: (4, 2)
1987
+ # # ┌─────┬─────┐
1988
+ # # │ a ┆ b │
1989
+ # # │ --- ┆ --- │
1990
+ # # │ str ┆ i64 │
1991
+ # # ╞═════╪═════╡
1992
+ # # │ b ┆ 3 │
1993
+ # # │ a ┆ 2 │
1994
+ # # │ b ┆ 2 │
1995
+ # # │ b ┆ 1 │
1996
+ # # └─────┴─────┘
1997
+ #
1998
+ # @example Get the rows which contain the 4 largest values when sorting on column b and a.
1999
+ # df.top_k(4, by: ["b", "a"])
2000
+ # # =>
2001
+ # # shape: (4, 2)
2002
+ # # ┌─────┬─────┐
2003
+ # # │ a ┆ b │
2004
+ # # │ --- ┆ --- │
2005
+ # # │ str ┆ i64 │
2006
+ # # ╞═════╪═════╡
2007
+ # # │ b ┆ 3 │
2008
+ # # │ b ┆ 2 │
2009
+ # # │ a ┆ 2 │
2010
+ # # │ c ┆ 1 │
2011
+ # # └─────┴─────┘
2012
+ def top_k(
2013
+ k,
2014
+ by:,
2015
+ reverse: false
2016
+ )
2017
+ lazy
2018
+ .top_k(k, by: by, reverse: reverse)
2019
+ .collect(
2020
+ # optimizations=QueryOptFlags(
2021
+ # projection_pushdown=False,
2022
+ # predicate_pushdown=False,
2023
+ # comm_subplan_elim=False,
2024
+ # slice_pushdown=True
2025
+ # )
2026
+ )
2027
+ end
2028
+
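Because `top_k` does not guarantee row order, the docs above suggest a follow-up `sort`; for example:

```ruby
require "polars-df"

df = Polars::DataFrame.new(
  {
    "a" => ["a", "b", "a", "b", "b", "c"],
    "b" => [2, 1, 1, 3, 2, 1]
  }
)

# Take the 4 rows with the largest "b", then impose a descending order.
df.top_k(4, by: "b").sort("b", reverse: true)
```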
2029
+ # Return the `k` smallest rows.
2030
+ #
2031
+ # Non-null elements are always preferred over null elements, regardless of
2032
+ # the value of `reverse`. The output is not guaranteed to be in any
2033
+ # particular order; call `sort` after this function if you wish the
2034
+ # output to be sorted.
2035
+ #
2036
+ # @param k [Integer]
2037
+ # Number of rows to return.
2038
+ # @param by [Object]
2039
+ # Column(s) used to determine the bottom rows.
2040
+ # Accepts expression input. Strings are parsed as column names.
2041
+ # @param reverse [Object]
2042
+ # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
2043
+ # smallest). This can be specified per column by passing an array of
2044
+ # booleans.
2045
+ #
2046
+ # @return [DataFrame]
2047
+ #
2048
+ # @example Get the rows which contain the 4 smallest values in column b.
2049
+ # df = Polars::DataFrame.new(
2050
+ # {
2051
+ # "a" => ["a", "b", "a", "b", "b", "c"],
2052
+ # "b" => [2, 1, 1, 3, 2, 1]
2053
+ # }
2054
+ # )
2055
+ # df.bottom_k(4, by: "b")
2056
+ # # =>
2057
+ # # shape: (4, 2)
2058
+ # # ┌─────┬─────┐
2059
+ # # │ a ┆ b │
2060
+ # # │ --- ┆ --- │
2061
+ # # │ str ┆ i64 │
2062
+ # # ╞═════╪═════╡
2063
+ # # │ b ┆ 1 │
2064
+ # # │ a ┆ 1 │
2065
+ # # │ c ┆ 1 │
2066
+ # # │ a ┆ 2 │
2067
+ # # └─────┴─────┘
2068
+ #
2069
+ # @example Get the rows which contain the 4 smallest values when sorting on column a and b.
2070
+ # df.bottom_k(4, by: ["a", "b"])
2071
+ # # =>
2072
+ # # shape: (4, 2)
2073
+ # # ┌─────┬─────┐
2074
+ # # │ a ┆ b │
2075
+ # # │ --- ┆ --- │
2076
+ # # │ str ┆ i64 │
2077
+ # # ╞═════╪═════╡
2078
+ # # │ a ┆ 1 │
2079
+ # # │ a ┆ 2 │
2080
+ # # │ b ┆ 1 │
2081
+ # # │ b ┆ 2 │
2082
+ # # └─────┴─────┘
2083
+ def bottom_k(
2084
+ k,
2085
+ by:,
2086
+ reverse: false
2087
+ )
2088
+ lazy
2089
+ .bottom_k(k, by: by, reverse: reverse)
2090
+ .collect(
2091
+ # optimizations=QueryOptFlags(
2092
+ # projection_pushdown=False,
2093
+ # predicate_pushdown=False,
2094
+ # comm_subplan_elim=False,
2095
+ # slice_pushdown=True,
2096
+ # )
2097
+ )
2098
+ end
2099
+
1661
2100
  # Check if DataFrame is equal to other.
1662
2101
  #
1663
2102
  # @param other [DataFrame]
@@ -1848,10 +2287,59 @@ module Polars
1848
2287
  _from_rbdf(_df.tail(n))
1849
2288
  end
1850
2289
 
1851
- # Return a new DataFrame where the null values are dropped.
2290
+ # Drop all rows that contain one or more NaN values.
2291
+ #
2292
+ # The original order of the remaining rows is preserved.
2293
+ #
2294
+ # @param subset [Object]
2295
+ # Column name(s) for which NaN values are considered; if set to `nil`
2296
+ # (default), use all columns (note that only floating-point columns
2297
+ # can contain NaNs).
2298
+ #
2299
+ # @return [DataFrame]
2300
+ #
2301
+ # @example
2302
+ # df = Polars::DataFrame.new(
2303
+ # {
2304
+ # "foo" => [-20.5, Float::NAN, 80.0],
2305
+ # "bar" => [Float::NAN, 110.0, 25.5],
2306
+ # "ham" => ["xxx", "yyy", nil]
2307
+ # }
2308
+ # )
2309
+ # df.drop_nans
2310
+ # # =>
2311
+ # # shape: (1, 3)
2312
+ # # ┌──────┬──────┬──────┐
2313
+ # # │ foo ┆ bar ┆ ham │
2314
+ # # │ --- ┆ --- ┆ --- │
2315
+ # # │ f64 ┆ f64 ┆ str │
2316
+ # # ╞══════╪══════╪══════╡
2317
+ # # │ 80.0 ┆ 25.5 ┆ null │
2318
+ # # └──────┴──────┴──────┘
2319
+ #
2320
+ # @example
2321
+ # df.drop_nans(subset: ["bar"])
2322
+ # # =>
2323
+ # # shape: (2, 3)
2324
+ # # ┌──────┬───────┬──────┐
2325
+ # # │ foo ┆ bar ┆ ham │
2326
+ # # │ --- ┆ --- ┆ --- │
2327
+ # # │ f64 ┆ f64 ┆ str │
2328
+ # # ╞══════╪═══════╪══════╡
2329
+ # # │ NaN ┆ 110.0 ┆ yyy │
2330
+ # # │ 80.0 ┆ 25.5 ┆ null │
2331
+ # # └──────┴───────┴──────┘
2332
+ def drop_nans(subset: nil)
2333
+ lazy.drop_nans(subset: subset).collect(_eager: true)
2334
+ end
2335
+
2336
+ # Drop all rows that contain one or more null values.
2337
+ #
2338
+ # The original order of the remaining rows is preserved.
1852
2339
  #
1853
2340
  # @param subset [Object]
1854
- # Subset of column(s) on which `drop_nulls` will be applied.
2341
+ # Column name(s) for which null values are considered.
2342
+ # If set to `nil` (default), use all columns.
1855
2343
  #
1856
2344
  # @return [DataFrame]
1857
2345
  #
@@ -1860,20 +2348,32 @@ module Polars
1860
2348
  # {
1861
2349
  # "foo" => [1, 2, 3],
1862
2350
  # "bar" => [6, nil, 8],
1863
- # "ham" => ["a", "b", "c"]
2351
+ # "ham" => ["a", "b", nil]
1864
2352
  # }
1865
2353
  # )
1866
2354
  # df.drop_nulls
1867
2355
  # # =>
1868
- # # shape: (2, 3)
2356
+ # # shape: (1, 3)
1869
2357
  # # ┌─────┬─────┬─────┐
1870
2358
  # # │ foo ┆ bar ┆ ham │
1871
2359
  # # │ --- ┆ --- ┆ --- │
1872
2360
  # # │ i64 ┆ i64 ┆ str │
1873
2361
  # # ╞═════╪═════╪═════╡
1874
2362
  # # │ 1 ┆ 6 ┆ a │
1875
- # # │ 3 ┆ 8 ┆ c │
1876
2363
  # # └─────┴─────┴─────┘
2364
+ #
2365
+ # @example
2366
+ # df.drop_nulls(subset: Polars.cs.integer)
2367
+ # # =>
2368
+ # # shape: (2, 3)
2369
+ # # ┌─────┬─────┬──────┐
2370
+ # # │ foo ┆ bar ┆ ham │
2371
+ # # │ --- ┆ --- ┆ --- │
2372
+ # # │ i64 ┆ i64 ┆ str │
2373
+ # # ╞═════╪═════╪══════╡
2374
+ # # │ 1 ┆ 6 ┆ a │
2375
+ # # │ 3 ┆ 8 ┆ null │
2376
+ # # └─────┴─────┴──────┘
1877
2377
  def drop_nulls(subset: nil)
1878
2378
  lazy.drop_nulls(subset: subset).collect(_eager: true)
1879
2379
  end
@@ -2139,9 +2639,9 @@ module Polars
2139
2639
  # @param every
2140
2640
  # Interval of the window.
2141
2641
  # @param period
2142
- # Length of the window, if None it is equal to 'every'.
2642
+ # Length of the window, if nil it is equal to 'every'.
2143
2643
  # @param offset
2144
- # Offset of the window if None and period is None it will be equal to negative
2644
+ # Offset of the window if nil and period is nil it will be equal to negative
2145
2645
  # `every`.
2146
2646
  # @param truncate
2147
2647
  # Truncate the time value to the window lower bound.
@@ -2469,7 +2969,7 @@ module Polars
2469
2969
  # Join column of the right DataFrame.
2470
2970
  # @param on [String]
2471
2971
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2472
- # None.
2972
+ # nil.
2473
2973
  # @param by_left [Object]
2474
2974
  # join on these columns before doing asof join
2475
2975
  # @param by_right [Object]
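This hunk only rewords the `join_asof` parameter docs (None → nil). For context, a minimal asof-join sketch, assuming the method also accepts the usual `on:` and `strategy:` keywords and that both frames are sorted by the asof key:

```ruby
require "polars-df"

quotes = Polars::DataFrame.new({"time" => [1, 5, 10], "price" => [100.0, 101.5, 102.0]})
trades = Polars::DataFrame.new({"time" => [2, 7, 12], "qty" => [10, 20, 30]})

# Match each trade with the most recent quote at or before its time.
trades.join_asof(quotes, on: "time", strategy: "backward")
```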
@@ -2755,6 +3255,101 @@ module Polars
2755
3255
  .collect(no_optimization: true)
2756
3256
  end
2757
3257
 
3258
+ # Perform a join based on one or multiple (in)equality predicates.
3259
+ #
3260
+ # This performs an inner join, so only rows where all predicates are true
3261
+ # are included in the result, and a row from either DataFrame may be included
3262
+ # multiple times in the result.
3263
+ #
3264
+ # @note
3265
+ # The row order of the input DataFrames is not preserved.
3266
+ #
3267
+ # @note
3268
+ # This functionality is experimental. It may be
3269
+ # changed at any point without it being considered a breaking change.
3270
+ #
3271
+ # @param other [DataFrame]
3272
+ # DataFrame to join with.
3273
+ # @param predicates [Array]
3274
+ # (In)Equality condition to join the two tables on.
3275
+ # When a column name occurs in both tables, the proper suffix must
3276
+ # be applied in the predicate.
3277
+ # @param suffix [String]
3278
+ # Suffix to append to columns with a duplicate name.
3279
+ #
3280
+ # @return [DataFrame]
3281
+ #
3282
+ # @example Join two dataframes together based on two predicates which get AND-ed together.
3283
+ # east = Polars::DataFrame.new(
3284
+ # {
3285
+ # "id": [100, 101, 102],
3286
+ # "dur": [120, 140, 160],
3287
+ # "rev": [12, 14, 16],
3288
+ # "cores": [2, 8, 4]
3289
+ # }
3290
+ # )
3291
+ # west = Polars::DataFrame.new(
3292
+ # {
3293
+ # "t_id": [404, 498, 676, 742],
3294
+ # "time": [90, 130, 150, 170],
3295
+ # "cost": [9, 13, 15, 16],
3296
+ # "cores": [4, 2, 1, 4]
3297
+ # }
3298
+ # )
3299
+ # east.join_where(
3300
+ # west,
3301
+ # Polars.col("dur") < Polars.col("time"),
3302
+ # Polars.col("rev") < Polars.col("cost")
3303
+ # )
3304
+ # # =>
3305
+ # # shape: (5, 8)
3306
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
3307
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
3308
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3309
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
3310
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
3311
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
3312
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3313
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3314
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3315
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3316
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
3317
+ #
3318
+ # @example To OR them together, use a single expression and the `|` operator.
3319
+ # east.join_where(
3320
+ # west,
3321
+ # (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
3322
+ # )
3323
+ # # =>
3324
+ # # shape: (6, 8)
3325
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
3326
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
3327
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3328
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
3329
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
3330
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
3331
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3332
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3333
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3334
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3335
+ # # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3336
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
3337
+ def join_where(
3338
+ other,
3339
+ *predicates,
3340
+ suffix: "_right"
3341
+ )
3342
+ Utils.require_same_type(self, other)
3343
+
3344
+ lazy
3345
+ .join_where(
3346
+ other.lazy,
3347
+ *predicates,
3348
+ suffix: suffix
3349
+ )
3350
+ .collect(_eager: true)
3351
+ end
3352
+
2758
3353
  # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
2759
3354
  #
2760
3355
  # The UDF will receive each row as a tuple of values: `udf(row)`.
@@ -3749,8 +4344,8 @@ module Polars
3749
4344
  # @param include_key [Boolean]
3750
4345
  # Include the columns used to partition the DataFrame in the output.
3751
4346
  # @param as_dict [Boolean]
3752
- # If true, return the partitions in a dictionary keyed by the distinct group
3753
- # values instead of a list.
4347
+ # If true, return the partitions in a hash keyed by the distinct group
4348
+ # values instead of an array.
3754
4349
  #
3755
4350
  # @return [Object]
3756
4351
  #
@@ -4071,6 +4666,26 @@ module Polars
4071
4666
  lazy.select(*exprs, **named_exprs).collect(_eager: true)
4072
4667
  end
4073
4668
 
4669
+ # Select columns from this DataFrame.
4670
+ #
4671
+ # This will run all expressions sequentially instead of in parallel.
4672
+ # Use this when the work per expression is cheap.
4673
+ #
4674
+ # @param exprs [Array]
4675
+ # Column(s) to select, specified as positional arguments.
4676
+ # Accepts expression input. Strings are parsed as column names,
4677
+ # other non-expression inputs are parsed as literals.
4678
+ # @param named_exprs [Hash]
4679
+ # Additional columns to select, specified as keyword arguments.
4680
+ # The columns will be renamed to the keyword used.
4681
+ #
4682
+ # @return [DataFrame]
4683
+ def select_seq(*exprs, **named_exprs)
4684
+ lazy
4685
+ .select_seq(*exprs, **named_exprs)
4686
+ .collect(_eager: true)
4687
+ end
4688
+
4074
4689
  # Add columns to this DataFrame.
4075
4690
  #
4076
4691
  # Added columns will replace existing columns with the same name.
@@ -4183,6 +4798,31 @@ module Polars
4183
4798
  lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
4184
4799
  end
4185
4800
 
4801
+ # Add columns to this DataFrame.
4802
+ #
4803
+ # Added columns will replace existing columns with the same name.
4804
+ #
4805
+ # This will run all expressions sequentially instead of in parallel.
4806
+ # Use this when the work per expression is cheap.
4807
+ #
4808
+ # @param exprs [Array]
4809
+ # Column(s) to add, specified as positional arguments.
4810
+ # Accepts expression input. Strings are parsed as column names, other
4811
+ # non-expression inputs are parsed as literals.
4812
+ # @param named_exprs [Hash]
4813
+ # Additional columns to add, specified as keyword arguments.
4814
+ # The columns will be renamed to the keyword used.
4815
+ #
4816
+ # @return [DataFrame]
4817
+ def with_columns_seq(
4818
+ *exprs,
4819
+ **named_exprs
4820
+ )
4821
+ lazy
4822
+ .with_columns_seq(*exprs, **named_exprs)
4823
+ .collect(_eager: true)
4824
+ end
4825
+
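Likewise for `with_columns_seq`, a minimal sketch mirroring `with_columns`:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"price" => [10.0, 12.5], "qty" => [3, 4]})

# Adds/replaces columns like `with_columns`, evaluating expressions
# sequentially instead of in parallel.
df.with_columns_seq(
  Polars.col("qty").cast(Polars::Float64),
  revenue: Polars.col("price") * Polars.col("qty")
)
```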
4186
4826
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
4187
4827
  #
4188
4828
  # @param strategy ["first", "all"]
@@ -4600,7 +5240,7 @@ module Polars
4600
5240
  # @param drop_first [Boolean]
4601
5241
  # Remove the first category from the variables being encoded.
4602
5242
  # @param drop_nulls [Boolean]
4603
- # If there are `None` values in the series, a `null` column is not generated
5243
+ # If there are `nil` values in the series, a `null` column is not generated
4604
5244
  #
4605
5245
  # @return [DataFrame]
4606
5246
  #
@@ -5521,9 +6161,136 @@ module Polars
5521
6161
  .collect(no_optimization: true)
5522
6162
  end
5523
6163
 
5524
- # TODO
5525
- # def update
5526
- # end
6164
+ # Update the values in this `DataFrame` with the values in `other`.
6165
+ #
6166
+ # @note
6167
+ # This functionality is considered **unstable**. It may be changed
6168
+ # at any point without it being considered a breaking change.
6169
+ #
6170
+ # @param other [DataFrame]
6171
+ # DataFrame that will be used to update the values
6172
+ # @param on [Object]
6173
+ # Column names that will be joined on. If set to `nil` (default),
6174
+ # the implicit row index of each frame is used as a join key.
6175
+ # @param how ['left', 'inner', 'full']
6176
+ # * 'left' will keep all rows from the left table; rows may be duplicated
6177
+ # if multiple rows in the right frame match the left row's key.
6178
+ # * 'inner' keeps only those rows where the key exists in both frames.
6179
+ # * 'full' will update existing rows where the key matches while also
6180
+ # adding any new rows contained in the given frame.
6181
+ # @param left_on [Object]
6182
+ # Join column(s) of the left DataFrame.
6183
+ # @param right_on [Object]
6184
+ # Join column(s) of the right DataFrame.
6185
+ # @param include_nulls [Boolean]
6186
+ # Overwrite values in the left frame with null values from the right frame.
6187
+ # If set to `false` (default), null values in the right frame are ignored.
6188
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
6189
+ # Which order of rows from the inputs to preserve. See `DataFrame.join`
6190
+ # for details. Unlike `join` this function preserves the left order by
6191
+ # default.
6192
+ #
6193
+ # @return [DataFrame]
6194
+ #
6195
+ # @note
6196
+ # This is syntactic sugar for a left/inner join that preserves the order
6197
+ # of the left `DataFrame` by default, with an optional coalesce when
6198
+ # `include_nulls: false`.
6199
+ #
6200
+ # @example Update `df` values with the non-null values in `new_df`, by row index:
6201
+ # df = Polars::DataFrame.new(
6202
+ # {
6203
+ # "A" => [1, 2, 3, 4],
6204
+ # "B" => [400, 500, 600, 700]
6205
+ # }
6206
+ # )
6207
+ # new_df = Polars::DataFrame.new(
6208
+ # {
6209
+ # "B" => [-66, nil, -99],
6210
+ # "C" => [5, 3, 1]
6211
+ # }
6212
+ # )
6213
+ # df.update(new_df)
6214
+ # # =>
6215
+ # # shape: (4, 2)
6216
+ # # ┌─────┬─────┐
6217
+ # # │ A ┆ B │
6218
+ # # │ --- ┆ --- │
6219
+ # # │ i64 ┆ i64 │
6220
+ # # ╞═════╪═════╡
6221
+ # # │ 1 ┆ -66 │
6222
+ # # │ 2 ┆ 500 │
6223
+ # # │ 3 ┆ -99 │
6224
+ # # │ 4 ┆ 700 │
6225
+ # # └─────┴─────┘
6226
+ #
6227
+ # @example Update `df` values with the non-null values in `new_df`, by row index, but only keeping those rows that are common to both frames:
6228
+ # df.update(new_df, how: "inner")
6229
+ # # =>
6230
+ # # shape: (3, 2)
6231
+ # # ┌─────┬─────┐
6232
+ # # │ A ┆ B │
6233
+ # # │ --- ┆ --- │
6234
+ # # │ i64 ┆ i64 │
6235
+ # # ╞═════╪═════╡
6236
+ # # │ 1 ┆ -66 │
6237
+ # # │ 2 ┆ 500 │
6238
+ # # │ 3 ┆ -99 │
6239
+ # # └─────┴─────┘
6240
+ #
6241
+ # @example Update `df` values with the non-null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
6242
+ # df.update(new_df, left_on: ["A"], right_on: ["C"], how: "full")
6243
+ # # =>
6244
+ # # shape: (5, 2)
6245
+ # # ┌─────┬─────┐
6246
+ # # │ A ┆ B │
6247
+ # # │ --- ┆ --- │
6248
+ # # │ i64 ┆ i64 │
6249
+ # # ╞═════╪═════╡
6250
+ # # │ 1 ┆ -99 │
6251
+ # # │ 2 ┆ 500 │
6252
+ # # │ 3 ┆ 600 │
6253
+ # # │ 4 ┆ 700 │
6254
+ # # │ 5 ┆ -66 │
6255
+ # # └─────┴─────┘
6256
+ #
6257
+ # @example Update `df` values including null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
6258
+ # df.update(new_df, left_on: "A", right_on: "C", how: "full", include_nulls: true)
6259
+ # # =>
6260
+ # # shape: (5, 2)
6261
+ # # ┌─────┬──────┐
6262
+ # # │ A ┆ B │
6263
+ # # │ --- ┆ --- │
6264
+ # # │ i64 ┆ i64 │
6265
+ # # ╞═════╪══════╡
6266
+ # # │ 1 ┆ -99 │
6267
+ # # │ 2 ┆ 500 │
6268
+ # # │ 3 ┆ null │
6269
+ # # │ 4 ┆ 700 │
6270
+ # # │ 5 ┆ -66 │
6271
+ # # └─────┴──────┘
6272
+ def update(
6273
+ other,
6274
+ on: nil,
6275
+ how: "left",
6276
+ left_on: nil,
6277
+ right_on: nil,
6278
+ include_nulls: false,
6279
+ maintain_order: "left"
6280
+ )
6281
+ Utils.require_same_type(self, other)
6282
+ lazy
6283
+ .update(
6284
+ other.lazy,
6285
+ on: on,
6286
+ how: how,
6287
+ left_on: left_on,
6288
+ right_on: right_on,
6289
+ include_nulls: include_nulls,
6290
+ maintain_order: maintain_order
6291
+ )
6292
+ .collect(_eager: true)
6293
+ end
5527
6294
 
5528
6295
  private
5529
6296