polars-df 0.21.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +1 -1
  4. data/ext/polars/Cargo.toml +7 -1
  5. data/ext/polars/src/conversion/mod.rs +92 -4
  6. data/ext/polars/src/exceptions.rs +1 -0
  7. data/ext/polars/src/expr/array.rs +73 -4
  8. data/ext/polars/src/expr/binary.rs +26 -1
  9. data/ext/polars/src/expr/bitwise.rs +39 -0
  10. data/ext/polars/src/expr/categorical.rs +20 -0
  11. data/ext/polars/src/expr/datatype.rs +24 -1
  12. data/ext/polars/src/expr/datetime.rs +58 -0
  13. data/ext/polars/src/expr/general.rs +84 -5
  14. data/ext/polars/src/expr/list.rs +24 -0
  15. data/ext/polars/src/expr/meta.rs +11 -0
  16. data/ext/polars/src/expr/mod.rs +1 -0
  17. data/ext/polars/src/expr/name.rs +8 -0
  18. data/ext/polars/src/expr/rolling.rs +20 -0
  19. data/ext/polars/src/expr/string.rs +59 -0
  20. data/ext/polars/src/expr/struct.rs +9 -1
  21. data/ext/polars/src/functions/io.rs +19 -0
  22. data/ext/polars/src/functions/lazy.rs +4 -0
  23. data/ext/polars/src/lazyframe/general.rs +51 -0
  24. data/ext/polars/src/lib.rs +119 -10
  25. data/ext/polars/src/map/dataframe.rs +2 -2
  26. data/ext/polars/src/map/series.rs +1 -1
  27. data/ext/polars/src/series/aggregation.rs +44 -0
  28. data/ext/polars/src/series/general.rs +64 -4
  29. data/lib/polars/array_expr.rb +382 -3
  30. data/lib/polars/array_name_space.rb +281 -0
  31. data/lib/polars/binary_expr.rb +67 -0
  32. data/lib/polars/binary_name_space.rb +43 -0
  33. data/lib/polars/cat_expr.rb +224 -0
  34. data/lib/polars/cat_name_space.rb +138 -0
  35. data/lib/polars/config.rb +2 -2
  36. data/lib/polars/convert.rb +6 -6
  37. data/lib/polars/data_frame.rb +684 -19
  38. data/lib/polars/data_type_expr.rb +52 -0
  39. data/lib/polars/data_types.rb +14 -2
  40. data/lib/polars/date_time_expr.rb +251 -0
  41. data/lib/polars/date_time_name_space.rb +299 -0
  42. data/lib/polars/expr.rb +1213 -180
  43. data/lib/polars/functions/datatype.rb +21 -0
  44. data/lib/polars/functions/lazy.rb +13 -0
  45. data/lib/polars/io/csv.rb +1 -1
  46. data/lib/polars/io/json.rb +4 -4
  47. data/lib/polars/io/ndjson.rb +4 -4
  48. data/lib/polars/io/parquet.rb +27 -5
  49. data/lib/polars/lazy_frame.rb +936 -20
  50. data/lib/polars/list_expr.rb +196 -4
  51. data/lib/polars/list_name_space.rb +201 -4
  52. data/lib/polars/meta_expr.rb +64 -0
  53. data/lib/polars/name_expr.rb +36 -0
  54. data/lib/polars/schema.rb +79 -3
  55. data/lib/polars/selector.rb +72 -0
  56. data/lib/polars/selectors.rb +3 -3
  57. data/lib/polars/series.rb +1051 -54
  58. data/lib/polars/string_expr.rb +411 -6
  59. data/lib/polars/string_name_space.rb +722 -49
  60. data/lib/polars/struct_expr.rb +103 -0
  61. data/lib/polars/struct_name_space.rb +19 -1
  62. data/lib/polars/utils/various.rb +18 -1
  63. data/lib/polars/utils.rb +5 -1
  64. data/lib/polars/version.rb +1 -1
  65. data/lib/polars.rb +2 -0
  66. metadata +4 -1
@@ -15,11 +15,11 @@ module Polars
15
15
  # The schema of the resulting DataFrame. The schema may be declared in several
16
16
  # ways:
17
17
  #
18
- # * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
18
+ # * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
19
19
  # * As an array of column names; in this case types are automatically inferred.
20
- # * As an array of (name,type) pairs; this is equivalent to the dictionary form.
20
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
21
21
  #
22
- # If you supply a list of column names that does not match the names in the
22
+ # If you supply an array of column names that does not match the names in the
23
23
  # underlying data, the names given here will overwrite them. The number
24
24
  # of names given in the schema should match the underlying data dimensions.
25
25
  #
@@ -560,7 +560,7 @@ module Polars
560
560
  end
561
561
  end
562
562
 
563
- # Convert every row to a dictionary.
563
+ # Convert every row to a hash.
564
564
  #
565
565
  # Note that this is slow.
566
566
  #
@@ -910,7 +910,7 @@ module Polars
910
910
  #
911
911
  # @param file [Object]
912
912
  # Path or writable file-like object to which the IPC record batch data will
913
- # be written. If set to `None`, the output is returned as a BytesIO object.
913
+ # be written. If set to `nil`, the output is returned as a BytesIO object.
914
914
  # @param compression ['uncompressed', 'lz4', 'zstd']
915
915
  # Compression method. Defaults to "uncompressed".
916
916
  # @param compat_level [Object]
@@ -1463,6 +1463,126 @@ module Polars
1463
1463
  lazy.filter(predicate).collect
1464
1464
  end
1465
1465
 
1466
+ # Remove rows, dropping those that match the given predicate expression(s).
1467
+ #
1468
+ # The original order of the remaining rows is preserved.
1469
+ #
1470
+ # Rows where the filter predicate does not evaluate to True are retained
1471
+ # (this includes rows where the predicate evaluates as `null`).
1472
+ #
1473
+ # @param predicates [Array]
1474
+ # Expression that evaluates to a boolean Series.
1475
+ # @param constraints [Hash]
1476
+ # Column filters; use `name = value` to filter columns using the supplied
1477
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1478
+ # and is implicitly joined with the other filter conditions using `&`.
1479
+ #
1480
+ # @return [DataFrame]
1481
+ #
1482
+ # @example Remove rows matching a condition:
1483
+ # df = Polars::DataFrame.new(
1484
+ # {
1485
+ # "foo" => [2, 3, nil, 4, 0],
1486
+ # "bar" => [5, 6, nil, nil, 0],
1487
+ # "ham" => ["a", "b", nil, "c", "d"]
1488
+ # }
1489
+ # )
1490
+ # df.remove(Polars.col("bar") >= 5)
1491
+ # # =>
1492
+ # # shape: (3, 3)
1493
+ # # ┌──────┬──────┬──────┐
1494
+ # # │ foo ┆ bar ┆ ham │
1495
+ # # │ --- ┆ --- ┆ --- │
1496
+ # # │ i64 ┆ i64 ┆ str │
1497
+ # # ╞══════╪══════╪══════╡
1498
+ # # │ null ┆ null ┆ null │
1499
+ # # │ 4 ┆ null ┆ c │
1500
+ # # │ 0 ┆ 0 ┆ d │
1501
+ # # └──────┴──────┴──────┘
1502
+ #
1503
+ # @example Discard rows based on multiple conditions, combined with and/or operators:
1504
+ # df.remove(
1505
+ # (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0),
1506
+ # )
1507
+ # # =>
1508
+ # # shape: (2, 3)
1509
+ # # ┌──────┬──────┬──────┐
1510
+ # # │ foo ┆ bar ┆ ham │
1511
+ # # │ --- ┆ --- ┆ --- │
1512
+ # # │ i64 ┆ i64 ┆ str │
1513
+ # # ╞══════╪══════╪══════╡
1514
+ # # │ null ┆ null ┆ null │
1515
+ # # │ 4 ┆ null ┆ c │
1516
+ # # └──────┴──────┴──────┘
1517
+ #
1518
+ # @example
1519
+ # df.remove(
1520
+ # (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0),
1521
+ # )
1522
+ # # =>
1523
+ # # shape: (1, 3)
1524
+ # # ┌──────┬──────┬──────┐
1525
+ # # │ foo ┆ bar ┆ ham │
1526
+ # # │ --- ┆ --- ┆ --- │
1527
+ # # │ i64 ┆ i64 ┆ str │
1528
+ # # ╞══════╪══════╪══════╡
1529
+ # # │ null ┆ null ┆ null │
1530
+ # # └──────┴──────┴──────┘
1531
+ #
1532
+ # @example Provide multiple constraints using `*args` syntax:
1533
+ # df.remove(
1534
+ # Polars.col("ham").is_not_null,
1535
+ # Polars.col("bar") >= 0
1536
+ # )
1537
+ # # =>
1538
+ # # shape: (2, 3)
1539
+ # # ┌──────┬──────┬──────┐
1540
+ # # │ foo ┆ bar ┆ ham │
1541
+ # # │ --- ┆ --- ┆ --- │
1542
+ # # │ i64 ┆ i64 ┆ str │
1543
+ # # ╞══════╪══════╪══════╡
1544
+ # # │ null ┆ null ┆ null │
1545
+ # # │ 4 ┆ null ┆ c │
1546
+ # # └──────┴──────┴──────┘
1547
+ #
1548
+ # @example Provide constraints(s) using `**kwargs` syntax:
1549
+ # df.remove(foo: 0, bar: 0)
1550
+ # # =>
1551
+ # # shape: (4, 3)
1552
+ # # ┌──────┬──────┬──────┐
1553
+ # # │ foo ┆ bar ┆ ham │
1554
+ # # │ --- ┆ --- ┆ --- │
1555
+ # # │ i64 ┆ i64 ┆ str │
1556
+ # # ╞══════╪══════╪══════╡
1557
+ # # │ 2 ┆ 5 ┆ a │
1558
+ # # │ 3 ┆ 6 ┆ b │
1559
+ # # │ null ┆ null ┆ null │
1560
+ # # │ 4 ┆ null ┆ c │
1561
+ # # └──────┴──────┴──────┘
1562
+ #
1563
+ # @example Remove rows by comparing two columns against each other:
1564
+ # df.remove(
1565
+ # Polars.col("foo").ne_missing(Polars.col("bar"))
1566
+ # )
1567
+ # # =>
1568
+ # # shape: (2, 3)
1569
+ # # ┌──────┬──────┬──────┐
1570
+ # # │ foo ┆ bar ┆ ham │
1571
+ # # │ --- ┆ --- ┆ --- │
1572
+ # # │ i64 ┆ i64 ┆ str │
1573
+ # # ╞══════╪══════╪══════╡
1574
+ # # │ null ┆ null ┆ null │
1575
+ # # │ 0 ┆ 0 ┆ d │
1576
+ # # └──────┴──────┴──────┘
1577
+ def remove(
1578
+ *predicates,
1579
+ **constraints
1580
+ )
1581
+ lazy
1582
+ .remove(*predicates, **constraints)
1583
+ .collect(_eager: true)
1584
+ end
1585
+
1466
1586
  # Summary statistics for a DataFrame.
1467
1587
  #
1468
1588
  # @return [DataFrame]
@@ -1658,6 +1778,223 @@ module Polars
1658
1778
  self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
1659
1779
  end
1660
1780
 
1781
+ # Execute a SQL query against the DataFrame.
1782
+ #
1783
+ # @note
1784
+ # This functionality is considered **unstable**, although it is close to
1785
+ # being considered stable. It may be changed at any point without it being
1786
+ # considered a breaking change.
1787
+ #
1788
+ # @param query [String]
1789
+ # SQL query to execute.
1790
+ # @param table_name [String]
1791
+ # Optionally provide an explicit name for the table that represents the
1792
+ # calling frame (defaults to "self").
1793
+ #
1794
+ # @return [DataFrame]
1795
+ #
1796
+ # @note
1797
+ # * The calling frame is automatically registered as a table in the SQL context
1798
+ # under the name "self". If you want access to the DataFrames and LazyFrames
1799
+ # found in the current globals, use the top-level :meth:`pl.sql <polars.sql>`.
1800
+ # * More control over registration and execution behaviour is available by
1801
+ # using the :class:`SQLContext` object.
1802
+ # * The SQL query executes in lazy mode before being collected and returned
1803
+ # as a DataFrame.
1804
+ #
1805
+ # @example Query the DataFrame using SQL:
1806
+ # df1 = Polars::DataFrame.new(
1807
+ # {
1808
+ # "a" => [1, 2, 3],
1809
+ # "b" => ["zz", "yy", "xx"],
1810
+ # "c" => [Date.new(1999, 12, 31), Date.new(2010, 10, 10), Date.new(2077, 8, 8)]
1811
+ # }
1812
+ # )
1813
+ # df1.sql("SELECT c, b FROM self WHERE a > 1")
1814
+ # # =>
1815
+ # # shape: (2, 2)
1816
+ # # ┌────────────┬─────┐
1817
+ # # │ c ┆ b │
1818
+ # # │ --- ┆ --- │
1819
+ # # │ date ┆ str │
1820
+ # # ╞════════════╪═════╡
1821
+ # # │ 2010-10-10 ┆ yy │
1822
+ # # │ 2077-08-08 ┆ xx │
1823
+ # # └────────────┴─────┘
1824
+ #
1825
+ # @example Apply transformations to a DataFrame using SQL, aliasing "self" to "frame".
1826
+ # df1.sql(
1827
+ # "
1828
+ # SELECT
1829
+ # a,
1830
+ # (a % 2 == 0) AS a_is_even,
1831
+ # CONCAT_WS(':', b, b) AS b_b,
1832
+ # EXTRACT(year FROM c) AS year,
1833
+ # 0::float4 AS \"zero\",
1834
+ # FROM frame
1835
+ # ",
1836
+ # table_name: "frame"
1837
+ # )
1838
+ # # =>
1839
+ # # shape: (3, 5)
1840
+ # # ┌─────┬───────────┬───────┬──────┬──────┐
1841
+ # # │ a ┆ a_is_even ┆ b_b ┆ year ┆ zero │
1842
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1843
+ # # │ i64 ┆ bool ┆ str ┆ i32 ┆ f32 │
1844
+ # # ╞═════╪═══════════╪═══════╪══════╪══════╡
1845
+ # # │ 1 ┆ false ┆ zz:zz ┆ 1999 ┆ 0.0 │
1846
+ # # │ 2 ┆ true ┆ yy:yy ┆ 2010 ┆ 0.0 │
1847
+ # # │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
1848
+ # # └─────┴───────────┴───────┴──────┴──────┘
1849
+ def sql(query, table_name: "self")
1850
+ ctx = SQLContext.new(eager_execution: true)
1851
+ name = table_name || "self"
1852
+ ctx.register(name, self)
1853
+ ctx.execute(query)
1854
+ end
1855
+
1856
+ # Return the `k` largest rows.
1857
+ #
1858
+ # Non-null elements are always preferred over null elements, regardless of
1859
+ # the value of `reverse`. The output is not guaranteed to be in any
1860
+ # particular order, call `sort` after this function if you wish the
1861
+ # output to be sorted.
1862
+ #
1863
+ # @param k [Integer]
1864
+ # Number of rows to return.
1865
+ # @param by [Object]
1866
+ # Column(s) used to determine the top rows.
1867
+ # Accepts expression input. Strings are parsed as column names.
1868
+ # @param reverse [Object]
1869
+ # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
1870
+ # largest). This can be specified per column by passing a sequence of
1871
+ # booleans.
1872
+ #
1873
+ # @return [DataFrame]
1874
+ #
1875
+ # @example Get the rows which contain the 4 largest values in column b.
1876
+ # df = Polars::DataFrame.new(
1877
+ # {
1878
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1879
+ # "b" => [2, 1, 1, 3, 2, 1]
1880
+ # }
1881
+ # )
1882
+ # df.top_k(4, by: "b")
1883
+ # # =>
1884
+ # # shape: (4, 2)
1885
+ # # ┌─────┬─────┐
1886
+ # # │ a ┆ b │
1887
+ # # │ --- ┆ --- │
1888
+ # # │ str ┆ i64 │
1889
+ # # ╞═════╪═════╡
1890
+ # # │ b ┆ 3 │
1891
+ # # │ a ┆ 2 │
1892
+ # # │ b ┆ 2 │
1893
+ # # │ b ┆ 1 │
1894
+ # # └─────┴─────┘
1895
+ #
1896
+ # @example Get the rows which contain the 4 largest values when sorting on column b and a.
1897
+ # df.top_k(4, by: ["b", "a"])
1898
+ # # =>
1899
+ # # shape: (4, 2)
1900
+ # # ┌─────┬─────┐
1901
+ # # │ a ┆ b │
1902
+ # # │ --- ┆ --- │
1903
+ # # │ str ┆ i64 │
1904
+ # # ╞═════╪═════╡
1905
+ # # │ b ┆ 3 │
1906
+ # # │ b ┆ 2 │
1907
+ # # │ a ┆ 2 │
1908
+ # # │ c ┆ 1 │
1909
+ # # └─────┴─────┘
1910
+ def top_k(
1911
+ k,
1912
+ by:,
1913
+ reverse: false
1914
+ )
1915
+ lazy
1916
+ .top_k(k, by: by, reverse: reverse)
1917
+ .collect(
1918
+ # optimizations=QueryOptFlags(
1919
+ # projection_pushdown=False,
1920
+ # predicate_pushdown=False,
1921
+ # comm_subplan_elim=False,
1922
+ # slice_pushdown=True
1923
+ # )
1924
+ )
1925
+ end
1926
+
1927
+ # Return the `k` smallest rows.
1928
+ #
1929
+ # Non-null elements are always preferred over null elements, regardless of
1930
+ # the value of `reverse`. The output is not guaranteed to be in any
1931
+ # particular order, call `sort` after this function if you wish the
1932
+ # output to be sorted.
1933
+ #
1934
+ # @param k [Integer]
1935
+ # Number of rows to return.
1936
+ # @param by [Object]
1937
+ # Column(s) used to determine the bottom rows.
1938
+ # Accepts expression input. Strings are parsed as column names.
1939
+ # @param reverse [Object]
1940
+ # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
1941
+ # smallest). This can be specified per column by passing a sequence of
1942
+ # booleans.
1943
+ #
1944
+ # @return [DataFrame]
1945
+ #
1946
+ # @example Get the rows which contain the 4 smallest values in column b.
1947
+ # df = Polars::DataFrame.new(
1948
+ # {
1949
+ # "a" => ["a", "b", "a", "b", "b", "c"],
1950
+ # "b" => [2, 1, 1, 3, 2, 1]
1951
+ # }
1952
+ # )
1953
+ # df.bottom_k(4, by: "b")
1954
+ # # =>
1955
+ # # shape: (4, 2)
1956
+ # # ┌─────┬─────┐
1957
+ # # │ a ┆ b │
1958
+ # # │ --- ┆ --- │
1959
+ # # │ str ┆ i64 │
1960
+ # # ╞═════╪═════╡
1961
+ # # │ b ┆ 1 │
1962
+ # # │ a ┆ 1 │
1963
+ # # │ c ┆ 1 │
1964
+ # # │ a ┆ 2 │
1965
+ # # └─────┴─────┘
1966
+ #
1967
+ # @example Get the rows which contain the 4 smallest values when sorting on column a and b.
1968
+ # df.bottom_k(4, by: ["a", "b"])
1969
+ # # =>
1970
+ # # shape: (4, 2)
1971
+ # # ┌─────┬─────┐
1972
+ # # │ a ┆ b │
1973
+ # # │ --- ┆ --- │
1974
+ # # │ str ┆ i64 │
1975
+ # # ╞═════╪═════╡
1976
+ # # │ a ┆ 1 │
1977
+ # # │ a ┆ 2 │
1978
+ # # │ b ┆ 1 │
1979
+ # # │ b ┆ 2 │
1980
+ # # └─────┴─────┘
1981
+ def bottom_k(
1982
+ k,
1983
+ by:,
1984
+ reverse: false
1985
+ )
1986
+ lazy
1987
+ .bottom_k(k, by: by, reverse: reverse)
1988
+ .collect(
1989
+ # optimizations=QueryOptFlags(
1990
+ # projection_pushdown=False,
1991
+ # predicate_pushdown=False,
1992
+ # comm_subplan_elim=False,
1993
+ # slice_pushdown=True,
1994
+ # )
1995
+ )
1996
+ end
1997
+
1661
1998
  # Check if DataFrame is equal to other.
1662
1999
  #
1663
2000
  # @param other [DataFrame]
@@ -1848,10 +2185,59 @@ module Polars
1848
2185
  _from_rbdf(_df.tail(n))
1849
2186
  end
1850
2187
 
1851
- # Return a new DataFrame where the null values are dropped.
2188
+ # Drop all rows that contain one or more NaN values.
2189
+ #
2190
+ # The original order of the remaining rows is preserved.
1852
2191
  #
1853
2192
  # @param subset [Object]
1854
- # Subset of column(s) on which `drop_nulls` will be applied.
2193
+ # Column name(s) for which NaN values are considered; if set to `nil`
2194
+ # (default), use all columns (note that only floating-point columns
2195
+ # can contain NaNs).
2196
+ #
2197
+ # @return [DataFrame]
2198
+ #
2199
+ # @example
2200
+ # df = Polars::DataFrame.new(
2201
+ # {
2202
+ # "foo" => [-20.5, Float::NAN, 80.0],
2203
+ # "bar" => [Float::NAN, 110.0, 25.5],
2204
+ # "ham" => ["xxx", "yyy", nil]
2205
+ # }
2206
+ # )
2207
+ # df.drop_nans
2208
+ # # =>
2209
+ # # shape: (1, 3)
2210
+ # # ┌──────┬──────┬──────┐
2211
+ # # │ foo ┆ bar ┆ ham │
2212
+ # # │ --- ┆ --- ┆ --- │
2213
+ # # │ f64 ┆ f64 ┆ str │
2214
+ # # ╞══════╪══════╪══════╡
2215
+ # # │ 80.0 ┆ 25.5 ┆ null │
2216
+ # # └──────┴──────┴──────┘
2217
+ #
2218
+ # @example
2219
+ # df.drop_nans(subset: ["bar"])
2220
+ # # =>
2221
+ # # shape: (2, 3)
2222
+ # # ┌──────┬───────┬──────┐
2223
+ # # │ foo ┆ bar ┆ ham │
2224
+ # # │ --- ┆ --- ┆ --- │
2225
+ # # │ f64 ┆ f64 ┆ str │
2226
+ # # ╞══════╪═══════╪══════╡
2227
+ # # │ NaN ┆ 110.0 ┆ yyy │
2228
+ # # │ 80.0 ┆ 25.5 ┆ null │
2229
+ # # └──────┴───────┴──────┘
2230
+ def drop_nans(subset: nil)
2231
+ lazy.drop_nans(subset: subset).collect(_eager: true)
2232
+ end
2233
+
2234
+ # Drop all rows that contain one or more null values.
2235
+ #
2236
+ # The original order of the remaining rows is preserved.
2237
+ #
2238
+ # @param subset [Object]
2239
+ # Column name(s) for which null values are considered.
2240
+ # If set to `nil` (default), use all columns.
1855
2241
  #
1856
2242
  # @return [DataFrame]
1857
2243
  #
@@ -1860,20 +2246,32 @@ module Polars
1860
2246
  # {
1861
2247
  # "foo" => [1, 2, 3],
1862
2248
  # "bar" => [6, nil, 8],
1863
- # "ham" => ["a", "b", "c"]
2249
+ # "ham" => ["a", "b", nil]
1864
2250
  # }
1865
2251
  # )
1866
2252
  # df.drop_nulls
1867
2253
  # # =>
1868
- # # shape: (2, 3)
2254
+ # # shape: (1, 3)
1869
2255
  # # ┌─────┬─────┬─────┐
1870
2256
  # # │ foo ┆ bar ┆ ham │
1871
2257
  # # │ --- ┆ --- ┆ --- │
1872
2258
  # # │ i64 ┆ i64 ┆ str │
1873
2259
  # # ╞═════╪═════╪═════╡
1874
2260
  # # │ 1 ┆ 6 ┆ a │
1875
- # # │ 3 ┆ 8 ┆ c │
1876
2261
  # # └─────┴─────┴─────┘
2262
+ #
2263
+ # @example
2264
+ # df.drop_nulls(subset: Polars.cs.integer)
2265
+ # # =>
2266
+ # # shape: (2, 3)
2267
+ # # ┌─────┬─────┬──────┐
2268
+ # # │ foo ┆ bar ┆ ham │
2269
+ # # │ --- ┆ --- ┆ --- │
2270
+ # # │ i64 ┆ i64 ┆ str │
2271
+ # # ╞═════╪═════╪══════╡
2272
+ # # │ 1 ┆ 6 ┆ a │
2273
+ # # │ 3 ┆ 8 ┆ null │
2274
+ # # └─────┴─────┴──────┘
1877
2275
  def drop_nulls(subset: nil)
1878
2276
  lazy.drop_nulls(subset: subset).collect(_eager: true)
1879
2277
  end
@@ -2139,9 +2537,9 @@ module Polars
2139
2537
  # @param every
2140
2538
  # Interval of the window.
2141
2539
  # @param period
2142
- # Length of the window, if None it is equal to 'every'.
2540
+ # Length of the window, if nil it is equal to 'every'.
2143
2541
  # @param offset
2144
- # Offset of the window if None and period is None it will be equal to negative
2542
+ # Offset of the window if nil and period is nil it will be equal to negative
2145
2543
  # `every`.
2146
2544
  # @param truncate
2147
2545
  # Truncate the time value to the window lower bound.
@@ -2469,7 +2867,7 @@ module Polars
2469
2867
  # Join column of the right DataFrame.
2470
2868
  # @param on [String]
2471
2869
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2472
- # None.
2870
+ # nil.
2473
2871
  # @param by_left [Object]
2474
2872
  # join on these columns before doing asof join
2475
2873
  # @param by_right [Object]
@@ -2755,6 +3153,101 @@ module Polars
2755
3153
  .collect(no_optimization: true)
2756
3154
  end
2757
3155
 
3156
+ # Perform a join based on one or multiple (in)equality predicates.
3157
+ #
3158
+ # This performs an inner join, so only rows where all predicates are true
3159
+ # are included in the result, and a row from either DataFrame may be included
3160
+ # multiple times in the result.
3161
+ #
3162
+ # @note
3163
+ # The row order of the input DataFrames is not preserved.
3164
+ #
3165
+ # @note
3166
+ # This functionality is experimental. It may be
3167
+ # changed at any point without it being considered a breaking change.
3168
+ #
3169
+ # @param other [DataFrame]
3170
+ # DataFrame to join with.
3171
+ # @param predicates [Array]
3172
+ # (In)Equality condition to join the two tables on.
3173
+ # When a column name occurs in both tables, the proper suffix must
3174
+ # be applied in the predicate.
3175
+ # @param suffix [String]
3176
+ # Suffix to append to columns with a duplicate name.
3177
+ #
3178
+ # @return [DataFrame]
3179
+ #
3180
+ # @example Join two dataframes together based on two predicates which get AND-ed together.
3181
+ # east = Polars::DataFrame.new(
3182
+ # {
3183
+ # "id": [100, 101, 102],
3184
+ # "dur": [120, 140, 160],
3185
+ # "rev": [12, 14, 16],
3186
+ # "cores": [2, 8, 4]
3187
+ # }
3188
+ # )
3189
+ # west = Polars::DataFrame.new(
3190
+ # {
3191
+ # "t_id": [404, 498, 676, 742],
3192
+ # "time": [90, 130, 150, 170],
3193
+ # "cost": [9, 13, 15, 16],
3194
+ # "cores": [4, 2, 1, 4]
3195
+ # }
3196
+ # )
3197
+ # east.join_where(
3198
+ # west,
3199
+ # Polars.col("dur") < Polars.col("time"),
3200
+ # Polars.col("rev") < Polars.col("cost")
3201
+ # )
3202
+ # # =>
3203
+ # # shape: (5, 8)
3204
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
3205
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
3206
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3207
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
3208
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
3209
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
3210
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3211
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3212
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3213
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3214
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
3215
+ #
3216
+ # @example To OR them together, use a single expression and the `|` operator.
3217
+ # east.join_where(
3218
+ # west,
3219
+ # (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
3220
+ # )
3221
+ # # =>
3222
+ # # shape: (6, 8)
3223
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
3224
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
3225
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3226
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
3227
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
3228
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
3229
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3230
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3231
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
3232
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3233
+ # # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
3234
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
3235
+ def join_where(
3236
+ other,
3237
+ *predicates,
3238
+ suffix: "_right"
3239
+ )
3240
+ Utils.require_same_type(self, other)
3241
+
3242
+ lazy
3243
+ .join_where(
3244
+ other.lazy,
3245
+ *predicates,
3246
+ suffix: suffix
3247
+ )
3248
+ .collect(_eager: true)
3249
+ end
3250
+
2758
3251
  # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
2759
3252
  #
2760
3253
  # The UDF will receive each row as a tuple of values: `udf(row)`.
@@ -3749,8 +4242,8 @@ module Polars
3749
4242
  # @param include_key [Boolean]
3750
4243
  # Include the columns used to partition the DataFrame in the output.
3751
4244
  # @param as_dict [Boolean]
3752
- # If true, return the partitions in a dictionary keyed by the distinct group
3753
- # values instead of a list.
4245
+ # If true, return the partitions in a hash keyed by the distinct group
4246
+ # values instead of an array.
3754
4247
  #
3755
4248
  # @return [Object]
3756
4249
  #
@@ -4071,6 +4564,26 @@ module Polars
4071
4564
  lazy.select(*exprs, **named_exprs).collect(_eager: true)
4072
4565
  end
4073
4566
 
4567
+ # Select columns from this DataFrame.
4568
+ #
4569
+ # This will run all expression sequentially instead of in parallel.
4570
+ # Use this when the work per expression is cheap.
4571
+ #
4572
+ # @param exprs [Array]
4573
+ # Column(s) to select, specified as positional arguments.
4574
+ # Accepts expression input. Strings are parsed as column names,
4575
+ # other non-expression inputs are parsed as literals.
4576
+ # @param named_exprs [Hash]
4577
+ # Additional columns to select, specified as keyword arguments.
4578
+ # The columns will be renamed to the keyword used.
4579
+ #
4580
+ # @return [DataFrame]
4581
+ def select_seq(*exprs, **named_exprs)
4582
+ lazy
4583
+ .select_seq(*exprs, **named_exprs)
4584
+ .collect(_eager: true)
4585
+ end
4586
+
4074
4587
  # Add columns to this DataFrame.
4075
4588
  #
4076
4589
  # Added columns will replace existing columns with the same name.
@@ -4183,6 +4696,31 @@ module Polars
4183
4696
  lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
4184
4697
  end
4185
4698
 
4699
+ # Add columns to this DataFrame.
4700
+ #
4701
+ # Added columns will replace existing columns with the same name.
4702
+ #
4703
+ # This will run all expression sequentially instead of in parallel.
4704
+ # Use this when the work per expression is cheap.
4705
+ #
4706
+ # @param exprs [Array]
4707
+ # Column(s) to add, specified as positional arguments.
4708
+ # Accepts expression input. Strings are parsed as column names, other
4709
+ # non-expression inputs are parsed as literals.
4710
+ # @param named_exprs [Hash]
4711
+ # Additional columns to add, specified as keyword arguments.
4712
+ # The columns will be renamed to the keyword used.
4713
+ #
4714
+ # @return [DataFrame]
4715
+ def with_columns_seq(
4716
+ *exprs,
4717
+ **named_exprs
4718
+ )
4719
+ lazy
4720
+ .with_columns_seq(*exprs, **named_exprs)
4721
+ .collect(_eager: true)
4722
+ end
4723
+
4186
4724
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
4187
4725
  #
4188
4726
  # @param strategy ["first", "all"]
@@ -4600,7 +5138,7 @@ module Polars
4600
5138
  # @param drop_first [Boolean]
4601
5139
  # Remove the first category from the variables being encoded.
4602
5140
  # @param drop_nulls [Boolean]
4603
- # If there are `None` values in the series, a `null` column is not generated
5141
+ # If there are `nil` values in the series, a `null` column is not generated
4604
5142
  #
4605
5143
  # @return [DataFrame]
4606
5144
  #
@@ -5521,9 +6059,136 @@ module Polars
5521
6059
  .collect(no_optimization: true)
5522
6060
  end
5523
6061
 
5524
- # TODO
5525
- # def update
5526
- # end
6062
+ # Update the values in this `DataFrame` with the values in `other`.
6063
+ #
6064
+ # @note
6065
+ # This functionality is considered **unstable**. It may be changed
6066
+ # at any point without it being considered a breaking change.
6067
+ #
6068
+ # @param other [DataFrame]
6069
+ # DataFrame that will be used to update the values
6070
+ # @param on [Object]
6071
+ # Column names that will be joined on. If set to `nil` (default),
6072
+ # the implicit row index of each frame is used as a join key.
6073
+ # @param how ['left', 'inner', 'full']
6074
+ # * 'left' will keep all rows from the left table; rows may be duplicated
6075
+ # if multiple rows in the right frame match the left row's key.
6076
+ # * 'inner' keeps only those rows where the key exists in both frames.
6077
+ # * 'full' will update existing rows where the key matches while also
6078
+ # adding any new rows contained in the given frame.
6079
+ # @param left_on [Object]
6080
+ # Join column(s) of the left DataFrame.
6081
+ # @param right_on [Object]
6082
+ # Join column(s) of the right DataFrame.
6083
+ # @param include_nulls [Boolean]
6084
+ # Overwrite values in the left frame with null values from the right frame.
6085
+ # If set to `false` (default), null values in the right frame are ignored.
6086
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
6087
+ # Which order of rows from the inputs to preserve. See `DataFrame.join`
6088
+ # for details. Unlike `join` this function preserves the left order by
6089
+ # default.
6090
+ #
6091
+ # @return [DataFrame]
6092
+ #
6093
+ # @note
6094
+ # This is syntactic sugar for a left/inner join that preserves the order
6095
+ # of the left `DataFrame` by default, with an optional coalesce when
6096
+ # `include_nulls: false`.
6097
+ #
6098
+ # @example Update `df` values with the non-null values in `new_df`, by row index:
6099
+ # df = Polars::DataFrame.new(
6100
+ # {
6101
+ # "A" => [1, 2, 3, 4],
6102
+ # "B" => [400, 500, 600, 700]
6103
+ # }
6104
+ # )
6105
+ # new_df = Polars::DataFrame.new(
6106
+ # {
6107
+ # "B" => [-66, nil, -99],
6108
+ # "C" => [5, 3, 1]
6109
+ # }
6110
+ # )
6111
+ # df.update(new_df)
6112
+ # # =>
6113
+ # # shape: (4, 2)
6114
+ # # ┌─────┬─────┐
6115
+ # # │ A ┆ B │
6116
+ # # │ --- ┆ --- │
6117
+ # # │ i64 ┆ i64 │
6118
+ # # ╞═════╪═════╡
6119
+ # # │ 1 ┆ -66 │
6120
+ # # │ 2 ┆ 500 │
6121
+ # # │ 3 ┆ -99 │
6122
+ # # │ 4 ┆ 700 │
6123
+ # # └─────┴─────┘
6124
+ #
6125
+ # @example Update `df` values with the non-null values in `new_df`, by row index, but only keeping those rows that are common to both frames:
6126
+ # df.update(new_df, how: "inner")
6127
+ # # =>
6128
+ # # shape: (3, 2)
6129
+ # # ┌─────┬─────┐
6130
+ # # │ A ┆ B │
6131
+ # # │ --- ┆ --- │
6132
+ # # │ i64 ┆ i64 │
6133
+ # # ╞═════╪═════╡
6134
+ # # │ 1 ┆ -66 │
6135
+ # # │ 2 ┆ 500 │
6136
+ # # │ 3 ┆ -99 │
6137
+ # # └─────┴─────┘
6138
+ #
6139
+ # @example Update `df` values with the non-null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
6140
+ # df.update(new_df, left_on: ["A"], right_on: ["C"], how: "full")
6141
+ # # =>
6142
+ # # shape: (5, 2)
6143
+ # # ┌─────┬─────┐
6144
+ # # │ A ┆ B │
6145
+ # # │ --- ┆ --- │
6146
+ # # │ i64 ┆ i64 │
6147
+ # # ╞═════╪═════╡
6148
+ # # │ 1 ┆ -99 │
6149
+ # # │ 2 ┆ 500 │
6150
+ # # │ 3 ┆ 600 │
6151
+ # # │ 4 ┆ 700 │
6152
+ # # │ 5 ┆ -66 │
6153
+ # # └─────┴─────┘
6154
+ #
6155
+ # @example Update `df` values including null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
6156
+ # df.update(new_df, left_on: "A", right_on: "C", how: "full", include_nulls: true)
6157
+ # # =>
6158
+ # # shape: (5, 2)
6159
+ # # ┌─────┬──────┐
6160
+ # # │ A ┆ B │
6161
+ # # │ --- ┆ --- │
6162
+ # # │ i64 ┆ i64 │
6163
+ # # ╞═════╪══════╡
6164
+ # # │ 1 ┆ -99 │
6165
+ # # │ 2 ┆ 500 │
6166
+ # # │ 3 ┆ null │
6167
+ # # │ 4 ┆ 700 │
6168
+ # # │ 5 ┆ -66 │
6169
+ # # └─────┴──────┘
6170
+ def update(
6171
+ other,
6172
+ on: nil,
6173
+ how: "left",
6174
+ left_on: nil,
6175
+ right_on: nil,
6176
+ include_nulls: false,
6177
+ maintain_order: "left"
6178
+ )
6179
+ Utils.require_same_type(self, other)
6180
+ lazy
6181
+ .update(
6182
+ other.lazy,
6183
+ on: on,
6184
+ how: how,
6185
+ left_on: left_on,
6186
+ right_on: right_on,
6187
+ include_nulls: include_nulls,
6188
+ maintain_order: maintain_order
6189
+ )
6190
+ .collect(_eager: true)
6191
+ end
5527
6192
 
5528
6193
  private
5529
6194