polars-df 0.20.0-x64-mingw-ucrt → 0.21.1-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +192 -186
- data/LICENSE-THIRD-PARTY.txt +2153 -2532
- data/LICENSE.txt +1 -1
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +130 -32
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +12 -2
- data/lib/polars/data_frame.rb +834 -48
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +61 -5
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +1247 -211
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +127 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +19 -1
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +70 -66
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +1099 -95
- data/lib/polars/list_expr.rb +400 -11
- data/lib/polars/list_name_space.rb +321 -5
- data/lib/polars/meta_expr.rb +71 -22
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +84 -3
- data/lib/polars/selector.rb +210 -0
- data/lib/polars/selectors.rb +932 -203
- data/lib/polars/series.rb +1083 -63
- data/lib/polars/string_expr.rb +435 -9
- data/lib/polars/string_name_space.rb +729 -45
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +9 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +10 -0
- metadata +12 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -15,11 +15,11 @@ module Polars
|
|
15
15
|
# The schema of the resulting DataFrame. The schema may be declared in several
|
16
16
|
# ways:
|
17
17
|
#
|
18
|
-
# * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
|
18
|
+
# * As a hash of \{name:type} pairs; if type is nil, it will be auto-inferred.
|
19
19
|
# * As an array of column names; in this case types are automatically inferred.
|
20
|
-
# * As an array of (name,type) pairs; this is equivalent to the
|
20
|
+
# * As an array of (name,type) pairs; this is equivalent to the hash form.
|
21
21
|
#
|
22
|
-
# If you supply
|
22
|
+
# If you supply an array of column names that does not match the names in the
|
23
23
|
# underlying data, the names given here will overwrite them. The number
|
24
24
|
# of names given in the schema should match the underlying data dimensions.
|
25
25
|
#
|
@@ -47,12 +47,7 @@ module Polars
|
|
47
47
|
# @param nan_to_null [Boolean]
|
48
48
|
# If the data comes from one or more Numo arrays, can optionally convert input
|
49
49
|
# data NaN values to null instead. This is a no-op for all other input data.
|
50
|
-
def initialize(data = nil, schema: nil,
|
51
|
-
if schema && columns
|
52
|
-
warn "columns is ignored when schema is passed"
|
53
|
-
end
|
54
|
-
schema ||= columns
|
55
|
-
|
50
|
+
def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
56
51
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
57
52
|
raise ArgumentError, "Use read_database instead"
|
58
53
|
end
|
@@ -565,7 +560,7 @@ module Polars
|
|
565
560
|
end
|
566
561
|
end
|
567
562
|
|
568
|
-
# Convert every row to a
|
563
|
+
# Convert every row to a hash.
|
569
564
|
#
|
570
565
|
# Note that this is slow.
|
571
566
|
#
|
@@ -722,7 +717,7 @@ module Polars
|
|
722
717
|
# @param file [String, nil]
|
723
718
|
# File path to which the result should be written. If set to `nil`
|
724
719
|
# (default), the output is returned as a string instead.
|
725
|
-
# @param
|
720
|
+
# @param include_header [Boolean]
|
726
721
|
# Whether to include header in the CSV output.
|
727
722
|
# @param sep [String]
|
728
723
|
# Separate CSV fields with this symbol.
|
@@ -763,8 +758,7 @@ module Polars
|
|
763
758
|
# df.write_csv("file.csv")
|
764
759
|
def write_csv(
|
765
760
|
file = nil,
|
766
|
-
|
767
|
-
include_header: nil,
|
761
|
+
include_header: true,
|
768
762
|
sep: ",",
|
769
763
|
quote: '"',
|
770
764
|
batch_size: 1024,
|
@@ -774,8 +768,6 @@ module Polars
|
|
774
768
|
float_precision: nil,
|
775
769
|
null_value: nil
|
776
770
|
)
|
777
|
-
include_header = has_header if include_header.nil?
|
778
|
-
|
779
771
|
if sep.length > 1
|
780
772
|
raise ArgumentError, "only single byte separator is allowed"
|
781
773
|
elsif quote.length > 1
|
@@ -834,6 +826,8 @@ module Polars
|
|
834
826
|
# File path to which the file should be written.
|
835
827
|
# @param compression ["uncompressed", "snappy", "deflate"]
|
836
828
|
# Compression method. Defaults to "uncompressed".
|
829
|
+
# @param name [String]
|
830
|
+
# Schema name. Defaults to empty string.
|
837
831
|
#
|
838
832
|
# @return [nil]
|
839
833
|
def write_avro(file, compression = "uncompressed", name: "")
|
@@ -856,6 +850,24 @@ module Polars
|
|
856
850
|
# File path to which the file should be written.
|
857
851
|
# @param compression ["uncompressed", "lz4", "zstd"]
|
858
852
|
# Compression method. Defaults to "uncompressed".
|
853
|
+
# @param compat_level [Object]
|
854
|
+
# Use a specific compatibility level
|
855
|
+
# when exporting Polars' internal data structures.
|
856
|
+
# @param storage_options [Hash]
|
857
|
+
# Options that indicate how to connect to a cloud provider.
|
858
|
+
#
|
859
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
860
|
+
# See supported keys here:
|
861
|
+
#
|
862
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
863
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
864
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
865
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
866
|
+
#
|
867
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
868
|
+
# information from environment variables.
|
869
|
+
# @param retries [Integer]
|
870
|
+
# Number of retries if accessing a cloud instance fails.
|
859
871
|
#
|
860
872
|
# @return [nil]
|
861
873
|
def write_ipc(
|
@@ -898,9 +910,12 @@ module Polars
|
|
898
910
|
#
|
899
911
|
# @param file [Object]
|
900
912
|
# Path or writable file-like object to which the IPC record batch data will
|
901
|
-
# be written. If set to `
|
913
|
+
# be written. If set to `nil`, the output is returned as a BytesIO object.
|
902
914
|
# @param compression ['uncompressed', 'lz4', 'zstd']
|
903
915
|
# Compression method. Defaults to "uncompressed".
|
916
|
+
# @param compat_level [Object]
|
917
|
+
# Use a specific compatibility level
|
918
|
+
# when exporting Polars' internal data structures.
|
904
919
|
#
|
905
920
|
# @return [Object]
|
906
921
|
#
|
@@ -1215,7 +1230,7 @@ module Polars
|
|
1215
1230
|
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
1216
1231
|
# "z" => 1_000_000.times.map(&:to_s)
|
1217
1232
|
# },
|
1218
|
-
#
|
1233
|
+
# schema: {"x" => :u32, "y" => :f64, "z" => :str}
|
1219
1234
|
# )
|
1220
1235
|
# df.estimated_size
|
1221
1236
|
# # => 25888898
|
@@ -1448,6 +1463,126 @@ module Polars
|
|
1448
1463
|
lazy.filter(predicate).collect
|
1449
1464
|
end
|
1450
1465
|
|
1466
|
+
# Remove rows, dropping those that match the given predicate expression(s).
|
1467
|
+
#
|
1468
|
+
# The original order of the remaining rows is preserved.
|
1469
|
+
#
|
1470
|
+
# Rows where the filter predicate does not evaluate to true are retained
|
1471
|
+
# (this includes rows where the predicate evaluates as `null`).
|
1472
|
+
#
|
1473
|
+
# @param predicates [Array]
|
1474
|
+
# Expression that evaluates to a boolean Series.
|
1475
|
+
# @param constraints [Hash]
|
1476
|
+
# Column filters; use `name = value` to filter columns using the supplied
|
1477
|
+
# value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
|
1478
|
+
# and is implicitly joined with the other filter conditions using `&`.
|
1479
|
+
#
|
1480
|
+
# @return [DataFrame]
|
1481
|
+
#
|
1482
|
+
# @example Remove rows matching a condition:
|
1483
|
+
# df = Polars::DataFrame.new(
|
1484
|
+
# {
|
1485
|
+
# "foo" => [2, 3, nil, 4, 0],
|
1486
|
+
# "bar" => [5, 6, nil, nil, 0],
|
1487
|
+
# "ham" => ["a", "b", nil, "c", "d"]
|
1488
|
+
# }
|
1489
|
+
# )
|
1490
|
+
# df.remove(Polars.col("bar") >= 5)
|
1491
|
+
# # =>
|
1492
|
+
# # shape: (3, 3)
|
1493
|
+
# # ┌──────┬──────┬──────┐
|
1494
|
+
# # │ foo ┆ bar ┆ ham │
|
1495
|
+
# # │ --- ┆ --- ┆ --- │
|
1496
|
+
# # │ i64 ┆ i64 ┆ str │
|
1497
|
+
# # ╞══════╪══════╪══════╡
|
1498
|
+
# # │ null ┆ null ┆ null │
|
1499
|
+
# # │ 4 ┆ null ┆ c │
|
1500
|
+
# # │ 0 ┆ 0 ┆ d │
|
1501
|
+
# # └──────┴──────┴──────┘
|
1502
|
+
#
|
1503
|
+
# @example Discard rows based on multiple conditions, combined with and/or operators:
|
1504
|
+
# df.remove(
|
1505
|
+
# (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0),
|
1506
|
+
# )
|
1507
|
+
# # =>
|
1508
|
+
# # shape: (2, 3)
|
1509
|
+
# # ┌──────┬──────┬──────┐
|
1510
|
+
# # │ foo ┆ bar ┆ ham │
|
1511
|
+
# # │ --- ┆ --- ┆ --- │
|
1512
|
+
# # │ i64 ┆ i64 ┆ str │
|
1513
|
+
# # ╞══════╪══════╪══════╡
|
1514
|
+
# # │ null ┆ null ┆ null │
|
1515
|
+
# # │ 4 ┆ null ┆ c │
|
1516
|
+
# # └──────┴──────┴──────┘
|
1517
|
+
#
|
1518
|
+
# @example
|
1519
|
+
# df.remove(
|
1520
|
+
# (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0),
|
1521
|
+
# )
|
1522
|
+
# # =>
|
1523
|
+
# # shape: (1, 3)
|
1524
|
+
# # ┌──────┬──────┬──────┐
|
1525
|
+
# # │ foo ┆ bar ┆ ham │
|
1526
|
+
# # │ --- ┆ --- ┆ --- │
|
1527
|
+
# # │ i64 ┆ i64 ┆ str │
|
1528
|
+
# # ╞══════╪══════╪══════╡
|
1529
|
+
# # │ null ┆ null ┆ null │
|
1530
|
+
# # └──────┴──────┴──────┘
|
1531
|
+
#
|
1532
|
+
# @example Provide multiple constraints using `*args` syntax:
|
1533
|
+
# df.remove(
|
1534
|
+
# Polars.col("ham").is_not_null,
|
1535
|
+
# Polars.col("bar") >= 0
|
1536
|
+
# )
|
1537
|
+
# # =>
|
1538
|
+
# # shape: (2, 3)
|
1539
|
+
# # ┌──────┬──────┬──────┐
|
1540
|
+
# # │ foo ┆ bar ┆ ham │
|
1541
|
+
# # │ --- ┆ --- ┆ --- │
|
1542
|
+
# # │ i64 ┆ i64 ┆ str │
|
1543
|
+
# # ╞══════╪══════╪══════╡
|
1544
|
+
# # │ null ┆ null ┆ null │
|
1545
|
+
# # │ 4 ┆ null ┆ c │
|
1546
|
+
# # └──────┴──────┴──────┘
|
1547
|
+
#
|
1548
|
+
# @example Provide constraint(s) using `**kwargs` syntax:
|
1549
|
+
# df.remove(foo: 0, bar: 0)
|
1550
|
+
# # =>
|
1551
|
+
# # shape: (4, 3)
|
1552
|
+
# # ┌──────┬──────┬──────┐
|
1553
|
+
# # │ foo ┆ bar ┆ ham │
|
1554
|
+
# # │ --- ┆ --- ┆ --- │
|
1555
|
+
# # │ i64 ┆ i64 ┆ str │
|
1556
|
+
# # ╞══════╪══════╪══════╡
|
1557
|
+
# # │ 2 ┆ 5 ┆ a │
|
1558
|
+
# # │ 3 ┆ 6 ┆ b │
|
1559
|
+
# # │ null ┆ null ┆ null │
|
1560
|
+
# # │ 4 ┆ null ┆ c │
|
1561
|
+
# # └──────┴──────┴──────┘
|
1562
|
+
#
|
1563
|
+
# @example Remove rows by comparing two columns against each other:
|
1564
|
+
# df.remove(
|
1565
|
+
# Polars.col("foo").ne_missing(Polars.col("bar"))
|
1566
|
+
# )
|
1567
|
+
# # =>
|
1568
|
+
# # shape: (2, 3)
|
1569
|
+
# # ┌──────┬──────┬──────┐
|
1570
|
+
# # │ foo ┆ bar ┆ ham │
|
1571
|
+
# # │ --- ┆ --- ┆ --- │
|
1572
|
+
# # │ i64 ┆ i64 ┆ str │
|
1573
|
+
# # ╞══════╪══════╪══════╡
|
1574
|
+
# # │ null ┆ null ┆ null │
|
1575
|
+
# # │ 0 ┆ 0 ┆ d │
|
1576
|
+
# # └──────┴──────┴──────┘
|
1577
|
+
def remove(
|
1578
|
+
*predicates,
|
1579
|
+
**constraints
|
1580
|
+
)
|
1581
|
+
lazy
|
1582
|
+
.remove(*predicates, **constraints)
|
1583
|
+
.collect(_eager: true)
|
1584
|
+
end
|
1585
|
+
|
1451
1586
|
# Summary statistics for a DataFrame.
|
1452
1587
|
#
|
1453
1588
|
# @return [DataFrame]
|
@@ -1643,6 +1778,223 @@ module Polars
|
|
1643
1778
|
self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
|
1644
1779
|
end
|
1645
1780
|
|
1781
|
+
# Execute a SQL query against the DataFrame.
|
1782
|
+
#
|
1783
|
+
# @note
|
1784
|
+
# This functionality is considered **unstable**, although it is close to
|
1785
|
+
# being considered stable. It may be changed at any point without it being
|
1786
|
+
# considered a breaking change.
|
1787
|
+
#
|
1788
|
+
# @param query [String]
|
1789
|
+
# SQL query to execute.
|
1790
|
+
# @param table_name [String]
|
1791
|
+
# Optionally provide an explicit name for the table that represents the
|
1792
|
+
# calling frame (defaults to "self").
|
1793
|
+
#
|
1794
|
+
# @return [DataFrame]
|
1795
|
+
#
|
1796
|
+
# @note
|
1797
|
+
# * The calling frame is automatically registered as a table in the SQL context
|
1798
|
+
# under the name "self". If you want access to the DataFrames and LazyFrames
|
1799
|
+
# found in the current globals, use the top-level :meth:`pl.sql <polars.sql>`.
|
1800
|
+
# * More control over registration and execution behaviour is available by
|
1801
|
+
# using the {SQLContext} object.
|
1802
|
+
# * The SQL query executes in lazy mode before being collected and returned
|
1803
|
+
# as a DataFrame.
|
1804
|
+
#
|
1805
|
+
# @example Query the DataFrame using SQL:
|
1806
|
+
# df1 = Polars::DataFrame.new(
|
1807
|
+
# {
|
1808
|
+
# "a" => [1, 2, 3],
|
1809
|
+
# "b" => ["zz", "yy", "xx"],
|
1810
|
+
# "c" => [Date.new(1999, 12, 31), Date.new(2010, 10, 10), Date.new(2077, 8, 8)]
|
1811
|
+
# }
|
1812
|
+
# )
|
1813
|
+
# df1.sql("SELECT c, b FROM self WHERE a > 1")
|
1814
|
+
# # =>
|
1815
|
+
# # shape: (2, 2)
|
1816
|
+
# # ┌────────────┬─────┐
|
1817
|
+
# # │ c ┆ b │
|
1818
|
+
# # │ --- ┆ --- │
|
1819
|
+
# # │ date ┆ str │
|
1820
|
+
# # ╞════════════╪═════╡
|
1821
|
+
# # │ 2010-10-10 ┆ yy │
|
1822
|
+
# # │ 2077-08-08 ┆ xx │
|
1823
|
+
# # └────────────┴─────┘
|
1824
|
+
#
|
1825
|
+
# @example Apply transformations to a DataFrame using SQL, aliasing "self" to "frame".
|
1826
|
+
# df1.sql(
|
1827
|
+
# "
|
1828
|
+
# SELECT
|
1829
|
+
# a,
|
1830
|
+
# (a % 2 == 0) AS a_is_even,
|
1831
|
+
# CONCAT_WS(':', b, b) AS b_b,
|
1832
|
+
# EXTRACT(year FROM c) AS year,
|
1833
|
+
# 0::float4 AS \"zero\",
|
1834
|
+
# FROM frame
|
1835
|
+
# ",
|
1836
|
+
# table_name: "frame"
|
1837
|
+
# )
|
1838
|
+
# # =>
|
1839
|
+
# # shape: (3, 5)
|
1840
|
+
# # ┌─────┬───────────┬───────┬──────┬──────┐
|
1841
|
+
# # │ a ┆ a_is_even ┆ b_b ┆ year ┆ zero │
|
1842
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1843
|
+
# # │ i64 ┆ bool ┆ str ┆ i32 ┆ f32 │
|
1844
|
+
# # ╞═════╪═══════════╪═══════╪══════╪══════╡
|
1845
|
+
# # │ 1 ┆ false ┆ zz:zz ┆ 1999 ┆ 0.0 │
|
1846
|
+
# # │ 2 ┆ true ┆ yy:yy ┆ 2010 ┆ 0.0 │
|
1847
|
+
# # │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
|
1848
|
+
# # └─────┴───────────┴───────┴──────┴──────┘
|
1849
|
+
def sql(query, table_name: "self")
|
1850
|
+
ctx = SQLContext.new(eager_execution: true)
|
1851
|
+
name = table_name || "self"
|
1852
|
+
ctx.register(name, self)
|
1853
|
+
ctx.execute(query)
|
1854
|
+
end
|
1855
|
+
|
1856
|
+
# Return the `k` largest rows.
|
1857
|
+
#
|
1858
|
+
# Non-null elements are always preferred over null elements, regardless of
|
1859
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
1860
|
+
# particular order; call `sort` after this function if you wish the
|
1861
|
+
# output to be sorted.
|
1862
|
+
#
|
1863
|
+
# @param k [Integer]
|
1864
|
+
# Number of rows to return.
|
1865
|
+
# @param by [Object]
|
1866
|
+
# Column(s) used to determine the top rows.
|
1867
|
+
# Accepts expression input. Strings are parsed as column names.
|
1868
|
+
# @param reverse [Object]
|
1869
|
+
# Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
|
1870
|
+
# largest). This can be specified per column by passing a sequence of
|
1871
|
+
# booleans.
|
1872
|
+
#
|
1873
|
+
# @return [DataFrame]
|
1874
|
+
#
|
1875
|
+
# @example Get the rows which contain the 4 largest values in column b.
|
1876
|
+
# df = Polars::DataFrame.new(
|
1877
|
+
# {
|
1878
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1879
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
1880
|
+
# }
|
1881
|
+
# )
|
1882
|
+
# df.top_k(4, by: "b")
|
1883
|
+
# # =>
|
1884
|
+
# # shape: (4, 2)
|
1885
|
+
# # ┌─────┬─────┐
|
1886
|
+
# # │ a ┆ b │
|
1887
|
+
# # │ --- ┆ --- │
|
1888
|
+
# # │ str ┆ i64 │
|
1889
|
+
# # ╞═════╪═════╡
|
1890
|
+
# # │ b ┆ 3 │
|
1891
|
+
# # │ a ┆ 2 │
|
1892
|
+
# # │ b ┆ 2 │
|
1893
|
+
# # │ b ┆ 1 │
|
1894
|
+
# # └─────┴─────┘
|
1895
|
+
#
|
1896
|
+
# @example Get the rows which contain the 4 largest values when sorting on column b and a.
|
1897
|
+
# df.top_k(4, by: ["b", "a"])
|
1898
|
+
# # =>
|
1899
|
+
# # shape: (4, 2)
|
1900
|
+
# # ┌─────┬─────┐
|
1901
|
+
# # │ a ┆ b │
|
1902
|
+
# # │ --- ┆ --- │
|
1903
|
+
# # │ str ┆ i64 │
|
1904
|
+
# # ╞═════╪═════╡
|
1905
|
+
# # │ b ┆ 3 │
|
1906
|
+
# # │ b ┆ 2 │
|
1907
|
+
# # │ a ┆ 2 │
|
1908
|
+
# # │ c ┆ 1 │
|
1909
|
+
# # └─────┴─────┘
|
1910
|
+
def top_k(
|
1911
|
+
k,
|
1912
|
+
by:,
|
1913
|
+
reverse: false
|
1914
|
+
)
|
1915
|
+
lazy
|
1916
|
+
.top_k(k, by: by, reverse: reverse)
|
1917
|
+
.collect(
|
1918
|
+
# optimizations=QueryOptFlags(
|
1919
|
+
# projection_pushdown=False,
|
1920
|
+
# predicate_pushdown=False,
|
1921
|
+
# comm_subplan_elim=False,
|
1922
|
+
# slice_pushdown=True
|
1923
|
+
# )
|
1924
|
+
)
|
1925
|
+
end
|
1926
|
+
|
1927
|
+
# Return the `k` smallest rows.
|
1928
|
+
#
|
1929
|
+
# Non-null elements are always preferred over null elements, regardless of
|
1930
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
1931
|
+
# particular order; call `sort` after this function if you wish the
|
1932
|
+
# output to be sorted.
|
1933
|
+
#
|
1934
|
+
# @param k [Integer]
|
1935
|
+
# Number of rows to return.
|
1936
|
+
# @param by [Object]
|
1937
|
+
# Column(s) used to determine the bottom rows.
|
1938
|
+
# Accepts expression input. Strings are parsed as column names.
|
1939
|
+
# @param reverse [Object]
|
1940
|
+
# Consider the `k` largest elements of the `by` column(s) (instead of the `k`
|
1941
|
+
# smallest). This can be specified per column by passing a sequence of
|
1942
|
+
# booleans.
|
1943
|
+
#
|
1944
|
+
# @return [DataFrame]
|
1945
|
+
#
|
1946
|
+
# @example Get the rows which contain the 4 smallest values in column b.
|
1947
|
+
# df = Polars::DataFrame.new(
|
1948
|
+
# {
|
1949
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1950
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
1951
|
+
# }
|
1952
|
+
# )
|
1953
|
+
# df.bottom_k(4, by: "b")
|
1954
|
+
# # =>
|
1955
|
+
# # shape: (4, 2)
|
1956
|
+
# # ┌─────┬─────┐
|
1957
|
+
# # │ a ┆ b │
|
1958
|
+
# # │ --- ┆ --- │
|
1959
|
+
# # │ str ┆ i64 │
|
1960
|
+
# # ╞═════╪═════╡
|
1961
|
+
# # │ b ┆ 1 │
|
1962
|
+
# # │ a ┆ 1 │
|
1963
|
+
# # │ c ┆ 1 │
|
1964
|
+
# # │ a ┆ 2 │
|
1965
|
+
# # └─────┴─────┘
|
1966
|
+
#
|
1967
|
+
# @example Get the rows which contain the 4 smallest values when sorting on column a and b.
|
1968
|
+
# df.bottom_k(4, by: ["a", "b"])
|
1969
|
+
# # =>
|
1970
|
+
# # shape: (4, 2)
|
1971
|
+
# # ┌─────┬─────┐
|
1972
|
+
# # │ a ┆ b │
|
1973
|
+
# # │ --- ┆ --- │
|
1974
|
+
# # │ str ┆ i64 │
|
1975
|
+
# # ╞═════╪═════╡
|
1976
|
+
# # │ a ┆ 1 │
|
1977
|
+
# # │ a ┆ 2 │
|
1978
|
+
# # │ b ┆ 1 │
|
1979
|
+
# # │ b ┆ 2 │
|
1980
|
+
# # └─────┴─────┘
|
1981
|
+
def bottom_k(
|
1982
|
+
k,
|
1983
|
+
by:,
|
1984
|
+
reverse: false
|
1985
|
+
)
|
1986
|
+
lazy
|
1987
|
+
.bottom_k(k, by: by, reverse: reverse)
|
1988
|
+
.collect(
|
1989
|
+
# optimizations=QueryOptFlags(
|
1990
|
+
# projection_pushdown=False,
|
1991
|
+
# predicate_pushdown=False,
|
1992
|
+
# comm_subplan_elim=False,
|
1993
|
+
# slice_pushdown=True,
|
1994
|
+
# )
|
1995
|
+
)
|
1996
|
+
end
|
1997
|
+
|
1646
1998
|
# Check if DataFrame is equal to other.
|
1647
1999
|
#
|
1648
2000
|
# @param other [DataFrame]
|
@@ -1833,10 +2185,59 @@ module Polars
|
|
1833
2185
|
_from_rbdf(_df.tail(n))
|
1834
2186
|
end
|
1835
2187
|
|
1836
|
-
#
|
2188
|
+
# Drop all rows that contain one or more NaN values.
|
2189
|
+
#
|
2190
|
+
# The original order of the remaining rows is preserved.
|
2191
|
+
#
|
2192
|
+
# @param subset [Object]
|
2193
|
+
# Column name(s) for which NaN values are considered; if set to `nil`
|
2194
|
+
# (default), use all columns (note that only floating-point columns
|
2195
|
+
# can contain NaNs).
|
2196
|
+
#
|
2197
|
+
# @return [DataFrame]
|
2198
|
+
#
|
2199
|
+
# @example
|
2200
|
+
# df = Polars::DataFrame.new(
|
2201
|
+
# {
|
2202
|
+
# "foo" => [-20.5, Float::NAN, 80.0],
|
2203
|
+
# "bar" => [Float::NAN, 110.0, 25.5],
|
2204
|
+
# "ham" => ["xxx", "yyy", nil]
|
2205
|
+
# }
|
2206
|
+
# )
|
2207
|
+
# df.drop_nans
|
2208
|
+
# # =>
|
2209
|
+
# # shape: (1, 3)
|
2210
|
+
# # ┌──────┬──────┬──────┐
|
2211
|
+
# # │ foo ┆ bar ┆ ham │
|
2212
|
+
# # │ --- ┆ --- ┆ --- │
|
2213
|
+
# # │ f64 ┆ f64 ┆ str │
|
2214
|
+
# # ╞══════╪══════╪══════╡
|
2215
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
2216
|
+
# # └──────┴──────┴──────┘
|
2217
|
+
#
|
2218
|
+
# @example
|
2219
|
+
# df.drop_nans(subset: ["bar"])
|
2220
|
+
# # =>
|
2221
|
+
# # shape: (2, 3)
|
2222
|
+
# # ┌──────┬───────┬──────┐
|
2223
|
+
# # │ foo ┆ bar ┆ ham │
|
2224
|
+
# # │ --- ┆ --- ┆ --- │
|
2225
|
+
# # │ f64 ┆ f64 ┆ str │
|
2226
|
+
# # ╞══════╪═══════╪══════╡
|
2227
|
+
# # │ NaN ┆ 110.0 ┆ yyy │
|
2228
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
2229
|
+
# # └──────┴───────┴──────┘
|
2230
|
+
def drop_nans(subset: nil)
|
2231
|
+
lazy.drop_nans(subset: subset).collect(_eager: true)
|
2232
|
+
end
|
2233
|
+
|
2234
|
+
# Drop all rows that contain one or more null values.
|
2235
|
+
#
|
2236
|
+
# The original order of the remaining rows is preserved.
|
1837
2237
|
#
|
1838
2238
|
# @param subset [Object]
|
1839
|
-
#
|
2239
|
+
# Column name(s) for which null values are considered.
|
2240
|
+
# If set to `nil` (default), use all columns.
|
1840
2241
|
#
|
1841
2242
|
# @return [DataFrame]
|
1842
2243
|
#
|
@@ -1845,20 +2246,32 @@ module Polars
|
|
1845
2246
|
# {
|
1846
2247
|
# "foo" => [1, 2, 3],
|
1847
2248
|
# "bar" => [6, nil, 8],
|
1848
|
-
# "ham" => ["a", "b",
|
2249
|
+
# "ham" => ["a", "b", nil]
|
1849
2250
|
# }
|
1850
2251
|
# )
|
1851
2252
|
# df.drop_nulls
|
1852
2253
|
# # =>
|
1853
|
-
# # shape: (
|
2254
|
+
# # shape: (1, 3)
|
1854
2255
|
# # ┌─────┬─────┬─────┐
|
1855
2256
|
# # │ foo ┆ bar ┆ ham │
|
1856
2257
|
# # │ --- ┆ --- ┆ --- │
|
1857
2258
|
# # │ i64 ┆ i64 ┆ str │
|
1858
2259
|
# # ╞═════╪═════╪═════╡
|
1859
2260
|
# # │ 1 ┆ 6 ┆ a │
|
1860
|
-
# # │ 3 ┆ 8 ┆ c │
|
1861
2261
|
# # └─────┴─────┴─────┘
|
2262
|
+
#
|
2263
|
+
# @example
|
2264
|
+
# df.drop_nulls(subset: Polars.cs.integer)
|
2265
|
+
# # =>
|
2266
|
+
# # shape: (2, 3)
|
2267
|
+
# # ┌─────┬─────┬──────┐
|
2268
|
+
# # │ foo ┆ bar ┆ ham │
|
2269
|
+
# # │ --- ┆ --- ┆ --- │
|
2270
|
+
# # │ i64 ┆ i64 ┆ str │
|
2271
|
+
# # ╞═════╪═════╪══════╡
|
2272
|
+
# # │ 1 ┆ 6 ┆ a │
|
2273
|
+
# # │ 3 ┆ 8 ┆ null │
|
2274
|
+
# # └─────┴─────┴──────┘
|
1862
2275
|
def drop_nulls(subset: nil)
|
1863
2276
|
lazy.drop_nulls(subset: subset).collect(_eager: true)
|
1864
2277
|
end
|
@@ -2124,9 +2537,9 @@ module Polars
|
|
2124
2537
|
# @param every
|
2125
2538
|
# Interval of the window.
|
2126
2539
|
# @param period
|
2127
|
-
# Length of the window, if
|
2540
|
+
# Length of the window. If nil, it is equal to `every`.
|
2128
2541
|
# @param offset
|
2129
|
-
# Offset of the window if
|
2542
|
+
# Offset of the window. If nil and period is nil, it will be equal to negative
|
2130
2543
|
# `every`.
|
2131
2544
|
# @param truncate
|
2132
2545
|
# Truncate the time value to the window lower bound.
|
@@ -2138,6 +2551,22 @@ module Polars
|
|
2138
2551
|
# Define whether the temporal window interval is closed or not.
|
2139
2552
|
# @param by
|
2140
2553
|
# Also group by this column/these columns
|
2554
|
+
# @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
|
2555
|
+
# The strategy to determine the start of the first window by.
|
2556
|
+
#
|
2557
|
+
# * 'window': Start by taking the earliest timestamp, truncating it with
|
2558
|
+
# `every`, and then adding `offset`.
|
2559
|
+
# Note that weekly windows start on Monday.
|
2560
|
+
# * 'datapoint': Start from the first encountered data point.
|
2561
|
+
# * a day of the week (only takes effect if `every` contains `'w'`):
|
2562
|
+
#
|
2563
|
+
# * 'monday': Start the window on the Monday before the first data point.
|
2564
|
+
# * 'tuesday': Start the window on the Tuesday before the first data point.
|
2565
|
+
# * ...
|
2566
|
+
# * 'sunday': Start the window on the Sunday before the first data point.
|
2567
|
+
#
|
2568
|
+
# The resulting window is then shifted back until the earliest datapoint
|
2569
|
+
# is in or in front of it.
|
2141
2570
|
#
|
2142
2571
|
# @return [DataFrame]
|
2143
2572
|
#
|
@@ -2438,13 +2867,13 @@ module Polars
|
|
2438
2867
|
# Join column of the right DataFrame.
|
2439
2868
|
# @param on [String]
|
2440
2869
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2441
|
-
#
|
2442
|
-
# @param by [Object]
|
2443
|
-
# join on these columns before doing asof join
|
2870
|
+
# nil.
|
2444
2871
|
# @param by_left [Object]
|
2445
2872
|
# join on these columns before doing asof join
|
2446
2873
|
# @param by_right [Object]
|
2447
2874
|
# join on these columns before doing asof join
|
2875
|
+
# @param by [Object]
|
2876
|
+
# join on these columns before doing asof join
|
2448
2877
|
# @param strategy ["backward", "forward"]
|
2449
2878
|
# Join strategy.
|
2450
2879
|
# @param suffix [String]
|
@@ -2454,14 +2883,6 @@ module Polars
|
|
2454
2883
|
# keys are within this distance. If an asof join is done on columns of dtype
|
2455
2884
|
# "Date", "Datetime", "Duration" or "Time" you use the following string
|
2456
2885
|
# language:
|
2457
|
-
# @param allow_exact_matches [Boolean]
|
2458
|
-
# Whether exact matches are valid join predicates.
|
2459
|
-
# - If true, allow matching with the same `on` value (i.e. less-than-or-equal-to / greater-than-or-equal-to).
|
2460
|
-
# - If false, don't match the same `on` value (i.e., strictly less-than / strictly greater-than).
|
2461
|
-
# @param check_sortedness [Boolean]
|
2462
|
-
# Check the sortedness of the asof keys. If the keys are not sorted Polars
|
2463
|
-
# will error, or in case of 'by' argument raise a warning. This might become
|
2464
|
-
# a hard error in the future.
|
2465
2886
|
#
|
2466
2887
|
# - 1ns (1 nanosecond)
|
2467
2888
|
# - 1us (1 microsecond)
|
@@ -2489,6 +2910,14 @@ module Polars
|
|
2489
2910
|
# - true: -> Always coalesce join columns.
|
2490
2911
|
# - false: -> Never coalesce join columns.
|
2491
2912
|
# Note that joining on any other expressions than `col` will turn off coalescing.
|
2913
|
+
# @param allow_exact_matches [Boolean]
|
2914
|
+
# Whether exact matches are valid join predicates.
|
2915
|
+
# - If true, allow matching with the same `on` value (i.e. less-than-or-equal-to / greater-than-or-equal-to).
|
2916
|
+
# - If false, don't match the same `on` value (i.e., strictly less-than / strictly greater-than).
|
2917
|
+
# @param check_sortedness [Boolean]
|
2918
|
+
# Check the sortedness of the asof keys. If the keys are not sorted Polars
|
2919
|
+
# will error, or in case of 'by' argument raise a warning. This might become
|
2920
|
+
# a hard error in the future.
|
2492
2921
|
#
|
2493
2922
|
# @return [DataFrame]
|
2494
2923
|
#
|
@@ -2724,6 +3153,101 @@ module Polars
|
|
2724
3153
|
.collect(no_optimization: true)
|
2725
3154
|
end
|
2726
3155
|
|
3156
|
+
# Perform a join based on one or multiple (in)equality predicates.
|
3157
|
+
#
|
3158
|
+
# This performs an inner join, so only rows where all predicates are true
|
3159
|
+
# are included in the result, and a row from either DataFrame may be included
|
3160
|
+
# multiple times in the result.
|
3161
|
+
#
|
3162
|
+
# @note
|
3163
|
+
# The row order of the input DataFrames is not preserved.
|
3164
|
+
#
|
3165
|
+
# @note
|
3166
|
+
# This functionality is experimental. It may be
|
3167
|
+
# changed at any point without it being considered a breaking change.
|
3168
|
+
#
|
3169
|
+
# @param other [DataFrame]
|
3170
|
+
# DataFrame to join with.
|
3171
|
+
# @param predicates [Array]
|
3172
|
+
# (In)Equality condition to join the two tables on.
|
3173
|
+
# When a column name occurs in both tables, the proper suffix must
|
3174
|
+
# be applied in the predicate.
|
3175
|
+
# @param suffix [String]
|
3176
|
+
# Suffix to append to columns with a duplicate name.
|
3177
|
+
#
|
3178
|
+
# @return [DataFrame]
|
3179
|
+
#
|
3180
|
+
# @example Join two dataframes together based on two predicates which get AND-ed together.
|
3181
|
+
# east = Polars::DataFrame.new(
|
3182
|
+
# {
|
3183
|
+
# "id": [100, 101, 102],
|
3184
|
+
# "dur": [120, 140, 160],
|
3185
|
+
# "rev": [12, 14, 16],
|
3186
|
+
# "cores": [2, 8, 4]
|
3187
|
+
# }
|
3188
|
+
# )
|
3189
|
+
# west = Polars::DataFrame.new(
|
3190
|
+
# {
|
3191
|
+
# "t_id": [404, 498, 676, 742],
|
3192
|
+
# "time": [90, 130, 150, 170],
|
3193
|
+
# "cost": [9, 13, 15, 16],
|
3194
|
+
# "cores": [4, 2, 1, 4]
|
3195
|
+
# }
|
3196
|
+
# )
|
3197
|
+
# east.join_where(
|
3198
|
+
# west,
|
3199
|
+
# Polars.col("dur") < Polars.col("time"),
|
3200
|
+
# Polars.col("rev") < Polars.col("cost")
|
3201
|
+
# )
|
3202
|
+
# # =>
|
3203
|
+
# # shape: (5, 8)
|
3204
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
3205
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
3206
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3207
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
3208
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
3209
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
3210
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
3211
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
3212
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
3213
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
3214
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
3215
|
+
#
|
3216
|
+
# @example To OR them together, use a single expression and the `|` operator.
|
3217
|
+
# east.join_where(
|
3218
|
+
# west,
|
3219
|
+
# (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
|
3220
|
+
# )
|
3221
|
+
# # =>
|
3222
|
+
# # shape: (6, 8)
|
3223
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
3224
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
3225
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3226
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
3227
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
3228
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
3229
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
3230
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
3231
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
3232
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
3233
|
+
# # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
3234
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
3235
|
+
def join_where(
|
3236
|
+
other,
|
3237
|
+
*predicates,
|
3238
|
+
suffix: "_right"
|
3239
|
+
)
|
3240
|
+
Utils.require_same_type(self, other)
|
3241
|
+
|
3242
|
+
lazy
|
3243
|
+
.join_where(
|
3244
|
+
other.lazy,
|
3245
|
+
*predicates,
|
3246
|
+
suffix: suffix
|
3247
|
+
)
|
3248
|
+
.collect(_eager: true)
|
3249
|
+
end
|
3250
|
+
|
2727
3251
|
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
|
2728
3252
|
#
|
2729
3253
|
# The UDF will receive each row as a tuple of values: `udf(row)`.
|
@@ -3436,19 +3960,22 @@ module Polars
|
|
3436
3960
|
|
3437
3961
|
# Create a spreadsheet-style pivot table as a DataFrame.
|
3438
3962
|
#
|
3963
|
+
# @param on [Object]
|
3964
|
+
# Columns whose values will be used as the header of the output DataFrame
|
3965
|
+
# @param index [Object]
|
3966
|
+
# One or multiple keys to group by
|
3439
3967
|
# @param values [Object]
|
3440
3968
|
# Column values to aggregate. Can be multiple columns if the `on`
|
3441
3969
|
# argument contains multiple columns as well
|
3442
|
-
# @param index [Object]
|
3443
|
-
# One or multiple keys to group by
|
3444
|
-
# @param on [Object]
|
3445
|
-
# Columns whose values will be used as the header of the output DataFrame
|
3446
3970
|
# @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "count"]
|
3447
3971
|
# A predefined aggregate function str or an expression.
|
3448
3972
|
# @param maintain_order [Object]
|
3449
3973
|
# Sort the grouped keys so that the output order is predictable.
|
3450
3974
|
# @param sort_columns [Object]
|
3451
3975
|
# Sort the transposed columns by name. Default is by order of discovery.
|
3976
|
+
# @param separator [String]
|
3977
|
+
# Used as separator/delimiter in generated column names in case of multiple
|
3978
|
+
# `values` columns.
|
3452
3979
|
#
|
3453
3980
|
# @return [DataFrame]
|
3454
3981
|
#
|
@@ -3712,9 +4239,11 @@ module Polars
|
|
3712
4239
|
# @param maintain_order [Boolean]
|
3713
4240
|
# Keep predictable output order. This is slower as it requires an extra sort
|
3714
4241
|
# operation.
|
4242
|
+
# @param include_key [Boolean]
|
4243
|
+
# Include the columns used to partition the DataFrame in the output.
|
3715
4244
|
# @param as_dict [Boolean]
|
3716
|
-
# If true, return the partitions in a
|
3717
|
-
# values instead of
|
4245
|
+
# If true, return the partitions in a hash keyed by the distinct group
|
4246
|
+
# values instead of an array.
|
3718
4247
|
#
|
3719
4248
|
# @return [Object]
|
3720
4249
|
#
|
@@ -4035,6 +4564,26 @@ module Polars
|
|
4035
4564
|
lazy.select(*exprs, **named_exprs).collect(_eager: true)
|
4036
4565
|
end
|
4037
4566
|
|
4567
|
+
# Select columns from this DataFrame.
|
4568
|
+
#
|
4569
|
+
# This will run all expressions sequentially instead of in parallel.
|
4570
|
+
# Use this when the work per expression is cheap.
|
4571
|
+
#
|
4572
|
+
# @param exprs [Array]
|
4573
|
+
# Column(s) to select, specified as positional arguments.
|
4574
|
+
# Accepts expression input. Strings are parsed as column names,
|
4575
|
+
# other non-expression inputs are parsed as literals.
|
4576
|
+
# @param named_exprs [Hash]
|
4577
|
+
# Additional columns to select, specified as keyword arguments.
|
4578
|
+
# The columns will be renamed to the keyword used.
|
4579
|
+
#
|
4580
|
+
# @return [DataFrame]
|
4581
|
+
def select_seq(*exprs, **named_exprs)
|
4582
|
+
lazy
|
4583
|
+
.select_seq(*exprs, **named_exprs)
|
4584
|
+
.collect(_eager: true)
|
4585
|
+
end
|
4586
|
+
|
4038
4587
|
# Add columns to this DataFrame.
|
4039
4588
|
#
|
4040
4589
|
# Added columns will replace existing columns with the same name.
|
@@ -4147,6 +4696,31 @@ module Polars
|
|
4147
4696
|
lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
|
4148
4697
|
end
|
4149
4698
|
|
4699
|
+
# Add columns to this DataFrame.
|
4700
|
+
#
|
4701
|
+
# Added columns will replace existing columns with the same name.
|
4702
|
+
#
|
4703
|
+
# This will run all expressions sequentially instead of in parallel.
|
4704
|
+
# Use this when the work per expression is cheap.
|
4705
|
+
#
|
4706
|
+
# @param exprs [Array]
|
4707
|
+
# Column(s) to add, specified as positional arguments.
|
4708
|
+
# Accepts expression input. Strings are parsed as column names, other
|
4709
|
+
# non-expression inputs are parsed as literals.
|
4710
|
+
# @param named_exprs [Hash]
|
4711
|
+
# Additional columns to add, specified as keyword arguments.
|
4712
|
+
# The columns will be renamed to the keyword used.
|
4713
|
+
#
|
4714
|
+
# @return [DataFrame]
|
4715
|
+
def with_columns_seq(
|
4716
|
+
*exprs,
|
4717
|
+
**named_exprs
|
4718
|
+
)
|
4719
|
+
lazy
|
4720
|
+
.with_columns_seq(*exprs, **named_exprs)
|
4721
|
+
.collect(_eager: true)
|
4722
|
+
end
|
4723
|
+
|
4150
4724
|
# Get number of chunks used by the ChunkedArrays of this DataFrame.
|
4151
4725
|
#
|
4152
4726
|
# @param strategy ["first", "all"]
|
@@ -4556,9 +5130,15 @@ module Polars
|
|
4556
5130
|
|
4557
5131
|
# Get one hot encoded dummy variables.
|
4558
5132
|
#
|
4559
|
-
# @param columns
|
5133
|
+
# @param columns [Array]
|
4560
5134
|
# A subset of columns to convert to dummy variables. `nil` means
|
4561
5135
|
# "all columns".
|
5136
|
+
# @param separator [String]
|
5137
|
+
# Separator/delimiter used when generating column names.
|
5138
|
+
# @param drop_first [Boolean]
|
5139
|
+
# Remove the first category from the variables being encoded.
|
5140
|
+
# @param drop_nulls [Boolean]
|
5141
|
+
# If there are `nil` values in the series, a `null` column is not generated
|
4562
5142
|
#
|
4563
5143
|
# @return [DataFrame]
|
4564
5144
|
#
|
@@ -4581,11 +5161,11 @@ module Polars
|
|
4581
5161
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4582
5162
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4583
5163
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4584
|
-
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
5164
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false, drop_nulls: false)
|
4585
5165
|
if columns.is_a?(::String)
|
4586
5166
|
columns = [columns]
|
4587
5167
|
end
|
4588
|
-
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
5168
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first, drop_nulls))
|
4589
5169
|
end
|
4590
5170
|
|
4591
5171
|
# Drop duplicate rows from this DataFrame.
|
@@ -4753,7 +5333,7 @@ module Polars
|
|
4753
5333
|
# # │ --- ┆ --- ┆ --- │
|
4754
5334
|
# # │ i64 ┆ i64 ┆ str │
|
4755
5335
|
# # ╞═════╪═════╪═════╡
|
4756
|
-
# # │
|
5336
|
+
# # │ 1 ┆ 6 ┆ a │
|
4757
5337
|
# # │ 2 ┆ 7 ┆ b │
|
4758
5338
|
# # └─────┴─────┴─────┘
|
4759
5339
|
def sample(
|
@@ -4979,6 +5559,85 @@ module Polars
|
|
4979
5559
|
end
|
4980
5560
|
end
|
4981
5561
|
|
5562
|
+
# Convert columnar data to rows as Ruby arrays in a hash keyed by some column.
|
5563
|
+
#
|
5564
|
+
# This method is like `rows`, but instead of returning rows in a flat list, rows
|
5565
|
+
# are grouped by the values in the `key` column(s) and returned as a hash.
|
5566
|
+
#
|
5567
|
+
# Note that this method should not be used in place of native operations, due to
|
5568
|
+
# the high cost of materializing all frame data out into a hash; it should
|
5569
|
+
# be used only when you need to move the values out into a Ruby data structure
|
5570
|
+
# or other object that cannot operate directly with Polars/Arrow.
|
5571
|
+
#
|
5572
|
+
# @param key [Object]
|
5573
|
+
# The column(s) to use as the key for the returned hash. If multiple
|
5574
|
+
# columns are specified, the key will be an array of those values, otherwise
|
5575
|
+
# it will be the single value from that column.
|
5576
|
+
# @param named [Boolean]
|
5577
|
+
# Return hashes instead of arrays. The hashes are a mapping of
|
5578
|
+
# column name to row value. This is more expensive than returning an
|
5579
|
+
# array, but allows for accessing values by column name.
|
5580
|
+
# @param include_key [Boolean]
|
5581
|
+
# Include key values inline with the associated data (by default the key
|
5582
|
+
# values are omitted as a memory/performance optimisation, as they can be
|
5583
|
+
# reconstructed from the key).
|
5584
|
+
# @param unique [Boolean]
|
5585
|
+
# Indicate that the key is unique; this will result in a 1:1 mapping from
|
5586
|
+
# key to a single associated row. Note that if the key is *not* actually
|
5587
|
+
# unique the last row with the given key will be returned.
|
5588
|
+
#
|
5589
|
+
# @return [Hash]
|
5590
|
+
#
|
5591
|
+
# @example Group rows by the given key column(s):
|
5592
|
+
# df = Polars::DataFrame.new(
|
5593
|
+
# {
|
5594
|
+
# "w" => ["a", "b", "b", "a"],
|
5595
|
+
# "x" => ["q", "q", "q", "k"],
|
5596
|
+
# "y" => [1.0, 2.5, 3.0, 4.5],
|
5597
|
+
# "z" => [9, 8, 7, 6]
|
5598
|
+
# }
|
5599
|
+
# )
|
5600
|
+
# df.rows_by_key(["w"])
|
5601
|
+
# # => {"a"=>[["q", 1.0, 9], ["k", 4.5, 6]], "b"=>[["q", 2.5, 8], ["q", 3.0, 7]]}
|
5602
|
+
#
|
5603
|
+
# @example Return the same row groupings as hashes:
|
5604
|
+
# df.rows_by_key(["w"], named: true)
|
5605
|
+
# # => {"a"=>[{"x"=>"q", "y"=>1.0, "z"=>9}, {"x"=>"k", "y"=>4.5, "z"=>6}], "b"=>[{"x"=>"q", "y"=>2.5, "z"=>8}, {"x"=>"q", "y"=>3.0, "z"=>7}]}
|
5606
|
+
#
|
5607
|
+
# @example Return row groupings, assuming keys are unique:
|
5608
|
+
# df.rows_by_key(["z"], unique: true)
|
5609
|
+
# # => {9=>["a", "q", 1.0], 8=>["b", "q", 2.5], 7=>["b", "q", 3.0], 6=>["a", "k", 4.5]}
|
5610
|
+
#
|
5611
|
+
# @example Return row groupings as hashes, assuming keys are unique:
|
5612
|
+
# df.rows_by_key(["z"], named: true, unique: true)
|
5613
|
+
# # => {9=>{"w"=>"a", "x"=>"q", "y"=>1.0}, 8=>{"w"=>"b", "x"=>"q", "y"=>2.5}, 7=>{"w"=>"b", "x"=>"q", "y"=>3.0}, 6=>{"w"=>"a", "x"=>"k", "y"=>4.5}}
|
5614
|
+
#
|
5615
|
+
# @example Return hash rows grouped by a compound key, including key values:
|
5616
|
+
# df.rows_by_key(["w", "x"], named: true, include_key: true)
|
5617
|
+
# # => {["a", "q"]=>[{"w"=>"a", "x"=>"q", "y"=>1.0, "z"=>9}], ["b", "q"]=>[{"w"=>"b", "x"=>"q", "y"=>2.5, "z"=>8}, {"w"=>"b", "x"=>"q", "y"=>3.0, "z"=>7}], ["a", "k"]=>[{"w"=>"a", "x"=>"k", "y"=>4.5, "z"=>6}]}
|
5618
|
+
def rows_by_key(key, named: false, include_key: false, unique: false)
|
5619
|
+
key = Utils._expand_selectors(self, key)
|
5620
|
+
|
5621
|
+
keys = key.size == 1 ? get_column(key[0]) : select(key).iter_rows
|
5622
|
+
|
5623
|
+
if include_key
|
5624
|
+
values = self
|
5625
|
+
else
|
5626
|
+
data_cols = schema.keys - key
|
5627
|
+
values = select(data_cols)
|
5628
|
+
end
|
5629
|
+
|
5630
|
+
zipped = keys.each.zip(values.iter_rows(named: named))
|
5631
|
+
|
5632
|
+
# if unique, we expect to write just one entry per key; otherwise, we're
|
5633
|
+
# returning a list of rows for each key, so append into a hash of arrays.
|
5634
|
+
if unique
|
5635
|
+
zipped.to_h
|
5636
|
+
else
|
5637
|
+
zipped.each_with_object({}) { |(key, data), h| (h[key] ||= []) << data }
|
5638
|
+
end
|
5639
|
+
end
|
5640
|
+
|
4982
5641
|
# Returns an iterator over the DataFrame of rows of Ruby-native values.
|
4983
5642
|
#
|
4984
5643
|
# @param named [Boolean]
|
@@ -5400,9 +6059,136 @@ module Polars
|
|
5400
6059
|
.collect(no_optimization: true)
|
5401
6060
|
end
|
5402
6061
|
|
5403
|
-
#
|
5404
|
-
#
|
5405
|
-
#
|
6062
|
+
# Update the values in this `DataFrame` with the values in `other`.
|
6063
|
+
#
|
6064
|
+
# @note
|
6065
|
+
# This functionality is considered **unstable**. It may be changed
|
6066
|
+
# at any point without it being considered a breaking change.
|
6067
|
+
#
|
6068
|
+
# @param other [DataFrame]
|
6069
|
+
# DataFrame that will be used to update the values
|
6070
|
+
# @param on [Object]
|
6071
|
+
# Column names that will be joined on. If set to `nil` (default),
|
6072
|
+
# the implicit row index of each frame is used as a join key.
|
6073
|
+
# @param how ['left', 'inner', 'full']
|
6074
|
+
# * 'left' will keep all rows from the left table; rows may be duplicated
|
6075
|
+
# if multiple rows in the right frame match the left row's key.
|
6076
|
+
# * 'inner' keeps only those rows where the key exists in both frames.
|
6077
|
+
# * 'full' will update existing rows where the key matches while also
|
6078
|
+
# adding any new rows contained in the given frame.
|
6079
|
+
# @param left_on [Object]
|
6080
|
+
# Join column(s) of the left DataFrame.
|
6081
|
+
# @param right_on [Object]
|
6082
|
+
# Join column(s) of the right DataFrame.
|
6083
|
+
# @param include_nulls [Boolean]
|
6084
|
+
# Overwrite values in the left frame with null values from the right frame.
|
6085
|
+
# If set to `false` (default), null values in the right frame are ignored.
|
6086
|
+
# @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
|
6087
|
+
# Which order of rows from the inputs to preserve. See `DataFrame.join`
|
6088
|
+
# for details. Unlike `join` this function preserves the left order by
|
6089
|
+
# default.
|
6090
|
+
#
|
6091
|
+
# @return [DataFrame]
|
6092
|
+
#
|
6093
|
+
# @note
|
6094
|
+
# This is syntactic sugar for a left/inner join that preserves the order
|
6095
|
+
# of the left `DataFrame` by default, with an optional coalesce when
|
6096
|
+
# `include_nulls: false`.
|
6097
|
+
#
|
6098
|
+
# @example Update `df` values with the non-null values in `new_df`, by row index:
|
6099
|
+
# df = Polars::DataFrame.new(
|
6100
|
+
# {
|
6101
|
+
# "A" => [1, 2, 3, 4],
|
6102
|
+
# "B" => [400, 500, 600, 700]
|
6103
|
+
# }
|
6104
|
+
# )
|
6105
|
+
# new_df = Polars::DataFrame.new(
|
6106
|
+
# {
|
6107
|
+
# "B" => [-66, nil, -99],
|
6108
|
+
# "C" => [5, 3, 1]
|
6109
|
+
# }
|
6110
|
+
# )
|
6111
|
+
# df.update(new_df)
|
6112
|
+
# # =>
|
6113
|
+
# # shape: (4, 2)
|
6114
|
+
# # ┌─────┬─────┐
|
6115
|
+
# # │ A ┆ B │
|
6116
|
+
# # │ --- ┆ --- │
|
6117
|
+
# # │ i64 ┆ i64 │
|
6118
|
+
# # ╞═════╪═════╡
|
6119
|
+
# # │ 1 ┆ -66 │
|
6120
|
+
# # │ 2 ┆ 500 │
|
6121
|
+
# # │ 3 ┆ -99 │
|
6122
|
+
# # │ 4 ┆ 700 │
|
6123
|
+
# # └─────┴─────┘
|
6124
|
+
#
|
6125
|
+
# @example Update `df` values with the non-null values in `new_df`, by row index, but only keeping those rows that are common to both frames:
|
6126
|
+
# df.update(new_df, how: "inner")
|
6127
|
+
# # =>
|
6128
|
+
# # shape: (3, 2)
|
6129
|
+
# # ┌─────┬─────┐
|
6130
|
+
# # │ A ┆ B │
|
6131
|
+
# # │ --- ┆ --- │
|
6132
|
+
# # │ i64 ┆ i64 │
|
6133
|
+
# # ╞═════╪═════╡
|
6134
|
+
# # │ 1 ┆ -66 │
|
6135
|
+
# # │ 2 ┆ 500 │
|
6136
|
+
# # │ 3 ┆ -99 │
|
6137
|
+
# # └─────┴─────┘
|
6138
|
+
#
|
6139
|
+
# @example Update `df` values with the non-null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
|
6140
|
+
# df.update(new_df, left_on: ["A"], right_on: ["C"], how: "full")
|
6141
|
+
# # =>
|
6142
|
+
# # shape: (5, 2)
|
6143
|
+
# # ┌─────┬─────┐
|
6144
|
+
# # │ A ┆ B │
|
6145
|
+
# # │ --- ┆ --- │
|
6146
|
+
# # │ i64 ┆ i64 │
|
6147
|
+
# # ╞═════╪═════╡
|
6148
|
+
# # │ 1 ┆ -99 │
|
6149
|
+
# # │ 2 ┆ 500 │
|
6150
|
+
# # │ 3 ┆ 600 │
|
6151
|
+
# # │ 4 ┆ 700 │
|
6152
|
+
# # │ 5 ┆ -66 │
|
6153
|
+
# # └─────┴─────┘
|
6154
|
+
#
|
6155
|
+
# @example Update `df` values including null values in `new_df`, using a full outer join strategy that defines explicit join columns in each frame:
|
6156
|
+
# df.update(new_df, left_on: "A", right_on: "C", how: "full", include_nulls: true)
|
6157
|
+
# # =>
|
6158
|
+
# # shape: (5, 2)
|
6159
|
+
# # ┌─────┬──────┐
|
6160
|
+
# # │ A ┆ B │
|
6161
|
+
# # │ --- ┆ --- │
|
6162
|
+
# # │ i64 ┆ i64 │
|
6163
|
+
# # ╞═════╪══════╡
|
6164
|
+
# # │ 1 ┆ -99 │
|
6165
|
+
# # │ 2 ┆ 500 │
|
6166
|
+
# # │ 3 ┆ null │
|
6167
|
+
# # │ 4 ┆ 700 │
|
6168
|
+
# # │ 5 ┆ -66 │
|
6169
|
+
# # └─────┴──────┘
|
6170
|
+
def update(
|
6171
|
+
other,
|
6172
|
+
on: nil,
|
6173
|
+
how: "left",
|
6174
|
+
left_on: nil,
|
6175
|
+
right_on: nil,
|
6176
|
+
include_nulls: false,
|
6177
|
+
maintain_order: "left"
|
6178
|
+
)
|
6179
|
+
Utils.require_same_type(self, other)
|
6180
|
+
lazy
|
6181
|
+
.update(
|
6182
|
+
other.lazy,
|
6183
|
+
on: on,
|
6184
|
+
how: how,
|
6185
|
+
left_on: left_on,
|
6186
|
+
right_on: right_on,
|
6187
|
+
include_nulls: include_nulls,
|
6188
|
+
maintain_order: maintain_order
|
6189
|
+
)
|
6190
|
+
.collect(_eager: true)
|
6191
|
+
end
|
5406
6192
|
|
5407
6193
|
private
|
5408
6194
|
|