polars-df 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +142 -11
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +17 -1
- data/ext/polars/src/apply/dataframe.rs +292 -0
- data/ext/polars/src/apply/mod.rs +254 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +180 -5
- data/ext/polars/src/dataframe.rs +146 -1
- data/ext/polars/src/error.rs +12 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +74 -3
- data/ext/polars/src/lazy/dsl.rs +136 -0
- data/ext/polars/src/lib.rs +199 -1
- data/ext/polars/src/list_construction.rs +100 -0
- data/ext/polars/src/series.rs +331 -0
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1558 -60
- data/lib/polars/date_time_expr.rb +2 -2
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +49 -0
- data/lib/polars/expr.rb +4072 -107
- data/lib/polars/expr_dispatch.rb +8 -0
- data/lib/polars/functions.rb +192 -3
- data/lib/polars/group_by.rb +44 -3
- data/lib/polars/io.rb +20 -4
- data/lib/polars/lazy_frame.rb +800 -26
- data/lib/polars/lazy_functions.rb +687 -43
- data/lib/polars/lazy_group_by.rb +1 -0
- data/lib/polars/list_expr.rb +502 -5
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +934 -62
- data/lib/polars/string_expr.rb +189 -13
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +44 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +14 -1
- metadata +15 -3
data/lib/polars/data_frame.rb
CHANGED
@@ -26,14 +26,14 @@ module Polars
|
|
26
26
|
end
|
27
27
|
|
28
28
|
if data.nil?
|
29
|
-
self._df = hash_to_rbdf({}, columns: columns)
|
29
|
+
self._df = self.class.hash_to_rbdf({}, columns: columns)
|
30
30
|
elsif data.is_a?(Hash)
|
31
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
32
|
-
self._df = hash_to_rbdf(data, columns: columns)
|
32
|
+
self._df = self.class.hash_to_rbdf(data, columns: columns)
|
33
33
|
elsif data.is_a?(Array)
|
34
|
-
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
34
|
+
self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
|
35
35
|
elsif data.is_a?(Series)
|
36
|
-
self._df = series_to_rbdf(data, columns: columns)
|
36
|
+
self._df = self.class.series_to_rbdf(data, columns: columns)
|
37
37
|
else
|
38
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
39
39
|
end
|
@@ -46,11 +46,16 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
#
|
50
|
-
|
49
|
+
# @private
|
50
|
+
def self._from_hashes(data, infer_schema_length: 100, schema: nil)
|
51
|
+
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
|
52
|
+
_from_rbdf(rbdf)
|
53
|
+
end
|
51
54
|
|
52
|
-
#
|
53
|
-
|
55
|
+
# @private
|
56
|
+
def self._from_hash(data, columns: nil)
|
57
|
+
_from_rbdf(hash_to_rbdf(data, columns: columns))
|
58
|
+
end
|
54
59
|
|
55
60
|
# def self._from_records
|
56
61
|
# end
|
@@ -186,8 +191,14 @@ module Polars
|
|
186
191
|
)
|
187
192
|
end
|
188
193
|
|
189
|
-
#
|
190
|
-
|
194
|
+
# @private
|
195
|
+
def self._read_avro(file, columns: nil, n_rows: nil)
|
196
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
197
|
+
file = Utils.format_path(file)
|
198
|
+
end
|
199
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
200
|
+
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
201
|
+
end
|
191
202
|
|
192
203
|
# @private
|
193
204
|
def self._read_ipc(
|
@@ -486,12 +497,6 @@ module Polars
|
|
486
497
|
# def each
|
487
498
|
# end
|
488
499
|
|
489
|
-
# def _pos_idx
|
490
|
-
# end
|
491
|
-
|
492
|
-
# def _pos_idxs
|
493
|
-
# end
|
494
|
-
|
495
500
|
# Returns subset of the DataFrame.
|
496
501
|
#
|
497
502
|
# @return [Object]
|
@@ -554,19 +559,33 @@ module Polars
|
|
554
559
|
|
555
560
|
# df[idx]
|
556
561
|
if item.is_a?(Integer)
|
557
|
-
return slice(_pos_idx(item,
|
562
|
+
return slice(_pos_idx(item, 0), 1)
|
558
563
|
end
|
559
564
|
|
560
565
|
# df[..]
|
561
566
|
if item.is_a?(Range)
|
562
567
|
return Slice.new(self).apply(item)
|
563
568
|
end
|
569
|
+
|
570
|
+
if Utils.is_str_sequence(item, allow_str: false)
|
571
|
+
# select multiple columns
|
572
|
+
# df[["foo", "bar"]]
|
573
|
+
return _from_rbdf(_df.select(item))
|
574
|
+
end
|
564
575
|
end
|
565
576
|
|
566
577
|
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
|
567
578
|
end
|
568
579
|
|
580
|
+
# Set item.
|
581
|
+
#
|
582
|
+
# @return [Object]
|
569
583
|
# def []=(key, value)
|
584
|
+
# if key.is_a?(String)
|
585
|
+
# raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
|
586
|
+
# end
|
587
|
+
|
588
|
+
# raise Todo
|
570
589
|
# end
|
571
590
|
|
572
591
|
# no to_arrow
|
@@ -582,8 +601,24 @@ module Polars
|
|
582
601
|
end
|
583
602
|
end
|
584
603
|
|
585
|
-
#
|
586
|
-
#
|
604
|
+
# Convert every row to a hash.
|
605
|
+
#
|
606
|
+
# Note that this is slow.
|
607
|
+
#
|
608
|
+
# @return [Array]
|
609
|
+
#
|
610
|
+
# @example
|
611
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
612
|
+
# df.to_hashes
|
613
|
+
# # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
|
614
|
+
def to_hashes
|
615
|
+
rbdf = _df
|
616
|
+
names = columns
|
617
|
+
|
618
|
+
height.times.map do |i|
|
619
|
+
names.zip(rbdf.row_tuple(i)).to_h
|
620
|
+
end
|
621
|
+
end
|
587
622
|
|
588
623
|
# def to_numo
|
589
624
|
# end
|
@@ -762,8 +797,24 @@ module Polars
|
|
762
797
|
nil
|
763
798
|
end
|
764
799
|
|
765
|
-
#
|
766
|
-
#
|
800
|
+
# Write to Apache Avro file.
|
801
|
+
#
|
802
|
+
# @param file [String]
|
803
|
+
# File path to which the file should be written.
|
804
|
+
# @param compression ["uncompressed", "snappy", "deflate"]
|
805
|
+
# Compression method. Defaults to "uncompressed".
|
806
|
+
#
|
807
|
+
# @return [nil]
|
808
|
+
def write_avro(file, compression = "uncompressed")
|
809
|
+
if compression.nil?
|
810
|
+
compression = "uncompressed"
|
811
|
+
end
|
812
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
813
|
+
file = Utils.format_path(file)
|
814
|
+
end
|
815
|
+
|
816
|
+
_df.write_avro(file, compression)
|
817
|
+
end
|
767
818
|
|
768
819
|
# Write to Arrow IPC binary stream or Feather file.
|
769
820
|
#
|
@@ -866,8 +917,84 @@ module Polars
|
|
866
917
|
Utils.scale_bytes(sz, to: unit)
|
867
918
|
end
|
868
919
|
|
869
|
-
#
|
870
|
-
#
|
920
|
+
# Transpose a DataFrame over the diagonal.
|
921
|
+
#
|
922
|
+
# @param include_header [Boolean]
|
923
|
+
# If set, the column names will be added as the first column.
|
924
|
+
# @param header_name [String]
|
925
|
+
# If `include_header` is set, this determines the name of the column that will
|
926
|
+
# be inserted.
|
927
|
+
# @param column_names [Array]
|
928
|
+
# Optional generator/iterator that yields column names. Will be used to
|
929
|
+
# replace the columns in the DataFrame.
|
930
|
+
#
|
931
|
+
# @return [DataFrame]
|
932
|
+
#
|
933
|
+
# @note
|
934
|
+
# This is a very expensive operation. Perhaps you can do it differently.
|
935
|
+
#
|
936
|
+
# @example
|
937
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
|
938
|
+
# df.transpose(include_header: true)
|
939
|
+
# # =>
|
940
|
+
# # shape: (2, 4)
|
941
|
+
# # ┌────────┬──────────┬──────────┬──────────┐
|
942
|
+
# # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
|
943
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
944
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
945
|
+
# # ╞════════╪══════════╪══════════╪══════════╡
|
946
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
947
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
948
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
949
|
+
# # └────────┴──────────┴──────────┴──────────┘
|
950
|
+
#
|
951
|
+
# @example Replace the auto-generated column names with a list
|
952
|
+
# df.transpose(include_header: false, column_names: ["a", "b", "c"])
|
953
|
+
# # =>
|
954
|
+
# # shape: (2, 3)
|
955
|
+
# # ┌─────┬─────┬─────┐
|
956
|
+
# # │ a ┆ b ┆ c │
|
957
|
+
# # │ --- ┆ --- ┆ --- │
|
958
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
959
|
+
# # ╞═════╪═════╪═════╡
|
960
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
961
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
962
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
963
|
+
# # └─────┴─────┴─────┘
|
964
|
+
#
|
965
|
+
# @example Include the header as a separate column
|
966
|
+
# df.transpose(
|
967
|
+
# include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
|
968
|
+
# )
|
969
|
+
# # =>
|
970
|
+
# # shape: (2, 4)
|
971
|
+
# # ┌─────┬─────┬─────┬─────┐
|
972
|
+
# # │ foo ┆ a ┆ b ┆ c │
|
973
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
974
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
975
|
+
# # ╞═════╪═════╪═════╪═════╡
|
976
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
977
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
978
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
979
|
+
# # └─────┴─────┴─────┴─────┘
|
980
|
+
def transpose(include_header: false, header_name: "column", column_names: nil)
|
981
|
+
df = _from_rbdf(_df.transpose(include_header, header_name))
|
982
|
+
if !column_names.nil?
|
983
|
+
names = []
|
984
|
+
n = df.width
|
985
|
+
if include_header
|
986
|
+
names << header_name
|
987
|
+
n -= 1
|
988
|
+
end
|
989
|
+
|
990
|
+
column_names = column_names.each
|
991
|
+
n.times do
|
992
|
+
names << column_names.next
|
993
|
+
end
|
994
|
+
df.columns = names
|
995
|
+
end
|
996
|
+
df
|
997
|
+
end
|
871
998
|
|
872
999
|
# Reverse the DataFrame.
|
873
1000
|
#
|
@@ -880,7 +1007,7 @@ module Polars
|
|
880
1007
|
# "val" => [1, 2, 3]
|
881
1008
|
# }
|
882
1009
|
# )
|
883
|
-
# df.reverse
|
1010
|
+
# df.reverse
|
884
1011
|
# # =>
|
885
1012
|
# # shape: (3, 2)
|
886
1013
|
# # ┌─────┬─────┐
|
@@ -1462,8 +1589,48 @@ module Polars
|
|
1462
1589
|
_from_rbdf(_df.drop_nulls(subset))
|
1463
1590
|
end
|
1464
1591
|
|
1465
|
-
#
|
1466
|
-
#
|
1592
|
+
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
1593
|
+
#
|
1594
|
+
# @param func [Object]
|
1595
|
+
# Callable; will receive the frame as the first parameter,
|
1596
|
+
# followed by any given args/kwargs.
|
1597
|
+
# @param args [Object]
|
1598
|
+
# Arguments to pass to the UDF.
|
1599
|
+
# @param kwargs [Object]
|
1600
|
+
# Keyword arguments to pass to the UDF.
|
1601
|
+
#
|
1602
|
+
# @return [Object]
|
1603
|
+
#
|
1604
|
+
# @note
|
1605
|
+
# It is recommended to use LazyFrame when piping operations, in order
|
1606
|
+
# to fully take advantage of query optimization and parallelization.
|
1607
|
+
# See {#lazy}.
|
1608
|
+
#
|
1609
|
+
# @example
|
1610
|
+
# cast_str_to_int = lambda do |data, col_name:|
|
1611
|
+
# data.with_column(Polars.col(col_name).cast(:i64))
|
1612
|
+
# end
|
1613
|
+
#
|
1614
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
|
1615
|
+
# df.pipe(cast_str_to_int, col_name: "b")
|
1616
|
+
# # =>
|
1617
|
+
# # shape: (4, 2)
|
1618
|
+
# # ┌─────┬─────┐
|
1619
|
+
# # │ a ┆ b │
|
1620
|
+
# # │ --- ┆ --- │
|
1621
|
+
# # │ i64 ┆ i64 │
|
1622
|
+
# # ╞═════╪═════╡
|
1623
|
+
# # │ 1 ┆ 10 │
|
1624
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1625
|
+
# # │ 2 ┆ 20 │
|
1626
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1627
|
+
# # │ 3 ┆ 30 │
|
1628
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1629
|
+
# # │ 4 ┆ 40 │
|
1630
|
+
# # └─────┴─────┘
|
1631
|
+
def pipe(func, *args, **kwargs, &block)
|
1632
|
+
func.call(self, *args, **kwargs, &block)
|
1633
|
+
end
|
1467
1634
|
|
1468
1635
|
# Add a column at index 0 that counts the rows.
|
1469
1636
|
#
|
@@ -1547,17 +1714,612 @@ module Polars
|
|
1547
1714
|
)
|
1548
1715
|
end
|
1549
1716
|
|
1550
|
-
#
|
1551
|
-
#
|
1717
|
+
# Create rolling groups based on a time column.
|
1718
|
+
#
|
1719
|
+
# Also works for index values of type `:i32` or `:i64`.
|
1720
|
+
#
|
1721
|
+
# Different from a `dynamic_groupby` the windows are now determined by the
|
1722
|
+
# individual values and are not of constant intervals. For constant intervals use
|
1723
|
+
# *groupby_dynamic*
|
1724
|
+
#
|
1725
|
+
# The `period` and `offset` arguments are created either from a timedelta, or
|
1726
|
+
# by using the following string language:
|
1727
|
+
#
|
1728
|
+
# - 1ns (1 nanosecond)
|
1729
|
+
# - 1us (1 microsecond)
|
1730
|
+
# - 1ms (1 millisecond)
|
1731
|
+
# - 1s (1 second)
|
1732
|
+
# - 1m (1 minute)
|
1733
|
+
# - 1h (1 hour)
|
1734
|
+
# - 1d (1 day)
|
1735
|
+
# - 1w (1 week)
|
1736
|
+
# - 1mo (1 calendar month)
|
1737
|
+
# - 1y (1 calendar year)
|
1738
|
+
# - 1i (1 index count)
|
1739
|
+
#
|
1740
|
+
# Or combine them:
|
1741
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1742
|
+
#
|
1743
|
+
# In case of a groupby_rolling on an integer column, the windows are defined by:
|
1744
|
+
#
|
1745
|
+
# - **"1i" # length 1**
|
1746
|
+
# - **"10i" # length 10**
|
1747
|
+
#
|
1748
|
+
# @param index_column [Object]
|
1749
|
+
# Column used to group based on the time window.
|
1750
|
+
# Often of type Date/Datetime.
|
1751
|
+
# This column must be sorted in ascending order. If not the output will not
|
1752
|
+
# make sense.
|
1753
|
+
#
|
1754
|
+
# In case of a rolling groupby on indices, dtype needs to be one of
|
1755
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1756
|
+
# performance matters use an `:i64` column.
|
1757
|
+
# @param period [Object]
|
1758
|
+
# Length of the window.
|
1759
|
+
# @param offset [Object]
|
1760
|
+
# Offset of the window. Default is -period.
|
1761
|
+
# @param closed ["right", "left", "both", "none"]
|
1762
|
+
# Define whether the temporal window interval is closed or not.
|
1763
|
+
# @param by [Object]
|
1764
|
+
# Also group by this column/these columns.
|
1765
|
+
#
|
1766
|
+
# @return [RollingGroupBy]
|
1767
|
+
#
|
1768
|
+
# @example
|
1769
|
+
# dates = [
|
1770
|
+
# "2020-01-01 13:45:48",
|
1771
|
+
# "2020-01-01 16:42:13",
|
1772
|
+
# "2020-01-01 16:45:09",
|
1773
|
+
# "2020-01-02 18:12:48",
|
1774
|
+
# "2020-01-03 19:45:32",
|
1775
|
+
# "2020-01-08 23:16:43"
|
1776
|
+
# ]
|
1777
|
+
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1778
|
+
# Polars.col("dt").str.strptime(:datetime)
|
1779
|
+
# )
|
1780
|
+
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1781
|
+
# [
|
1782
|
+
# Polars.sum("a").alias("sum_a"),
|
1783
|
+
# Polars.min("a").alias("min_a"),
|
1784
|
+
# Polars.max("a").alias("max_a")
|
1785
|
+
# ]
|
1786
|
+
# )
|
1787
|
+
# # =>
|
1788
|
+
# # shape: (6, 4)
|
1789
|
+
# # ┌─────────────────────┬───────┬───────┬───────┐
|
1790
|
+
# # │ dt ┆ sum_a ┆ min_a ┆ max_a │
|
1791
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1792
|
+
# # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
|
1793
|
+
# # ╞═════════════════════╪═══════╪═══════╪═══════╡
|
1794
|
+
# # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
|
1795
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1796
|
+
# # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
|
1797
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1798
|
+
# # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
|
1799
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1800
|
+
# # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
|
1801
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1802
|
+
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1803
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1804
|
+
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1805
|
+
# # └─────────────────────┴───────┴───────┴───────┘
|
1806
|
+
def groupby_rolling(
|
1807
|
+
index_column:,
|
1808
|
+
period:,
|
1809
|
+
offset: nil,
|
1810
|
+
closed: "right",
|
1811
|
+
by: nil
|
1812
|
+
)
|
1813
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1814
|
+
end
|
1552
1815
|
|
1553
|
-
#
|
1554
|
-
#
|
1816
|
+
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1817
|
+
#
|
1818
|
+
# Time windows are calculated and rows are assigned to windows. Different from a
|
1819
|
+
# normal groupby is that a row can be a member of multiple groups. The time/index
|
1820
|
+
# window could be seen as a rolling window, with a window size determined by
|
1821
|
+
# dates/times/values instead of slots in the DataFrame.
|
1822
|
+
#
|
1823
|
+
# A window is defined by:
|
1824
|
+
#
|
1825
|
+
# - every: interval of the window
|
1826
|
+
# - period: length of the window
|
1827
|
+
# - offset: offset of the window
|
1828
|
+
#
|
1829
|
+
# The `every`, `period` and `offset` arguments are created with
|
1830
|
+
# the following string language:
|
1831
|
+
#
|
1832
|
+
# - 1ns (1 nanosecond)
|
1833
|
+
# - 1us (1 microsecond)
|
1834
|
+
# - 1ms (1 millisecond)
|
1835
|
+
# - 1s (1 second)
|
1836
|
+
# - 1m (1 minute)
|
1837
|
+
# - 1h (1 hour)
|
1838
|
+
# - 1d (1 day)
|
1839
|
+
# - 1w (1 week)
|
1840
|
+
# - 1mo (1 calendar month)
|
1841
|
+
# - 1y (1 calendar year)
|
1842
|
+
# - 1i (1 index count)
|
1843
|
+
#
|
1844
|
+
# Or combine them:
|
1845
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1846
|
+
#
|
1847
|
+
# In case of a groupby_dynamic on an integer column, the windows are defined by:
|
1848
|
+
#
|
1849
|
+
# - "1i" # length 1
|
1850
|
+
# - "10i" # length 10
|
1851
|
+
#
|
1852
|
+
# @param index_column
|
1853
|
+
# Column used to group based on the time window.
|
1854
|
+
# Often of type Date/Datetime.
|
1855
|
+
# This column must be sorted in ascending order. If not the output will not
|
1856
|
+
# make sense.
|
1857
|
+
#
|
1858
|
+
# In case of a dynamic groupby on indices, dtype needs to be one of
|
1859
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1860
|
+
# performance matters use an `:i64` column.
|
1861
|
+
# @param every
|
1862
|
+
# Interval of the window.
|
1863
|
+
# @param period
|
1864
|
+
# Length of the window. If nil, it is equal to `every`.
|
1865
|
+
# @param offset
|
1866
|
+
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1867
|
+
# `every`.
|
1868
|
+
# @param truncate
|
1869
|
+
# Truncate the time value to the window lower bound.
|
1870
|
+
# @param include_boundaries
|
1871
|
+
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1872
|
+
# "_upper_bound" columns. This will impact performance because it's harder to
|
1873
|
+
# parallelize
|
1874
|
+
# @param closed ["right", "left", "both", "none"]
|
1875
|
+
# Define whether the temporal window interval is closed or not.
|
1876
|
+
# @param by
|
1877
|
+
# Also group by this column/these columns
|
1878
|
+
#
|
1879
|
+
# @return [DataFrame]
|
1880
|
+
#
|
1881
|
+
# @example
|
1882
|
+
# df = Polars::DataFrame.new(
|
1883
|
+
# {
|
1884
|
+
# "time" => Polars.date_range(
|
1885
|
+
# DateTime.new(2021, 12, 16),
|
1886
|
+
# DateTime.new(2021, 12, 16, 3),
|
1887
|
+
# "30m"
|
1888
|
+
# ),
|
1889
|
+
# "n" => 0..6
|
1890
|
+
# }
|
1891
|
+
# )
|
1892
|
+
# # =>
|
1893
|
+
# # shape: (7, 2)
|
1894
|
+
# # ┌─────────────────────┬─────┐
|
1895
|
+
# # │ time ┆ n │
|
1896
|
+
# # │ --- ┆ --- │
|
1897
|
+
# # │ datetime[μs] ┆ i64 │
|
1898
|
+
# # ╞═════════════════════╪═════╡
|
1899
|
+
# # │ 2021-12-16 00:00:00 ┆ 0 │
|
1900
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1901
|
+
# # │ 2021-12-16 00:30:00 ┆ 1 │
|
1902
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1903
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 │
|
1904
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1905
|
+
# # │ 2021-12-16 01:30:00 ┆ 3 │
|
1906
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1907
|
+
# # │ 2021-12-16 02:00:00 ┆ 4 │
|
1908
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1909
|
+
# # │ 2021-12-16 02:30:00 ┆ 5 │
|
1910
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1911
|
+
# # │ 2021-12-16 03:00:00 ┆ 6 │
|
1912
|
+
# # └─────────────────────┴─────┘
|
1913
|
+
#
|
1914
|
+
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1915
|
+
# df.groupby_dynamic("time", every: "1h", closed: "right").agg(
|
1916
|
+
# [
|
1917
|
+
# Polars.col("time").min.alias("time_min"),
|
1918
|
+
# Polars.col("time").max.alias("time_max")
|
1919
|
+
# ]
|
1920
|
+
# )
|
1921
|
+
# # =>
|
1922
|
+
# # shape: (4, 3)
|
1923
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┐
|
1924
|
+
# # │ time ┆ time_min ┆ time_max │
|
1925
|
+
# # │ --- ┆ --- ┆ --- │
|
1926
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
|
1927
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╡
|
1928
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
|
1929
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1930
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
|
1931
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1932
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
|
1933
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1934
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
|
1935
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1936
|
+
#
|
1937
|
+
# @example The window boundaries can also be added to the aggregation result.
|
1938
|
+
# df.groupby_dynamic(
|
1939
|
+
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1940
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
1941
|
+
# # =>
|
1942
|
+
# # shape: (4, 4)
|
1943
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
1944
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
1945
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1946
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
1947
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
1948
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
1949
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1950
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
|
1951
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1952
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
1953
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1954
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
1955
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1956
|
+
#
|
1957
|
+
# @example When closed="left", should not include right end of interval.
|
1958
|
+
# df.groupby_dynamic("time", every: "1h", closed: "left").agg(
|
1959
|
+
# [
|
1960
|
+
# Polars.col("time").count.alias("time_count"),
|
1961
|
+
# Polars.col("time").list.alias("time_agg_list")
|
1962
|
+
# ]
|
1963
|
+
# )
|
1964
|
+
# # =>
|
1965
|
+
# # shape: (4, 3)
|
1966
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
|
1967
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1968
|
+
# # │ --- ┆ --- ┆ --- │
|
1969
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1970
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
|
1971
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
|
1972
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1973
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
|
1974
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1975
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
|
1976
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1977
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1978
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────────┘
|
1979
|
+
#
|
1980
|
+
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1981
|
+
# df.groupby_dynamic("time", every: "1h", closed: "both").agg(
|
1982
|
+
# [Polars.col("time").count.alias("time_count")]
|
1983
|
+
# )
|
1984
|
+
# # =>
|
1985
|
+
# # shape: (5, 2)
|
1986
|
+
# # ┌─────────────────────┬────────────┐
|
1987
|
+
# # │ time ┆ time_count │
|
1988
|
+
# # │ --- ┆ --- │
|
1989
|
+
# # │ datetime[μs] ┆ u32 │
|
1990
|
+
# # ╞═════════════════════╪════════════╡
|
1991
|
+
# # │ 2021-12-15 23:00:00 ┆ 1 │
|
1992
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1993
|
+
# # │ 2021-12-16 00:00:00 ┆ 3 │
|
1994
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1995
|
+
# # │ 2021-12-16 01:00:00 ┆ 3 │
|
1996
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1997
|
+
# # │ 2021-12-16 02:00:00 ┆ 3 │
|
1998
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1999
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
2000
|
+
# # └─────────────────────┴────────────┘
|
2001
|
+
#
|
2002
|
+
# @example Dynamic groupbys can also be combined with grouping on normal keys.
|
2003
|
+
# df = Polars::DataFrame.new(
|
2004
|
+
# {
|
2005
|
+
# "time" => Polars.date_range(
|
2006
|
+
# DateTime.new(2021, 12, 16),
|
2007
|
+
# DateTime.new(2021, 12, 16, 3),
|
2008
|
+
# "30m"
|
2009
|
+
# ),
|
2010
|
+
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2011
|
+
# }
|
2012
|
+
# )
|
2013
|
+
# df.groupby_dynamic(
|
2014
|
+
# "time",
|
2015
|
+
# every: "1h",
|
2016
|
+
# closed: "both",
|
2017
|
+
# by: "groups",
|
2018
|
+
# include_boundaries: true
|
2019
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
2020
|
+
# # =>
|
2021
|
+
# # shape: (7, 5)
|
2022
|
+
# # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
2023
|
+
# # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
2024
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2025
|
+
# # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
2026
|
+
# # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
2027
|
+
# # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
2028
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2029
|
+
# # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
|
2030
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2031
|
+
# # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
|
2032
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2033
|
+
# # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
2034
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2035
|
+
# # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
|
2036
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2037
|
+
# # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
2038
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2039
|
+
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
2040
|
+
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2041
|
+
#
|
2042
|
+
# @example Dynamic groupby on an index column.
|
2043
|
+
# df = Polars::DataFrame.new(
|
2044
|
+
# {
|
2045
|
+
# "idx" => Polars.arange(0, 6, eager: true),
|
2046
|
+
# "A" => ["A", "A", "B", "B", "B", "C"]
|
2047
|
+
# }
|
2048
|
+
# )
|
2049
|
+
# df.groupby_dynamic(
|
2050
|
+
# "idx",
|
2051
|
+
# every: "2i",
|
2052
|
+
# period: "3i",
|
2053
|
+
# include_boundaries: true,
|
2054
|
+
# closed: "right"
|
2055
|
+
# ).agg(Polars.col("A").list.alias("A_agg_list"))
|
2056
|
+
# # =>
|
2057
|
+
# # shape: (3, 4)
|
2058
|
+
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
2059
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
2060
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2061
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
2062
|
+
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
2063
|
+
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
2064
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2065
|
+
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2066
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2067
|
+
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
2068
|
+
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
2069
|
+
def groupby_dynamic(
|
2070
|
+
index_column,
|
2071
|
+
every:,
|
2072
|
+
period: nil,
|
2073
|
+
offset: nil,
|
2074
|
+
truncate: true,
|
2075
|
+
include_boundaries: false,
|
2076
|
+
closed: "left",
|
2077
|
+
by: nil
|
2078
|
+
)
|
2079
|
+
DynamicGroupBy.new(
|
2080
|
+
self,
|
2081
|
+
index_column,
|
2082
|
+
every,
|
2083
|
+
period,
|
2084
|
+
offset,
|
2085
|
+
truncate,
|
2086
|
+
include_boundaries,
|
2087
|
+
closed,
|
2088
|
+
by
|
2089
|
+
)
|
2090
|
+
end
|
1555
2091
|
|
1556
|
-
#
|
1557
|
-
#
|
2092
|
+
# Upsample a DataFrame at a regular frequency.
|
2093
|
+
#
|
2094
|
+
# @param time_column [Object]
|
2095
|
+
# time column will be used to determine a date_range.
|
2096
|
+
# Note that this column has to be sorted for the output to make sense.
|
2097
|
+
# @param every [String]
|
2098
|
+
# interval will start 'every' duration
|
2099
|
+
# @param offset [String]
|
2100
|
+
# change the start of the date_range by this offset.
|
2101
|
+
# @param by [Object]
|
2102
|
+
# First group by these columns and then upsample for every group
|
2103
|
+
# @param maintain_order [Boolean]
|
2104
|
+
# Keep the ordering predictable. This is slower.
|
2105
|
+
#
|
2106
|
+
# The `every` and `offset` arguments are created with
|
2107
|
+
# the following string language:
|
2108
|
+
#
|
2109
|
+
# - 1ns (1 nanosecond)
|
2110
|
+
# - 1us (1 microsecond)
|
2111
|
+
# - 1ms (1 millisecond)
|
2112
|
+
# - 1s (1 second)
|
2113
|
+
# - 1m (1 minute)
|
2114
|
+
# - 1h (1 hour)
|
2115
|
+
# - 1d (1 day)
|
2116
|
+
# - 1w (1 week)
|
2117
|
+
# - 1mo (1 calendar month)
|
2118
|
+
# - 1y (1 calendar year)
|
2119
|
+
# - 1i (1 index count)
|
2120
|
+
#
|
2121
|
+
# Or combine them:
|
2122
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2123
|
+
#
|
2124
|
+
# @return [DataFrame]
|
2125
|
+
#
|
2126
|
+
# @example Upsample a DataFrame by a certain interval.
|
2127
|
+
# df = Polars::DataFrame.new(
|
2128
|
+
# {
|
2129
|
+
# "time" => [
|
2130
|
+
# DateTime.new(2021, 2, 1),
|
2131
|
+
# DateTime.new(2021, 4, 1),
|
2132
|
+
# DateTime.new(2021, 5, 1),
|
2133
|
+
# DateTime.new(2021, 6, 1)
|
2134
|
+
# ],
|
2135
|
+
# "groups" => ["A", "B", "A", "B"],
|
2136
|
+
# "values" => [0, 1, 2, 3]
|
2137
|
+
# }
|
2138
|
+
# )
|
2139
|
+
# df.upsample(
|
2140
|
+
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2141
|
+
# ).select(Polars.all.forward_fill)
|
2142
|
+
# # =>
|
2143
|
+
# # shape: (7, 3)
|
2144
|
+
# # ┌─────────────────────┬────────┬────────┐
|
2145
|
+
# # │ time ┆ groups ┆ values │
|
2146
|
+
# # │ --- ┆ --- ┆ --- │
|
2147
|
+
# # │ datetime[ns] ┆ str ┆ i64 │
|
2148
|
+
# # ╞═════════════════════╪════════╪════════╡
|
2149
|
+
# # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
|
2150
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2151
|
+
# # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
|
2152
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2153
|
+
# # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
|
2154
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2155
|
+
# # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
|
2156
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2157
|
+
# # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
|
2158
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2159
|
+
# # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
|
2160
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2161
|
+
# # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
|
2162
|
+
# # └─────────────────────┴────────┴────────┘
|
2163
|
+
def upsample(
|
2164
|
+
time_column:,
|
2165
|
+
every:,
|
2166
|
+
offset: nil,
|
2167
|
+
by: nil,
|
2168
|
+
maintain_order: false
|
2169
|
+
)
|
2170
|
+
if by.nil?
|
2171
|
+
by = []
|
2172
|
+
end
|
2173
|
+
if by.is_a?(String)
|
2174
|
+
by = [by]
|
2175
|
+
end
|
2176
|
+
if offset.nil?
|
2177
|
+
offset = "0ns"
|
2178
|
+
end
|
1558
2179
|
|
1559
|
-
|
1560
|
-
|
2180
|
+
every = Utils._timedelta_to_pl_duration(every)
|
2181
|
+
offset = Utils._timedelta_to_pl_duration(offset)
|
2182
|
+
|
2183
|
+
_from_rbdf(
|
2184
|
+
_df.upsample(by, time_column, every, offset, maintain_order)
|
2185
|
+
)
|
2186
|
+
end
|
2187
|
+
|
2188
|
+
# Perform an asof join.
|
2189
|
+
#
|
2190
|
+
# This is similar to a left-join except that we match on nearest key rather than
|
2191
|
+
# equal keys.
|
2192
|
+
#
|
2193
|
+
# Both DataFrames must be sorted by the asof_join key.
|
2194
|
+
#
|
2195
|
+
# For each row in the left DataFrame:
|
2196
|
+
#
|
2197
|
+
# - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
|
2198
|
+
# - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
|
2199
|
+
#
|
2200
|
+
# The default is "backward".
|
2201
|
+
#
|
2202
|
+
# @param other [DataFrame]
|
2203
|
+
# DataFrame to join with.
|
2204
|
+
# @param left_on [String]
|
2205
|
+
# Join column of the left DataFrame.
|
2206
|
+
# @param right_on [String]
|
2207
|
+
# Join column of the right DataFrame.
|
2208
|
+
# @param on [String]
|
2209
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2210
|
+
# None.
|
2211
|
+
# @param by [Object]
|
2212
|
+
# join on these columns before doing asof join
|
2213
|
+
# @param by_left [Object]
|
2214
|
+
# join on these columns before doing asof join
|
2215
|
+
# @param by_right [Object]
|
2216
|
+
# join on these columns before doing asof join
|
2217
|
+
# @param strategy ["backward", "forward"]
|
2218
|
+
# Join strategy.
|
2219
|
+
# @param suffix [String]
|
2220
|
+
# Suffix to append to columns with a duplicate name.
|
2221
|
+
# @param tolerance [Object]
|
2222
|
+
# Numeric tolerance. By setting this the join will only be done if the near
|
2223
|
+
# keys are within this distance. If an asof join is done on columns of dtype
|
2224
|
+
# "Date", "Datetime", "Duration" or "Time" you use the following string
|
2225
|
+
# language:
|
2226
|
+
#
|
2227
|
+
# - 1ns (1 nanosecond)
|
2228
|
+
# - 1us (1 microsecond)
|
2229
|
+
# - 1ms (1 millisecond)
|
2230
|
+
# - 1s (1 second)
|
2231
|
+
# - 1m (1 minute)
|
2232
|
+
# - 1h (1 hour)
|
2233
|
+
# - 1d (1 day)
|
2234
|
+
# - 1w (1 week)
|
2235
|
+
# - 1mo (1 calendar month)
|
2236
|
+
# - 1y (1 calendar year)
|
2237
|
+
# - 1i (1 index count)
|
2238
|
+
#
|
2239
|
+
# Or combine them:
|
2240
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2241
|
+
#
|
2242
|
+
# @param allow_parallel [Boolean]
|
2243
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
2244
|
+
# DataFrames up to the join in parallel.
|
2245
|
+
# @param force_parallel [Boolean]
|
2246
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
2247
|
+
# the join in parallel.
|
2248
|
+
#
|
2249
|
+
# @return [DataFrame]
|
2250
|
+
#
|
2251
|
+
# @example
|
2252
|
+
# gdp = Polars::DataFrame.new(
|
2253
|
+
# {
|
2254
|
+
# "date" => [
|
2255
|
+
# DateTime.new(2016, 1, 1),
|
2256
|
+
# DateTime.new(2017, 1, 1),
|
2257
|
+
# DateTime.new(2018, 1, 1),
|
2258
|
+
# DateTime.new(2019, 1, 1),
|
2259
|
+
# ], # note record date: Jan 1st (sorted!)
|
2260
|
+
# "gdp" => [4164, 4411, 4566, 4696]
|
2261
|
+
# }
|
2262
|
+
# )
|
2263
|
+
# population = Polars::DataFrame.new(
|
2264
|
+
# {
|
2265
|
+
# "date" => [
|
2266
|
+
# DateTime.new(2016, 5, 12),
|
2267
|
+
# DateTime.new(2017, 5, 12),
|
2268
|
+
# DateTime.new(2018, 5, 12),
|
2269
|
+
# DateTime.new(2019, 5, 12),
|
2270
|
+
# ], # note record date: May 12th (sorted!)
|
2271
|
+
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2272
|
+
# }
|
2273
|
+
# )
|
2274
|
+
# population.join_asof(
|
2275
|
+
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2276
|
+
# )
|
2277
|
+
# # =>
|
2278
|
+
# # shape: (4, 3)
|
2279
|
+
# # ┌─────────────────────┬────────────┬──────┐
|
2280
|
+
# # │ date ┆ population ┆ gdp │
|
2281
|
+
# # │ --- ┆ --- ┆ --- │
|
2282
|
+
# # │ datetime[ns] ┆ f64 ┆ i64 │
|
2283
|
+
# # ╞═════════════════════╪════════════╪══════╡
|
2284
|
+
# # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
|
2285
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2286
|
+
# # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
|
2287
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2288
|
+
# # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
|
2289
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2290
|
+
# # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
|
2291
|
+
# # └─────────────────────┴────────────┴──────┘
|
2292
|
+
def join_asof(
|
2293
|
+
other,
|
2294
|
+
left_on: nil,
|
2295
|
+
right_on: nil,
|
2296
|
+
on: nil,
|
2297
|
+
by_left: nil,
|
2298
|
+
by_right: nil,
|
2299
|
+
by: nil,
|
2300
|
+
strategy: "backward",
|
2301
|
+
suffix: "_right",
|
2302
|
+
tolerance: nil,
|
2303
|
+
allow_parallel: true,
|
2304
|
+
force_parallel: false
|
2305
|
+
)
|
2306
|
+
lazy
|
2307
|
+
.join_asof(
|
2308
|
+
other.lazy,
|
2309
|
+
left_on: left_on,
|
2310
|
+
right_on: right_on,
|
2311
|
+
on: on,
|
2312
|
+
by_left: by_left,
|
2313
|
+
by_right: by_right,
|
2314
|
+
by: by,
|
2315
|
+
strategy: strategy,
|
2316
|
+
suffix: suffix,
|
2317
|
+
tolerance: tolerance,
|
2318
|
+
allow_parallel: allow_parallel,
|
2319
|
+
force_parallel: force_parallel
|
2320
|
+
)
|
2321
|
+
.collect(no_optimization: true)
|
2322
|
+
end
|
1561
2323
|
|
1562
2324
|
# Join in SQL-like fashion.
|
1563
2325
|
#
|
@@ -1675,8 +2437,78 @@ module Polars
|
|
1675
2437
|
.collect(no_optimization: true)
|
1676
2438
|
end
|
1677
2439
|
|
1678
|
-
#
|
1679
|
-
#
|
2440
|
+
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
|
2441
|
+
#
|
2442
|
+
# The UDF will receive each row as a tuple of values: `udf(row)`.
|
2443
|
+
#
|
2444
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
2445
|
+
# slower and more memory intensive than implementing the same logic using
|
2446
|
+
# the native expression API because:
|
2447
|
+
#
|
2448
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
2449
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
2450
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
2451
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
2452
|
+
#
|
2453
|
+
# Wherever possible you should strongly prefer the native expression API
|
2454
|
+
# to achieve the best performance.
|
2455
|
+
#
|
2456
|
+
# @param return_dtype [Symbol]
|
2457
|
+
# Output type of the operation. If none given, Polars tries to infer the type.
|
2458
|
+
# @param inference_size [Integer]
|
2459
|
+
# Only used in the case when the custom function returns rows.
|
2460
|
+
# This uses the first `n` rows to determine the output schema
|
2461
|
+
#
|
2462
|
+
# @return [Object]
|
2463
|
+
#
|
2464
|
+
# @note
|
2465
|
+
# The frame-level `apply` cannot track column names (as the UDF is a black-box
|
2466
|
+
# that may arbitrarily drop, rearrange, transform, or add new columns); if you
|
2467
|
+
# want to apply a UDF such that column names are preserved, you should use the
|
2468
|
+
# expression-level `apply` syntax instead.
|
2469
|
+
#
|
2470
|
+
# @example
|
2471
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2472
|
+
#
|
2473
|
+
# @example Return a DataFrame by mapping each row to a tuple:
|
2474
|
+
# df.apply { |t| [t[0] * 2, t[1] * 3] }
|
2475
|
+
# # =>
|
2476
|
+
# # shape: (3, 2)
|
2477
|
+
# # ┌──────────┬──────────┐
|
2478
|
+
# # │ column_0 ┆ column_1 │
|
2479
|
+
# # │ --- ┆ --- │
|
2480
|
+
# # │ i64 ┆ i64 │
|
2481
|
+
# # ╞══════════╪══════════╡
|
2482
|
+
# # │ 2 ┆ -3 │
|
2483
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2484
|
+
# # │ 4 ┆ 15 │
|
2485
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2486
|
+
# # │ 6 ┆ 24 │
|
2487
|
+
# # └──────────┴──────────┘
|
2488
|
+
#
|
2489
|
+
# @example Return a Series by mapping each row to a scalar:
|
2490
|
+
# df.apply { |t| t[0] * 2 + t[1] }
|
2491
|
+
# # =>
|
2492
|
+
# # shape: (3, 1)
|
2493
|
+
# # ┌───────┐
|
2494
|
+
# # │ apply │
|
2495
|
+
# # │ --- │
|
2496
|
+
# # │ i64 │
|
2497
|
+
# # ╞═══════╡
|
2498
|
+
# # │ 1 │
|
2499
|
+
# # ├╌╌╌╌╌╌╌┤
|
2500
|
+
# # │ 9 │
|
2501
|
+
# # ├╌╌╌╌╌╌╌┤
|
2502
|
+
# # │ 14 │
|
2503
|
+
# # └───────┘
|
2504
|
+
def apply(return_dtype: nil, inference_size: 256, &f)
|
2505
|
+
out, is_df = _df.apply(f, return_dtype, inference_size)
|
2506
|
+
if is_df
|
2507
|
+
_from_rbdf(out)
|
2508
|
+
else
|
2509
|
+
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2510
|
+
end
|
2511
|
+
end
|
1680
2512
|
|
1681
2513
|
# Return a new DataFrame with the column added or replaced.
|
1682
2514
|
#
|
@@ -1998,8 +2830,105 @@ module Polars
|
|
1998
2830
|
self[name]
|
1999
2831
|
end
|
2000
2832
|
|
2001
|
-
#
|
2002
|
-
#
|
2833
|
+
# Fill null values using the specified value or strategy.
|
2834
|
+
#
|
2835
|
+
# @param value [Numeric]
|
2836
|
+
# Value used to fill null values.
|
2837
|
+
# @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
|
2838
|
+
# Strategy used to fill null values.
|
2839
|
+
# @param limit [Integer]
|
2840
|
+
# Number of consecutive null values to fill when using the 'forward' or
|
2841
|
+
# 'backward' strategy.
|
2842
|
+
# @param matches_supertype [Boolean]
|
2843
|
+
# Fill all matching supertype of the fill `value`.
|
2844
|
+
#
|
2845
|
+
# @return [DataFrame]
|
2846
|
+
#
|
2847
|
+
# @example
|
2848
|
+
# df = Polars::DataFrame.new(
|
2849
|
+
# {
|
2850
|
+
# "a" => [1, 2, nil, 4],
|
2851
|
+
# "b" => [0.5, 4, nil, 13]
|
2852
|
+
# }
|
2853
|
+
# )
|
2854
|
+
# df.fill_null(99)
|
2855
|
+
# # =>
|
2856
|
+
# # shape: (4, 2)
|
2857
|
+
# # ┌─────┬──────┐
|
2858
|
+
# # │ a ┆ b │
|
2859
|
+
# # │ --- ┆ --- │
|
2860
|
+
# # │ i64 ┆ f64 │
|
2861
|
+
# # ╞═════╪══════╡
|
2862
|
+
# # │ 1 ┆ 0.5 │
|
2863
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2864
|
+
# # │ 2 ┆ 4.0 │
|
2865
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2866
|
+
# # │ 99 ┆ 99.0 │
|
2867
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2868
|
+
# # │ 4 ┆ 13.0 │
|
2869
|
+
# # └─────┴──────┘
|
2870
|
+
#
|
2871
|
+
# @example
|
2872
|
+
# df.fill_null(strategy: "forward")
|
2873
|
+
# # =>
|
2874
|
+
# # shape: (4, 2)
|
2875
|
+
# # ┌─────┬──────┐
|
2876
|
+
# # │ a ┆ b │
|
2877
|
+
# # │ --- ┆ --- │
|
2878
|
+
# # │ i64 ┆ f64 │
|
2879
|
+
# # ╞═════╪══════╡
|
2880
|
+
# # │ 1 ┆ 0.5 │
|
2881
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2882
|
+
# # │ 2 ┆ 4.0 │
|
2883
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2884
|
+
# # │ 2 ┆ 4.0 │
|
2885
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2886
|
+
# # │ 4 ┆ 13.0 │
|
2887
|
+
# # └─────┴──────┘
|
2888
|
+
#
|
2889
|
+
# @example
|
2890
|
+
# df.fill_null(strategy: "max")
|
2891
|
+
# # =>
|
2892
|
+
# # shape: (4, 2)
|
2893
|
+
# # ┌─────┬──────┐
|
2894
|
+
# # │ a ┆ b │
|
2895
|
+
# # │ --- ┆ --- │
|
2896
|
+
# # │ i64 ┆ f64 │
|
2897
|
+
# # ╞═════╪══════╡
|
2898
|
+
# # │ 1 ┆ 0.5 │
|
2899
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2900
|
+
# # │ 2 ┆ 4.0 │
|
2901
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2902
|
+
# # │ 4 ┆ 13.0 │
|
2903
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2904
|
+
# # │ 4 ┆ 13.0 │
|
2905
|
+
# # └─────┴──────┘
|
2906
|
+
#
|
2907
|
+
# @example
|
2908
|
+
# df.fill_null(strategy: "zero")
|
2909
|
+
# # =>
|
2910
|
+
# # shape: (4, 2)
|
2911
|
+
# # ┌─────┬──────┐
|
2912
|
+
# # │ a ┆ b │
|
2913
|
+
# # │ --- ┆ --- │
|
2914
|
+
# # │ i64 ┆ f64 │
|
2915
|
+
# # ╞═════╪══════╡
|
2916
|
+
# # │ 1 ┆ 0.5 │
|
2917
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2918
|
+
# # │ 2 ┆ 4.0 │
|
2919
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2920
|
+
# # │ 0 ┆ 0.0 │
|
2921
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2922
|
+
# # │ 4 ┆ 13.0 │
|
2923
|
+
# # └─────┴──────┘
|
2924
|
+
def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true)
|
2925
|
+
_from_rbdf(
|
2926
|
+
lazy
|
2927
|
+
.fill_null(value, strategy: strategy, limit: limit, matches_supertype: matches_supertype)
|
2928
|
+
.collect(no_optimization: true)
|
2929
|
+
._df
|
2930
|
+
)
|
2931
|
+
end
|
2003
2932
|
|
2004
2933
|
# Fill floating point NaN values by an Expression evaluation.
|
2005
2934
|
#
|
@@ -2081,17 +3010,404 @@ module Polars
|
|
2081
3010
|
lazy.explode(columns).collect(no_optimization: true)
|
2082
3011
|
end
|
2083
3012
|
|
2084
|
-
#
|
2085
|
-
#
|
3013
|
+
# Create a spreadsheet-style pivot table as a DataFrame.
|
3014
|
+
#
|
3015
|
+
# @param values [Object]
|
3016
|
+
# Column values to aggregate. Can be multiple columns if the *columns*
|
3017
|
+
# arguments contains multiple columns as well
|
3018
|
+
# @param index [Object]
|
3019
|
+
# One or multiple keys to group by
|
3020
|
+
# @param columns [Object]
|
3021
|
+
# Columns whose values will be used as the header of the output DataFrame
|
3022
|
+
# @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
|
3023
|
+
# A predefined aggregate function str or an expression.
|
3024
|
+
# @param maintain_order [Object]
|
3025
|
+
# Sort the grouped keys so that the output order is predictable.
|
3026
|
+
# @param sort_columns [Object]
|
3027
|
+
# Sort the transposed columns by name. Default is by order of discovery.
|
3028
|
+
#
|
3029
|
+
# @return [DataFrame]
|
3030
|
+
#
|
3031
|
+
# @example
|
3032
|
+
# df = Polars::DataFrame.new(
|
3033
|
+
# {
|
3034
|
+
# "foo" => ["one", "one", "one", "two", "two", "two"],
|
3035
|
+
# "bar" => ["A", "B", "C", "A", "B", "C"],
|
3036
|
+
# "baz" => [1, 2, 3, 4, 5, 6]
|
3037
|
+
# }
|
3038
|
+
# )
|
3039
|
+
# df.pivot(values: "baz", index: "foo", columns: "bar")
|
3040
|
+
# # =>
|
3041
|
+
# # shape: (2, 4)
|
3042
|
+
# # ┌─────┬─────┬─────┬─────┐
|
3043
|
+
# # │ foo ┆ A ┆ B ┆ C │
|
3044
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
3045
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
3046
|
+
# # ╞═════╪═════╪═════╪═════╡
|
3047
|
+
# # │ one ┆ 1 ┆ 2 ┆ 3 │
|
3048
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3049
|
+
# # │ two ┆ 4 ┆ 5 ┆ 6 │
|
3050
|
+
# # └─────┴─────┴─────┴─────┘
|
3051
|
+
def pivot(
|
3052
|
+
values:,
|
3053
|
+
index:,
|
3054
|
+
columns:,
|
3055
|
+
aggregate_fn: "first",
|
3056
|
+
maintain_order: true,
|
3057
|
+
sort_columns: false
|
3058
|
+
)
|
3059
|
+
if values.is_a?(String)
|
3060
|
+
values = [values]
|
3061
|
+
end
|
3062
|
+
if index.is_a?(String)
|
3063
|
+
index = [index]
|
3064
|
+
end
|
3065
|
+
if columns.is_a?(String)
|
3066
|
+
columns = [columns]
|
3067
|
+
end
|
2086
3068
|
|
2087
|
-
|
2088
|
-
|
3069
|
+
if aggregate_fn.is_a?(String)
|
3070
|
+
case aggregate_fn
|
3071
|
+
when "first"
|
3072
|
+
aggregate_fn = Polars.element.first
|
3073
|
+
when "sum"
|
3074
|
+
aggregate_fn = Polars.element.sum
|
3075
|
+
when "max"
|
3076
|
+
aggregate_fn = Polars.element.max
|
3077
|
+
when "min"
|
3078
|
+
aggregate_fn = Polars.element.min
|
3079
|
+
when "mean"
|
3080
|
+
aggregate_fn = Polars.element.mean
|
3081
|
+
when "median"
|
3082
|
+
aggregate_fn = Polars.element.median
|
3083
|
+
when "last"
|
3084
|
+
aggregate_fn = Polars.element.last
|
3085
|
+
when "count"
|
3086
|
+
aggregate_fn = Polars.count
|
3087
|
+
else
|
3088
|
+
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3089
|
+
end
|
3090
|
+
end
|
2089
3091
|
|
2090
|
-
|
2091
|
-
|
3092
|
+
_from_rbdf(
|
3093
|
+
_df.pivot_expr(
|
3094
|
+
values,
|
3095
|
+
index,
|
3096
|
+
columns,
|
3097
|
+
aggregate_fn._rbexpr,
|
3098
|
+
maintain_order,
|
3099
|
+
sort_columns
|
3100
|
+
)
|
3101
|
+
)
|
3102
|
+
end
|
2092
3103
|
|
2093
|
-
#
|
2094
|
-
#
|
3104
|
+
# Unpivot a DataFrame from wide to long format.
|
3105
|
+
#
|
3106
|
+
# Optionally leaves identifiers set.
|
3107
|
+
#
|
3108
|
+
# This function is useful to massage a DataFrame into a format where one or more
|
3109
|
+
# columns are identifier variables (id_vars), while all other columns, considered
|
3110
|
+
# measured variables (value_vars), are "unpivoted" to the row axis, leaving just
|
3111
|
+
# two non-identifier columns, 'variable' and 'value'.
|
3112
|
+
#
|
3113
|
+
# @param id_vars [Object]
|
3114
|
+
# Columns to use as identifier variables.
|
3115
|
+
# @param value_vars [Object]
|
3116
|
+
# Values to use as identifier variables.
|
3117
|
+
# If `value_vars` is empty all columns that are not in `id_vars` will be used.
|
3118
|
+
# @param variable_name [String]
|
3119
|
+
# Name to give to the `value` column. Defaults to "variable"
|
3120
|
+
# @param value_name [String]
|
3121
|
+
# Name to give to the `value` column. Defaults to "value"
|
3122
|
+
#
|
3123
|
+
# @return [DataFrame]
|
3124
|
+
#
|
3125
|
+
# @example
|
3126
|
+
# df = Polars::DataFrame.new(
|
3127
|
+
# {
|
3128
|
+
# "a" => ["x", "y", "z"],
|
3129
|
+
# "b" => [1, 3, 5],
|
3130
|
+
# "c" => [2, 4, 6]
|
3131
|
+
# }
|
3132
|
+
# )
|
3133
|
+
# df.melt(id_vars: "a", value_vars: ["b", "c"])
|
3134
|
+
# # =>
|
3135
|
+
# # shape: (6, 3)
|
3136
|
+
# # ┌─────┬──────────┬───────┐
|
3137
|
+
# # │ a ┆ variable ┆ value │
|
3138
|
+
# # │ --- ┆ --- ┆ --- │
|
3139
|
+
# # │ str ┆ str ┆ i64 │
|
3140
|
+
# # ╞═════╪══════════╪═══════╡
|
3141
|
+
# # │ x ┆ b ┆ 1 │
|
3142
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3143
|
+
# # │ y ┆ b ┆ 3 │
|
3144
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3145
|
+
# # │ z ┆ b ┆ 5 │
|
3146
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3147
|
+
# # │ x ┆ c ┆ 2 │
|
3148
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3149
|
+
# # │ y ┆ c ┆ 4 │
|
3150
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3151
|
+
# # │ z ┆ c ┆ 6 │
|
3152
|
+
# # └─────┴──────────┴───────┘
|
3153
|
+
def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
|
3154
|
+
if value_vars.is_a?(String)
|
3155
|
+
value_vars = [value_vars]
|
3156
|
+
end
|
3157
|
+
if id_vars.is_a?(String)
|
3158
|
+
id_vars = [id_vars]
|
3159
|
+
end
|
3160
|
+
if value_vars.nil?
|
3161
|
+
value_vars = []
|
3162
|
+
end
|
3163
|
+
if id_vars.nil?
|
3164
|
+
id_vars = []
|
3165
|
+
end
|
3166
|
+
_from_rbdf(
|
3167
|
+
_df.melt(id_vars, value_vars, value_name, variable_name)
|
3168
|
+
)
|
3169
|
+
end
|
3170
|
+
|
3171
|
+
# Unstack a long table to a wide form without doing an aggregation.
|
3172
|
+
#
|
3173
|
+
# This can be much faster than a pivot, because it can skip the grouping phase.
|
3174
|
+
#
|
3175
|
+
# @note
|
3176
|
+
# This functionality is experimental and may be subject to changes
|
3177
|
+
# without it being considered a breaking change.
|
3178
|
+
#
|
3179
|
+
# @param step Integer
|
3180
|
+
# Number of rows in the unstacked frame.
|
3181
|
+
# @param how ["vertical", "horizontal"]
|
3182
|
+
# Direction of the unstack.
|
3183
|
+
# @param columns [Object]
|
3184
|
+
# Column to include in the operation.
|
3185
|
+
# @param fill_values [Object]
|
3186
|
+
# Fill values that don't fit the new size with this value.
|
3187
|
+
#
|
3188
|
+
# @return [DataFrame]
|
3189
|
+
#
|
3190
|
+
# @example
|
3191
|
+
# df = Polars::DataFrame.new(
|
3192
|
+
# {
|
3193
|
+
# "col1" => "A".."I",
|
3194
|
+
# "col2" => Polars.arange(0, 9, eager: true)
|
3195
|
+
# }
|
3196
|
+
# )
|
3197
|
+
# # =>
|
3198
|
+
# # shape: (9, 2)
|
3199
|
+
# # ┌──────┬──────┐
|
3200
|
+
# # │ col1 ┆ col2 │
|
3201
|
+
# # │ --- ┆ --- │
|
3202
|
+
# # │ str ┆ i64 │
|
3203
|
+
# # ╞══════╪══════╡
|
3204
|
+
# # │ A ┆ 0 │
|
3205
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3206
|
+
# # │ B ┆ 1 │
|
3207
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3208
|
+
# # │ C ┆ 2 │
|
3209
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3210
|
+
# # │ D ┆ 3 │
|
3211
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3212
|
+
# # │ ... ┆ ... │
|
3213
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3214
|
+
# # │ F ┆ 5 │
|
3215
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3216
|
+
# # │ G ┆ 6 │
|
3217
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3218
|
+
# # │ H ┆ 7 │
|
3219
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3220
|
+
# # │ I ┆ 8 │
|
3221
|
+
# # └──────┴──────┘
|
3222
|
+
#
|
3223
|
+
# @example
|
3224
|
+
# df.unstack(step: 3, how: "vertical")
|
3225
|
+
# # =>
|
3226
|
+
# # shape: (3, 6)
|
3227
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3228
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3229
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3230
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3231
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3232
|
+
# # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
|
3233
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3234
|
+
# # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
|
3235
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3236
|
+
# # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
|
3237
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3238
|
+
#
|
3239
|
+
# @example
|
3240
|
+
# df.unstack(step: 3, how: "horizontal")
|
3241
|
+
# # =>
|
3242
|
+
# # shape: (3, 6)
|
3243
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3244
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3245
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3246
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3247
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3248
|
+
# # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
|
3249
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3250
|
+
# # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
|
3251
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3252
|
+
# # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
|
3253
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3254
|
+
def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
|
3255
|
+
if !columns.nil?
|
3256
|
+
df = select(columns)
|
3257
|
+
else
|
3258
|
+
df = self
|
3259
|
+
end
|
3260
|
+
|
3261
|
+
height = df.height
|
3262
|
+
if how == "vertical"
|
3263
|
+
n_rows = step
|
3264
|
+
n_cols = (height / n_rows.to_f).ceil
|
3265
|
+
else
|
3266
|
+
n_cols = step
|
3267
|
+
n_rows = (height / n_cols.to_f).ceil
|
3268
|
+
end
|
3269
|
+
|
3270
|
+
n_fill = n_cols * n_rows - height
|
3271
|
+
|
3272
|
+
if n_fill > 0
|
3273
|
+
if !fill_values.is_a?(Array)
|
3274
|
+
fill_values = [fill_values] * df.width
|
3275
|
+
end
|
3276
|
+
|
3277
|
+
df = df.select(
|
3278
|
+
df.get_columns.zip(fill_values).map do |s, next_fill|
|
3279
|
+
s.extend_constant(next_fill, n_fill)
|
3280
|
+
end
|
3281
|
+
)
|
3282
|
+
end
|
3283
|
+
|
3284
|
+
if how == "horizontal"
|
3285
|
+
df = (
|
3286
|
+
df.with_column(
|
3287
|
+
(Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
|
3288
|
+
"__sort_order"
|
3289
|
+
)
|
3290
|
+
)
|
3291
|
+
.sort("__sort_order")
|
3292
|
+
.drop("__sort_order")
|
3293
|
+
)
|
3294
|
+
end
|
3295
|
+
|
3296
|
+
zfill_val = Math.log10(n_cols).floor + 1
|
3297
|
+
slices =
|
3298
|
+
df.get_columns.flat_map do |s|
|
3299
|
+
n_cols.times.map do |slice_nbr|
|
3300
|
+
s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
|
3301
|
+
end
|
3302
|
+
end
|
3303
|
+
|
3304
|
+
_from_rbdf(DataFrame.new(slices)._df)
|
3305
|
+
end
|
3306
|
+
|
3307
|
+
# Split into multiple DataFrames partitioned by groups.
|
3308
|
+
#
|
3309
|
+
# @param groups [Object]
|
3310
|
+
# Groups to partition by.
|
3311
|
+
# @param maintain_order [Boolean]
|
3312
|
+
# Keep predictable output order. This is slower as it requires an extra sort
|
3313
|
+
# operation.
|
3314
|
+
# @param as_dict [Boolean]
|
3315
|
+
# If true, return the partitions in a dictionary keyed by the distinct group
|
3316
|
+
# values instead of a list.
|
3317
|
+
#
|
3318
|
+
# @return [Object]
|
3319
|
+
#
|
3320
|
+
# @example
|
3321
|
+
# df = Polars::DataFrame.new(
|
3322
|
+
# {
|
3323
|
+
# "foo" => ["A", "A", "B", "B", "C"],
|
3324
|
+
# "N" => [1, 2, 2, 4, 2],
|
3325
|
+
# "bar" => ["k", "l", "m", "m", "l"]
|
3326
|
+
# }
|
3327
|
+
# )
|
3328
|
+
# df.partition_by("foo", maintain_order: true)
|
3329
|
+
# # =>
|
3330
|
+
# # [shape: (2, 3)
|
3331
|
+
# # ┌─────┬─────┬─────┐
|
3332
|
+
# # │ foo ┆ N ┆ bar │
|
3333
|
+
# # │ --- ┆ --- ┆ --- │
|
3334
|
+
# # │ str ┆ i64 ┆ str │
|
3335
|
+
# # ╞═════╪═════╪═════╡
|
3336
|
+
# # │ A ┆ 1 ┆ k │
|
3337
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3338
|
+
# # │ A ┆ 2 ┆ l │
|
3339
|
+
# # └─────┴─────┴─────┘, shape: (2, 3)
|
3340
|
+
# # ┌─────┬─────┬─────┐
|
3341
|
+
# # │ foo ┆ N ┆ bar │
|
3342
|
+
# # │ --- ┆ --- ┆ --- │
|
3343
|
+
# # │ str ┆ i64 ┆ str │
|
3344
|
+
# # ╞═════╪═════╪═════╡
|
3345
|
+
# # │ B ┆ 2 ┆ m │
|
3346
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3347
|
+
# # │ B ┆ 4 ┆ m │
|
3348
|
+
# # └─────┴─────┴─────┘, shape: (1, 3)
|
3349
|
+
# # ┌─────┬─────┬─────┐
|
3350
|
+
# # │ foo ┆ N ┆ bar │
|
3351
|
+
# # │ --- ┆ --- ┆ --- │
|
3352
|
+
# # │ str ┆ i64 ┆ str │
|
3353
|
+
# # ╞═════╪═════╪═════╡
|
3354
|
+
# # │ C ┆ 2 ┆ l │
|
3355
|
+
# # └─────┴─────┴─────┘]
|
3356
|
+
#
|
3357
|
+
# @example
|
3358
|
+
# df.partition_by("foo", maintain_order: true, as_dict: true)
|
3359
|
+
# # =>
|
3360
|
+
# # {"A"=>shape: (2, 3)
|
3361
|
+
# # ┌─────┬─────┬─────┐
|
3362
|
+
# # │ foo ┆ N ┆ bar │
|
3363
|
+
# # │ --- ┆ --- ┆ --- │
|
3364
|
+
# # │ str ┆ i64 ┆ str │
|
3365
|
+
# # ╞═════╪═════╪═════╡
|
3366
|
+
# # │ A ┆ 1 ┆ k │
|
3367
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3368
|
+
# # │ A ┆ 2 ┆ l │
|
3369
|
+
# # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
|
3370
|
+
# # ┌─────┬─────┬─────┐
|
3371
|
+
# # │ foo ┆ N ┆ bar │
|
3372
|
+
# # │ --- ┆ --- ┆ --- │
|
3373
|
+
# # │ str ┆ i64 ┆ str │
|
3374
|
+
# # ╞═════╪═════╪═════╡
|
3375
|
+
# # │ B ┆ 2 ┆ m │
|
3376
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3377
|
+
# # │ B ┆ 4 ┆ m │
|
3378
|
+
# # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
|
3379
|
+
# # ┌─────┬─────┬─────┐
|
3380
|
+
# # │ foo ┆ N ┆ bar │
|
3381
|
+
# # │ --- ┆ --- ┆ --- │
|
3382
|
+
# # │ str ┆ i64 ┆ str │
|
3383
|
+
# # ╞═════╪═════╪═════╡
|
3384
|
+
# # │ C ┆ 2 ┆ l │
|
3385
|
+
# # └─────┴─────┴─────┘}
|
3386
|
+
def partition_by(groups, maintain_order: true, as_dict: false)
|
3387
|
+
if groups.is_a?(String)
|
3388
|
+
groups = [groups]
|
3389
|
+
elsif !groups.is_a?(Array)
|
3390
|
+
groups = Array(groups)
|
3391
|
+
end
|
3392
|
+
|
3393
|
+
if as_dict
|
3394
|
+
out = {}
|
3395
|
+
if groups.length == 1
|
3396
|
+
_df.partition_by(groups, maintain_order).each do |df|
|
3397
|
+
df = _from_rbdf(df)
|
3398
|
+
out[df[groups][0, 0]] = df
|
3399
|
+
end
|
3400
|
+
else
|
3401
|
+
_df.partition_by(groups, maintain_order).each do |df|
|
3402
|
+
df = _from_rbdf(df)
|
3403
|
+
out[df[groups].row(0)] = df
|
3404
|
+
end
|
3405
|
+
end
|
3406
|
+
out
|
3407
|
+
else
|
3408
|
+
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3409
|
+
end
|
3410
|
+
end
|
2095
3411
|
|
2096
3412
|
# Shift values by the given period.
|
2097
3413
|
#
|
@@ -2357,7 +3673,7 @@ module Polars
|
|
2357
3673
|
# [
|
2358
3674
|
# (Polars.col("a") ** 2).alias("a^2"),
|
2359
3675
|
# (Polars.col("b") / 2).alias("b/2"),
|
2360
|
-
# (Polars.col("c").is_not
|
3676
|
+
# (Polars.col("c").is_not).alias("not c")
|
2361
3677
|
# ]
|
2362
3678
|
# )
|
2363
3679
|
# # =>
|
@@ -2964,8 +4280,93 @@ module Polars
|
|
2964
4280
|
_from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
|
2965
4281
|
end
|
2966
4282
|
|
2967
|
-
#
|
2968
|
-
#
|
4283
|
+
# Apply a horizontal reduction on a DataFrame.
|
4284
|
+
#
|
4285
|
+
# This can be used to effectively determine aggregations on a row level, and can
|
4286
|
+
# be applied to any DataType that can be supercasted (casted to a similar parent
|
4287
|
+
# type).
|
4288
|
+
#
|
4289
|
+
# An example of the supercast rules when applying an arithmetic operation on two
|
4290
|
+
# DataTypes are for instance:
|
4291
|
+
#
|
4292
|
+
# i8 + str = str
|
4293
|
+
# f32 + i64 = f32
|
4294
|
+
# f32 + f64 = f64
|
4295
|
+
#
|
4296
|
+
# @return [Series]
|
4297
|
+
#
|
4298
|
+
# @example A horizontal sum operation:
|
4299
|
+
# df = Polars::DataFrame.new(
|
4300
|
+
# {
|
4301
|
+
# "a" => [2, 1, 3],
|
4302
|
+
# "b" => [1, 2, 3],
|
4303
|
+
# "c" => [1.0, 2.0, 3.0]
|
4304
|
+
# }
|
4305
|
+
# )
|
4306
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4307
|
+
# # =>
|
4308
|
+
# # shape: (3,)
|
4309
|
+
# # Series: 'a' [f64]
|
4310
|
+
# # [
|
4311
|
+
# # 4.0
|
4312
|
+
# # 5.0
|
4313
|
+
# # 9.0
|
4314
|
+
# # ]
|
4315
|
+
#
|
4316
|
+
# @example A horizontal minimum operation:
|
4317
|
+
# df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
|
4318
|
+
# df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
|
4319
|
+
# # =>
|
4320
|
+
# # shape: (3,)
|
4321
|
+
# # Series: 'a' [f64]
|
4322
|
+
# # [
|
4323
|
+
# # 1.0
|
4324
|
+
# # 1.0
|
4325
|
+
# # 3.0
|
4326
|
+
# # ]
|
4327
|
+
#
|
4328
|
+
# @example A horizontal string concatenation:
|
4329
|
+
# df = Polars::DataFrame.new(
|
4330
|
+
# {
|
4331
|
+
# "a" => ["foo", "bar", 2],
|
4332
|
+
# "b" => [1, 2, 3],
|
4333
|
+
# "c" => [1.0, 2.0, 3.0]
|
4334
|
+
# }
|
4335
|
+
# )
|
4336
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4337
|
+
# # =>
|
4338
|
+
# # shape: (3,)
|
4339
|
+
# # Series: 'a' [str]
|
4340
|
+
# # [
|
4341
|
+
# # "foo11.0"
|
4342
|
+
# # "bar22.0"
|
4343
|
+
# # null
|
4344
|
+
# # ]
|
4345
|
+
#
|
4346
|
+
# @example A horizontal boolean or, similar to a row-wise .any():
|
4347
|
+
# df = Polars::DataFrame.new(
|
4348
|
+
# {
|
4349
|
+
# "a" => [false, false, true],
|
4350
|
+
# "b" => [false, true, false]
|
4351
|
+
# }
|
4352
|
+
# )
|
4353
|
+
# df.fold { |s1, s2| s1 | s2 }
|
4354
|
+
# # =>
|
4355
|
+
# # shape: (3,)
|
4356
|
+
# # Series: 'a' [bool]
|
4357
|
+
# # [
|
4358
|
+
# # false
|
4359
|
+
# # true
|
4360
|
+
# # true
|
4361
|
+
# # ]
|
4362
|
+
def fold(&operation)
|
4363
|
+
acc = to_series(0)
|
4364
|
+
|
4365
|
+
1.upto(width - 1) do |i|
|
4366
|
+
acc = operation.call(acc, to_series(i))
|
4367
|
+
end
|
4368
|
+
acc
|
4369
|
+
end
|
2969
4370
|
|
2970
4371
|
# Get a row as tuple, either by index or by predicate.
|
2971
4372
|
#
|
@@ -3074,8 +4475,45 @@ module Polars
|
|
3074
4475
|
select(Utils.col("*").take_every(n))
|
3075
4476
|
end
|
3076
4477
|
|
3077
|
-
#
|
3078
|
-
#
|
4478
|
+
# Hash and combine the rows in this DataFrame.
|
4479
|
+
#
|
4480
|
+
# The hash value is of type `:u64`.
|
4481
|
+
#
|
4482
|
+
# @param seed [Integer]
|
4483
|
+
# Random seed parameter. Defaults to 0.
|
4484
|
+
# @param seed_1 [Integer]
|
4485
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4486
|
+
# @param seed_2 [Integer]
|
4487
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4488
|
+
# @param seed_3 [Integer]
|
4489
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4490
|
+
#
|
4491
|
+
# @return [Series]
|
4492
|
+
#
|
4493
|
+
# @example
|
4494
|
+
# df = Polars::DataFrame.new(
|
4495
|
+
# {
|
4496
|
+
# "foo" => [1, nil, 3, 4],
|
4497
|
+
# "ham" => ["a", "b", nil, "d"]
|
4498
|
+
# }
|
4499
|
+
# )
|
4500
|
+
# df.hash_rows(seed: 42)
|
4501
|
+
# # =>
|
4502
|
+
# # shape: (4,)
|
4503
|
+
# # Series: '' [u64]
|
4504
|
+
# # [
|
4505
|
+
# # 4238614331852490969
|
4506
|
+
# # 17976148875586754089
|
4507
|
+
# # 4702262519505526977
|
4508
|
+
# # 18144177983981041107
|
4509
|
+
# # ]
|
4510
|
+
def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
|
4511
|
+
k0 = seed
|
4512
|
+
k1 = seed_1.nil? ? seed : seed_1
|
4513
|
+
k2 = seed_2.nil? ? seed : seed_2
|
4514
|
+
k3 = seed_3.nil? ? seed : seed_3
|
4515
|
+
Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
|
4516
|
+
end
|
3079
4517
|
|
3080
4518
|
# Interpolate intermediate values. The interpolation method is linear.
|
3081
4519
|
#
|
@@ -3200,7 +4638,19 @@ module Polars
|
|
3200
4638
|
self._df = _df._clone
|
3201
4639
|
end
|
3202
4640
|
|
3203
|
-
def
|
4641
|
+
def _pos_idx(idx, dim)
|
4642
|
+
if idx >= 0
|
4643
|
+
idx
|
4644
|
+
else
|
4645
|
+
shape[dim] + idx
|
4646
|
+
end
|
4647
|
+
end
|
4648
|
+
|
4649
|
+
# def _pos_idxs
|
4650
|
+
# end
|
4651
|
+
|
4652
|
+
# @private
|
4653
|
+
def self.hash_to_rbdf(data, columns: nil)
|
3204
4654
|
if !columns.nil?
|
3205
4655
|
columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
|
3206
4656
|
|
@@ -3216,11 +4666,34 @@ module Polars
|
|
3216
4666
|
RbDataFrame.read_hash(data)
|
3217
4667
|
end
|
3218
4668
|
|
3219
|
-
|
3220
|
-
|
4669
|
+
# @private
# Unpack the user-supplied `columns` argument into a pair of
# `[column_names, dtypes]`.
#
# @param columns [Hash, Array, nil]
#   Either a Hash of `name => dtype`, or an Array whose entries are
#   plain String names or `[name, dtype]` pairs (or nil names).
# @param lookup_names [Array, nil]
#   Existing names (e.g. the data's keys) used to map user-facing
#   names back to the underlying column names for the dtype hash.
# @param n_expected [Integer, nil]
#   When no names are given, generate `column_0 .. column_{n-1}`.
#
# @return [Array(Array, Hash)] the resolved names and a dtype hash.
def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
  if columns.is_a?(Hash)
    columns = columns.to_a
  end
  column_names =
    (columns || []).map.with_index do |col, i|
      # A nil entry gets a positional placeholder name. (The previous
      # `col.is_a?(String)` guard made the `|| "column_#{i}"` fallback
      # dead code and crashed on nil via `col[0]`.)
      if col.is_a?(String) || col.nil?
        col || "column_#{i}"
      else
        col[0]
      end
    end
  if column_names.empty? && n_expected
    column_names = n_expected.times.map { |i| "column_#{i}" }
  end
  # TODO zip_longest
  lookup = column_names.zip(lookup_names || []).to_h

  [
    column_names,
    # Only `[name, dtype]` pairs carry a dtype; guard with `Array` so
    # nil/placeholder entries cannot raise on `col[1]`.
    (columns || []).select { |col| col.is_a?(Array) && col[1] }.to_h do |col|
      [lookup[col[0]] || col[0], col[1]]
    end
  ]
end
|
3222
4695
|
|
3223
|
-
def _handle_columns_arg(data, columns: nil)
|
4696
|
+
def self._handle_columns_arg(data, columns: nil)
|
3224
4697
|
if columns.nil?
|
3225
4698
|
data
|
3226
4699
|
else
|
@@ -3238,14 +4711,39 @@ module Polars
|
|
3238
4711
|
end
|
3239
4712
|
end
|
3240
4713
|
|
3241
|
-
|
3242
|
-
|
3243
|
-
|
4714
|
+
# @private
# Construct an RbDataFrame from a sequence (Array) of values.
#
# Currently only an array of `Series` is fully supported; arrays of
# arrays determine an orientation ("row"/"col") but both paths still
# `raise Todo`.
#
# @param data [Array] input sequence.
# @param columns [Object] optional column names/dtypes, forwarded to
#   `_handle_columns_arg`.
# @param orient ["row", "col", nil] layout of nested arrays; inferred
#   from `columns.length` when nil.
#
# @return [RbDataFrame]
def self.sequence_to_rbdf(data, columns: nil, orient: nil)
  # Empty input produces an empty frame (columns, if any, still apply).
  if data.length == 0
    return hash_to_rbdf({}, columns: columns)
  end

  if data[0].is_a?(Series)
    # series_names = data.map(&:name)
    # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
    data_series = []
    data.each do |s|
      data_series << s._s
    end
  elsif data[0].is_a?(Array)
    # Infer orientation: matching lengths means one sub-array per column.
    if orient.nil? && !columns.nil?
      orient = columns.length == data.length ? "col" : "row"
    end

    if orient == "row"
      raise Todo
    elsif orient == "col" || orient.nil?
      raise Todo
    else
      # Fixed message: the ported "{{'col', 'row', nil}}" rendered the
      # doubled braces literally (Python f-string escaping leftover).
      raise ArgumentError, "orient must be one of {'col', 'row', nil}, got #{orient} instead."
    end
  end

  # NOTE(review): if data[0] is neither Series nor Array, data_series is
  # nil here — presumably unreachable for supported inputs; confirm.
  data_series = _handle_columns_arg(data_series, columns: columns)
  RbDataFrame.new(data_series)
end
|
3247
4744
|
|
3248
|
-
|
4745
|
+
# @private
|
4746
|
+
def self.series_to_rbdf(data, columns: nil)
|
3249
4747
|
if columns
|
3250
4748
|
raise Todo
|
3251
4749
|
end
|