polars-df 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +70 -9
- data/Cargo.toml +2 -0
- data/ext/polars/Cargo.toml +6 -1
- data/ext/polars/src/apply/dataframe.rs +292 -0
- data/ext/polars/src/apply/mod.rs +254 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +100 -5
- data/ext/polars/src/dataframe.rs +146 -1
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +72 -1
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lib.rs +165 -1
- data/ext/polars/src/series.rs +296 -0
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1457 -56
- data/lib/polars/dynamic_group_by.rb +49 -0
- data/lib/polars/expr.rb +258 -9
- data/lib/polars/functions.rb +192 -3
- data/lib/polars/group_by.rb +43 -3
- data/lib/polars/io.rb +19 -3
- data/lib/polars/lazy_frame.rb +792 -22
- data/lib/polars/lazy_functions.rb +561 -27
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +132 -10
- data/lib/polars/utils.rb +16 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -1
- metadata +9 -3
data/lib/polars/data_frame.rb
CHANGED
@@ -26,14 +26,14 @@ module Polars
|
|
26
26
|
end
|
27
27
|
|
28
28
|
if data.nil?
|
29
|
-
self._df = hash_to_rbdf({}, columns: columns)
|
29
|
+
self._df = self.class.hash_to_rbdf({}, columns: columns)
|
30
30
|
elsif data.is_a?(Hash)
|
31
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
32
|
-
self._df = hash_to_rbdf(data, columns: columns)
|
32
|
+
self._df = self.class.hash_to_rbdf(data, columns: columns)
|
33
33
|
elsif data.is_a?(Array)
|
34
|
-
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
34
|
+
self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
|
35
35
|
elsif data.is_a?(Series)
|
36
|
-
self._df = series_to_rbdf(data, columns: columns)
|
36
|
+
self._df = self.class.series_to_rbdf(data, columns: columns)
|
37
37
|
else
|
38
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
39
39
|
end
|
@@ -46,11 +46,16 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
#
|
50
|
-
|
49
|
+
# @private
|
50
|
+
def self._from_hashes(data, infer_schema_length: 100, schema: nil)
|
51
|
+
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
|
52
|
+
_from_rbdf(rbdf)
|
53
|
+
end
|
51
54
|
|
52
|
-
#
|
53
|
-
|
55
|
+
# @private
|
56
|
+
def self._from_hash(data, columns: nil)
|
57
|
+
_from_rbdf(hash_to_rbdf(data, columns: columns))
|
58
|
+
end
|
54
59
|
|
55
60
|
# def self._from_records
|
56
61
|
# end
|
@@ -186,8 +191,14 @@ module Polars
|
|
186
191
|
)
|
187
192
|
end
|
188
193
|
|
189
|
-
#
|
190
|
-
|
194
|
+
# @private
|
195
|
+
def self._read_avro(file, columns: nil, n_rows: nil)
|
196
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
197
|
+
file = Utils.format_path(file)
|
198
|
+
end
|
199
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
200
|
+
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
201
|
+
end
|
191
202
|
|
192
203
|
# @private
|
193
204
|
def self._read_ipc(
|
@@ -486,12 +497,6 @@ module Polars
|
|
486
497
|
# def each
|
487
498
|
# end
|
488
499
|
|
489
|
-
# def _pos_idx
|
490
|
-
# end
|
491
|
-
|
492
|
-
# def _pos_idxs
|
493
|
-
# end
|
494
|
-
|
495
500
|
# Returns subset of the DataFrame.
|
496
501
|
#
|
497
502
|
# @return [Object]
|
@@ -554,19 +559,33 @@ module Polars
|
|
554
559
|
|
555
560
|
# df[idx]
|
556
561
|
if item.is_a?(Integer)
|
557
|
-
return slice(_pos_idx(item,
|
562
|
+
return slice(_pos_idx(item, 0), 1)
|
558
563
|
end
|
559
564
|
|
560
565
|
# df[..]
|
561
566
|
if item.is_a?(Range)
|
562
567
|
return Slice.new(self).apply(item)
|
563
568
|
end
|
569
|
+
|
570
|
+
if Utils.is_str_sequence(item, allow_str: false)
|
571
|
+
# select multiple columns
|
572
|
+
# df[["foo", "bar"]]
|
573
|
+
return _from_rbdf(_df.select(item))
|
574
|
+
end
|
564
575
|
end
|
565
576
|
|
566
577
|
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
|
567
578
|
end
|
568
579
|
|
580
|
+
# Set item.
|
581
|
+
#
|
582
|
+
# @return [Object]
|
569
583
|
# def []=(key, value)
|
584
|
+
# if key.is_a?(String)
|
585
|
+
# raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
|
586
|
+
# end
|
587
|
+
|
588
|
+
# raise Todo
|
570
589
|
# end
|
571
590
|
|
572
591
|
# no to_arrow
|
@@ -582,8 +601,24 @@ module Polars
|
|
582
601
|
end
|
583
602
|
end
|
584
603
|
|
585
|
-
#
|
586
|
-
#
|
604
|
+
# Convert every row to a hash.
|
605
|
+
#
|
606
|
+
# Note that this is slow.
|
607
|
+
#
|
608
|
+
# @return [Array]
|
609
|
+
#
|
610
|
+
# @example
|
611
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
612
|
+
# df.to_hashes
|
613
|
+
# # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
|
614
|
+
def to_hashes
|
615
|
+
rbdf = _df
|
616
|
+
names = columns
|
617
|
+
|
618
|
+
height.times.map do |i|
|
619
|
+
names.zip(rbdf.row_tuple(i)).to_h
|
620
|
+
end
|
621
|
+
end
|
587
622
|
|
588
623
|
# def to_numo
|
589
624
|
# end
|
@@ -762,8 +797,24 @@ module Polars
|
|
762
797
|
nil
|
763
798
|
end
|
764
799
|
|
765
|
-
#
|
766
|
-
#
|
800
|
+
# Write to Apache Avro file.
|
801
|
+
#
|
802
|
+
# @param file [String]
|
803
|
+
# File path to which the file should be written.
|
804
|
+
# @param compression ["uncompressed", "snappy", "deflate"]
|
805
|
+
# Compression method. Defaults to "uncompressed".
|
806
|
+
#
|
807
|
+
# @return [nil]
|
808
|
+
def write_avro(file, compression = "uncompressed")
|
809
|
+
if compression.nil?
|
810
|
+
compression = "uncompressed"
|
811
|
+
end
|
812
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
813
|
+
file = Utils.format_path(file)
|
814
|
+
end
|
815
|
+
|
816
|
+
_df.write_avro(file, compression)
|
817
|
+
end
|
767
818
|
|
768
819
|
# Write to Arrow IPC binary stream or Feather file.
|
769
820
|
#
|
@@ -866,8 +917,84 @@ module Polars
|
|
866
917
|
Utils.scale_bytes(sz, to: unit)
|
867
918
|
end
|
868
919
|
|
869
|
-
#
|
870
|
-
#
|
920
|
+
# Transpose a DataFrame over the diagonal.
|
921
|
+
#
|
922
|
+
# @param include_header [Boolean]
|
923
|
+
# If set, the column names will be added as first column.
|
924
|
+
# @param header_name [String]
|
925
|
+
# If `include_header` is set, this determines the name of the column that will
|
926
|
+
# be inserted.
|
927
|
+
# @param column_names [Array]
|
928
|
+
# Optional array (or other enumerable) of column names. Will be used to
|
929
|
+
# replace the columns in the DataFrame.
|
930
|
+
#
|
931
|
+
# @return [DataFrame]
|
932
|
+
#
|
933
|
+
# @note
|
934
|
+
# This is a very expensive operation. Perhaps you can do it differently.
|
935
|
+
#
|
936
|
+
# @example
|
937
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
|
938
|
+
# df.transpose(include_header: true)
|
939
|
+
# # =>
|
940
|
+
# # shape: (2, 4)
|
941
|
+
# # ┌────────┬──────────┬──────────┬──────────┐
|
942
|
+
# # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
|
943
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
944
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
945
|
+
# # ╞════════╪══════════╪══════════╪══════════╡
|
946
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
947
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
948
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
949
|
+
# # └────────┴──────────┴──────────┴──────────┘
|
950
|
+
#
|
951
|
+
# @example Replace the auto-generated column names with a list
|
952
|
+
# df.transpose(include_header: false, column_names: ["a", "b", "c"])
|
953
|
+
# # =>
|
954
|
+
# # shape: (2, 3)
|
955
|
+
# # ┌─────┬─────┬─────┐
|
956
|
+
# # │ a ┆ b ┆ c │
|
957
|
+
# # │ --- ┆ --- ┆ --- │
|
958
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
959
|
+
# # ╞═════╪═════╪═════╡
|
960
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
961
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
962
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
963
|
+
# # └─────┴─────┴─────┘
|
964
|
+
#
|
965
|
+
# @example Include the header as a separate column
|
966
|
+
# df.transpose(
|
967
|
+
# include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
|
968
|
+
# )
|
969
|
+
# # =>
|
970
|
+
# # shape: (2, 4)
|
971
|
+
# # ┌─────┬─────┬─────┬─────┐
|
972
|
+
# # │ foo ┆ a ┆ b ┆ c │
|
973
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
974
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
975
|
+
# # ╞═════╪═════╪═════╪═════╡
|
976
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
977
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
978
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
979
|
+
# # └─────┴─────┴─────┴─────┘
|
980
|
+
def transpose(include_header: false, header_name: "column", column_names: nil)
|
981
|
+
df = _from_rbdf(_df.transpose(include_header, header_name))
|
982
|
+
if !column_names.nil?
|
983
|
+
names = []
|
984
|
+
n = df.width
|
985
|
+
if include_header
|
986
|
+
names << header_name
|
987
|
+
n -= 1
|
988
|
+
end
|
989
|
+
|
990
|
+
column_names = column_names.each
|
991
|
+
n.times do
|
992
|
+
names << column_names.next
|
993
|
+
end
|
994
|
+
df.columns = names
|
995
|
+
end
|
996
|
+
df
|
997
|
+
end
|
871
998
|
|
872
999
|
# Reverse the DataFrame.
|
873
1000
|
#
|
@@ -1462,8 +1589,48 @@ module Polars
|
|
1462
1589
|
_from_rbdf(_df.drop_nulls(subset))
|
1463
1590
|
end
|
1464
1591
|
|
1465
|
-
#
|
1466
|
-
#
|
1592
|
+
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
1593
|
+
#
|
1594
|
+
# @param func [Object]
|
1595
|
+
# Callable; will receive the frame as the first parameter,
|
1596
|
+
# followed by any given args/kwargs.
|
1597
|
+
# @param args [Object]
|
1598
|
+
# Arguments to pass to the UDF.
|
1599
|
+
# @param kwargs [Object]
|
1600
|
+
# Keyword arguments to pass to the UDF.
|
1601
|
+
#
|
1602
|
+
# @return [Object]
|
1603
|
+
#
|
1604
|
+
# @note
|
1605
|
+
# It is recommended to use LazyFrame when piping operations, in order
|
1606
|
+
# to fully take advantage of query optimization and parallelization.
|
1607
|
+
# See {#lazy}.
|
1608
|
+
#
|
1609
|
+
# @example
|
1610
|
+
# cast_str_to_int = lambda do |data, col_name:|
|
1611
|
+
# data.with_column(Polars.col(col_name).cast(:i64))
|
1612
|
+
# end
|
1613
|
+
#
|
1614
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
|
1615
|
+
# df.pipe(cast_str_to_int, col_name: "b")
|
1616
|
+
# # =>
|
1617
|
+
# # shape: (4, 2)
|
1618
|
+
# # ┌─────┬─────┐
|
1619
|
+
# # │ a ┆ b │
|
1620
|
+
# # │ --- ┆ --- │
|
1621
|
+
# # │ i64 ┆ i64 │
|
1622
|
+
# # ╞═════╪═════╡
|
1623
|
+
# # │ 1 ┆ 10 │
|
1624
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1625
|
+
# # │ 2 ┆ 20 │
|
1626
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1627
|
+
# # │ 3 ┆ 30 │
|
1628
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1629
|
+
# # │ 4 ┆ 40 │
|
1630
|
+
# # └─────┴─────┘
|
1631
|
+
def pipe(func, *args, **kwargs, &block)
|
1632
|
+
func.call(self, *args, **kwargs, &block)
|
1633
|
+
end
|
1467
1634
|
|
1468
1635
|
# Add a column at index 0 that counts the rows.
|
1469
1636
|
#
|
@@ -1547,17 +1714,612 @@ module Polars
|
|
1547
1714
|
)
|
1548
1715
|
end
|
1549
1716
|
|
1550
|
-
#
|
1551
|
-
#
|
1717
|
+
# Create rolling groups based on a time column.
|
1718
|
+
#
|
1719
|
+
# Also works for index values of type `:i32` or `:i64`.
|
1720
|
+
#
|
1721
|
+
# Unlike `groupby_dynamic`, the windows are determined by the
|
1722
|
+
# individual values and are not of constant intervals. For constant intervals use
|
1723
|
+
# *groupby_dynamic*
|
1724
|
+
#
|
1725
|
+
# The `period` and `offset` arguments are created either from a timedelta, or
|
1726
|
+
# by using the following string language:
|
1727
|
+
#
|
1728
|
+
# - 1ns (1 nanosecond)
|
1729
|
+
# - 1us (1 microsecond)
|
1730
|
+
# - 1ms (1 millisecond)
|
1731
|
+
# - 1s (1 second)
|
1732
|
+
# - 1m (1 minute)
|
1733
|
+
# - 1h (1 hour)
|
1734
|
+
# - 1d (1 day)
|
1735
|
+
# - 1w (1 week)
|
1736
|
+
# - 1mo (1 calendar month)
|
1737
|
+
# - 1y (1 calendar year)
|
1738
|
+
# - 1i (1 index count)
|
1739
|
+
#
|
1740
|
+
# Or combine them:
|
1741
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1742
|
+
#
|
1743
|
+
# In case of a groupby_rolling on an integer column, the windows are defined by:
|
1744
|
+
#
|
1745
|
+
# - **"1i" # length 1**
|
1746
|
+
# - **"10i" # length 10**
|
1747
|
+
#
|
1748
|
+
# @param index_column [Object]
|
1749
|
+
# Column used to group based on the time window.
|
1750
|
+
# Often of type Date/Datetime.
|
1751
|
+
# This column must be sorted in ascending order. If not the output will not
|
1752
|
+
# make sense.
|
1753
|
+
#
|
1754
|
+
# In case of a rolling groupby on indices, dtype needs to be one of
|
1755
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1756
|
+
# performance matters use an `:i64` column.
|
1757
|
+
# @param period [Object]
|
1758
|
+
# Length of the window.
|
1759
|
+
# @param offset [Object]
|
1760
|
+
# Offset of the window. Default is -period.
|
1761
|
+
# @param closed ["right", "left", "both", "none"]
|
1762
|
+
# Define whether the temporal window interval is closed or not.
|
1763
|
+
# @param by [Object]
|
1764
|
+
# Also group by this column/these columns.
|
1765
|
+
#
|
1766
|
+
# @return [RollingGroupBy]
|
1767
|
+
#
|
1768
|
+
# @example
|
1769
|
+
# dates = [
|
1770
|
+
# "2020-01-01 13:45:48",
|
1771
|
+
# "2020-01-01 16:42:13",
|
1772
|
+
# "2020-01-01 16:45:09",
|
1773
|
+
# "2020-01-02 18:12:48",
|
1774
|
+
# "2020-01-03 19:45:32",
|
1775
|
+
# "2020-01-08 23:16:43"
|
1776
|
+
# ]
|
1777
|
+
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1778
|
+
# Polars.col("dt").str.strptime(:datetime)
|
1779
|
+
# )
|
1780
|
+
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1781
|
+
# [
|
1782
|
+
# Polars.sum("a").alias("sum_a"),
|
1783
|
+
# Polars.min("a").alias("min_a"),
|
1784
|
+
# Polars.max("a").alias("max_a")
|
1785
|
+
# ]
|
1786
|
+
# )
|
1787
|
+
# # =>
|
1788
|
+
# # shape: (6, 4)
|
1789
|
+
# # ┌─────────────────────┬───────┬───────┬───────┐
|
1790
|
+
# # │ dt ┆ sum_a ┆ min_a ┆ max_a │
|
1791
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1792
|
+
# # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
|
1793
|
+
# # ╞═════════════════════╪═══════╪═══════╪═══════╡
|
1794
|
+
# # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
|
1795
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1796
|
+
# # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
|
1797
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1798
|
+
# # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
|
1799
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1800
|
+
# # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
|
1801
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1802
|
+
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1803
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1804
|
+
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1805
|
+
# # └─────────────────────┴───────┴───────┴───────┘
|
1806
|
+
def groupby_rolling(
|
1807
|
+
index_column:,
|
1808
|
+
period:,
|
1809
|
+
offset: nil,
|
1810
|
+
closed: "right",
|
1811
|
+
by: nil
|
1812
|
+
)
|
1813
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1814
|
+
end
|
1552
1815
|
|
1553
|
-
#
|
1554
|
-
#
|
1816
|
+
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1817
|
+
#
|
1818
|
+
# Time windows are calculated and rows are assigned to windows. Different from a
|
1819
|
+
# normal groupby is that a row can be member of multiple groups. The time/index
|
1820
|
+
# window could be seen as a rolling window, with a window size determined by
|
1821
|
+
# dates/times/values instead of slots in the DataFrame.
|
1822
|
+
#
|
1823
|
+
# A window is defined by:
|
1824
|
+
#
|
1825
|
+
# - every: interval of the window
|
1826
|
+
# - period: length of the window
|
1827
|
+
# - offset: offset of the window
|
1828
|
+
#
|
1829
|
+
# The `every`, `period` and `offset` arguments are created with
|
1830
|
+
# the following string language:
|
1831
|
+
#
|
1832
|
+
# - 1ns (1 nanosecond)
|
1833
|
+
# - 1us (1 microsecond)
|
1834
|
+
# - 1ms (1 millisecond)
|
1835
|
+
# - 1s (1 second)
|
1836
|
+
# - 1m (1 minute)
|
1837
|
+
# - 1h (1 hour)
|
1838
|
+
# - 1d (1 day)
|
1839
|
+
# - 1w (1 week)
|
1840
|
+
# - 1mo (1 calendar month)
|
1841
|
+
# - 1y (1 calendar year)
|
1842
|
+
# - 1i (1 index count)
|
1843
|
+
#
|
1844
|
+
# Or combine them:
|
1845
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1846
|
+
#
|
1847
|
+
# In case of a groupby_dynamic on an integer column, the windows are defined by:
|
1848
|
+
#
|
1849
|
+
# - "1i" # length 1
|
1850
|
+
# - "10i" # length 10
|
1851
|
+
#
|
1852
|
+
# @param index_column
|
1853
|
+
# Column used to group based on the time window.
|
1854
|
+
# Often of type Date/Datetime.
|
1855
|
+
# This column must be sorted in ascending order. If not the output will not
|
1856
|
+
# make sense.
|
1857
|
+
#
|
1858
|
+
# In case of a dynamic groupby on indices, dtype needs to be one of
|
1859
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1860
|
+
# performance matters use an `:i64` column.
|
1861
|
+
# @param every
|
1862
|
+
# Interval of the window.
|
1863
|
+
# @param period
|
1864
|
+
# Length of the window; if nil, it is equal to `every`.
|
1865
|
+
# @param offset
|
1866
|
+
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1867
|
+
# `every`.
|
1868
|
+
# @param truncate
|
1869
|
+
# Truncate the time value to the window lower bound.
|
1870
|
+
# @param include_boundaries
|
1871
|
+
# Add the lower and upper bound of the window to the "_lower_boundary" and
|
1872
|
+
# "_upper_boundary" columns. This will impact performance because it's harder to
|
1873
|
+
# parallelize
|
1874
|
+
# @param closed ["right", "left", "both", "none"]
|
1875
|
+
# Define whether the temporal window interval is closed or not.
|
1876
|
+
# @param by
|
1877
|
+
# Also group by this column/these columns
|
1878
|
+
#
|
1879
|
+
# @return [DynamicGroupBy]
|
1880
|
+
#
|
1881
|
+
# @example
|
1882
|
+
# df = Polars::DataFrame.new(
|
1883
|
+
# {
|
1884
|
+
# "time" => Polars.date_range(
|
1885
|
+
# DateTime.new(2021, 12, 16),
|
1886
|
+
# DateTime.new(2021, 12, 16, 3),
|
1887
|
+
# "30m"
|
1888
|
+
# ),
|
1889
|
+
# "n" => 0..6
|
1890
|
+
# }
|
1891
|
+
# )
|
1892
|
+
# # =>
|
1893
|
+
# # shape: (7, 2)
|
1894
|
+
# # ┌─────────────────────┬─────┐
|
1895
|
+
# # │ time ┆ n │
|
1896
|
+
# # │ --- ┆ --- │
|
1897
|
+
# # │ datetime[μs] ┆ i64 │
|
1898
|
+
# # ╞═════════════════════╪═════╡
|
1899
|
+
# # │ 2021-12-16 00:00:00 ┆ 0 │
|
1900
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1901
|
+
# # │ 2021-12-16 00:30:00 ┆ 1 │
|
1902
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1903
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 │
|
1904
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1905
|
+
# # │ 2021-12-16 01:30:00 ┆ 3 │
|
1906
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1907
|
+
# # │ 2021-12-16 02:00:00 ┆ 4 │
|
1908
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1909
|
+
# # │ 2021-12-16 02:30:00 ┆ 5 │
|
1910
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1911
|
+
# # │ 2021-12-16 03:00:00 ┆ 6 │
|
1912
|
+
# # └─────────────────────┴─────┘
|
1913
|
+
#
|
1914
|
+
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1915
|
+
# df.groupby_dynamic("time", every: "1h", closed: "right").agg(
|
1916
|
+
# [
|
1917
|
+
# Polars.col("time").min.alias("time_min"),
|
1918
|
+
# Polars.col("time").max.alias("time_max")
|
1919
|
+
# ]
|
1920
|
+
# )
|
1921
|
+
# # =>
|
1922
|
+
# # shape: (4, 3)
|
1923
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┐
|
1924
|
+
# # │ time ┆ time_min ┆ time_max │
|
1925
|
+
# # │ --- ┆ --- ┆ --- │
|
1926
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
|
1927
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╡
|
1928
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
|
1929
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1930
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
|
1931
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1932
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
|
1933
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1934
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
|
1935
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1936
|
+
#
|
1937
|
+
# @example The window boundaries can also be added to the aggregation result.
|
1938
|
+
# df.groupby_dynamic(
|
1939
|
+
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1940
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
1941
|
+
# # =>
|
1942
|
+
# # shape: (4, 4)
|
1943
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
1944
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
1945
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1946
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
1947
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
1948
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
1949
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1950
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
|
1951
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1952
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
1953
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1954
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
1955
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1956
|
+
#
|
1957
|
+
# @example When closed="left", should not include right end of interval.
|
1958
|
+
# df.groupby_dynamic("time", every: "1h", closed: "left").agg(
|
1959
|
+
# [
|
1960
|
+
# Polars.col("time").count.alias("time_count"),
|
1961
|
+
# Polars.col("time").list.alias("time_agg_list")
|
1962
|
+
# ]
|
1963
|
+
# )
|
1964
|
+
# # =>
|
1965
|
+
# # shape: (4, 3)
|
1966
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
|
1967
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1968
|
+
# # │ --- ┆ --- ┆ --- │
|
1969
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1970
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
|
1971
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
|
1972
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1973
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
|
1974
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1975
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
|
1976
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1977
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1978
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────────┘
|
1979
|
+
#
|
1980
|
+
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1981
|
+
# df.groupby_dynamic("time", every: "1h", closed: "both").agg(
|
1982
|
+
# [Polars.col("time").count.alias("time_count")]
|
1983
|
+
# )
|
1984
|
+
# # =>
|
1985
|
+
# # shape: (5, 2)
|
1986
|
+
# # ┌─────────────────────┬────────────┐
|
1987
|
+
# # │ time ┆ time_count │
|
1988
|
+
# # │ --- ┆ --- │
|
1989
|
+
# # │ datetime[μs] ┆ u32 │
|
1990
|
+
# # ╞═════════════════════╪════════════╡
|
1991
|
+
# # │ 2021-12-15 23:00:00 ┆ 1 │
|
1992
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1993
|
+
# # │ 2021-12-16 00:00:00 ┆ 3 │
|
1994
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1995
|
+
# # │ 2021-12-16 01:00:00 ┆ 3 │
|
1996
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1997
|
+
# # │ 2021-12-16 02:00:00 ┆ 3 │
|
1998
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1999
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
2000
|
+
# # └─────────────────────┴────────────┘
|
2001
|
+
#
|
2002
|
+
# @example Dynamic groupbys can also be combined with grouping on normal keys.
|
2003
|
+
# df = Polars::DataFrame.new(
|
2004
|
+
# {
|
2005
|
+
# "time" => Polars.date_range(
|
2006
|
+
# DateTime.new(2021, 12, 16),
|
2007
|
+
# DateTime.new(2021, 12, 16, 3),
|
2008
|
+
# "30m"
|
2009
|
+
# ),
|
2010
|
+
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2011
|
+
# }
|
2012
|
+
# )
|
2013
|
+
# df.groupby_dynamic(
|
2014
|
+
# "time",
|
2015
|
+
# every: "1h",
|
2016
|
+
# closed: "both",
|
2017
|
+
# by: "groups",
|
2018
|
+
# include_boundaries: true
|
2019
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
2020
|
+
# # =>
|
2021
|
+
# # shape: (7, 5)
|
2022
|
+
# # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
2023
|
+
# # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
2024
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2025
|
+
# # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
2026
|
+
# # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
2027
|
+
# # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
2028
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2029
|
+
# # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
|
2030
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2031
|
+
# # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
|
2032
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2033
|
+
# # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
2034
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2035
|
+
# # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
|
2036
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2037
|
+
# # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
2038
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2039
|
+
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
2040
|
+
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2041
|
+
#
|
2042
|
+
# @example Dynamic groupby on an index column.
|
2043
|
+
# df = Polars::DataFrame.new(
|
2044
|
+
# {
|
2045
|
+
# "idx" => Polars.arange(0, 6, eager: true),
|
2046
|
+
# "A" => ["A", "A", "B", "B", "B", "C"]
|
2047
|
+
# }
|
2048
|
+
# )
|
2049
|
+
# df.groupby_dynamic(
|
2050
|
+
# "idx",
|
2051
|
+
# every: "2i",
|
2052
|
+
# period: "3i",
|
2053
|
+
# include_boundaries: true,
|
2054
|
+
# closed: "right"
|
2055
|
+
# ).agg(Polars.col("A").list.alias("A_agg_list"))
|
2056
|
+
# # =>
|
2057
|
+
# # shape: (3, 4)
|
2058
|
+
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
2059
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
2060
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2061
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
2062
|
+
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
2063
|
+
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
2064
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2065
|
+
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2066
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2067
|
+
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
2068
|
+
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
2069
|
+
def groupby_dynamic(
|
2070
|
+
index_column,
|
2071
|
+
every:,
|
2072
|
+
period: nil,
|
2073
|
+
offset: nil,
|
2074
|
+
truncate: true,
|
2075
|
+
include_boundaries: false,
|
2076
|
+
closed: "left",
|
2077
|
+
by: nil
|
2078
|
+
)
|
2079
|
+
DynamicGroupBy.new(
|
2080
|
+
self,
|
2081
|
+
index_column,
|
2082
|
+
every,
|
2083
|
+
period,
|
2084
|
+
offset,
|
2085
|
+
truncate,
|
2086
|
+
include_boundaries,
|
2087
|
+
closed,
|
2088
|
+
by
|
2089
|
+
)
|
2090
|
+
end
|
1555
2091
|
|
1556
|
-
#
|
1557
|
-
#
|
2092
|
+
# Upsample a DataFrame at a regular frequency.
|
2093
|
+
#
|
2094
|
+
# @param time_column [Object]
|
2095
|
+
# time column will be used to determine a date_range.
|
2096
|
+
# Note that this column has to be sorted for the output to make sense.
|
2097
|
+
# @param every [String]
|
2098
|
+
# interval will start 'every' duration
|
2099
|
+
# @param offset [String]
|
2100
|
+
# change the start of the date_range by this offset.
|
2101
|
+
# @param by [Object]
|
2102
|
+
# First group by these columns and then upsample for every group
|
2103
|
+
# @param maintain_order [Boolean]
|
2104
|
+
# Keep the ordering predictable. This is slower.
|
2105
|
+
#
|
2106
|
+
# The `every` and `offset` arguments are created with
|
2107
|
+
# the following string language:
|
2108
|
+
#
|
2109
|
+
# - 1ns (1 nanosecond)
|
2110
|
+
# - 1us (1 microsecond)
|
2111
|
+
# - 1ms (1 millisecond)
|
2112
|
+
# - 1s (1 second)
|
2113
|
+
# - 1m (1 minute)
|
2114
|
+
# - 1h (1 hour)
|
2115
|
+
# - 1d (1 day)
|
2116
|
+
# - 1w (1 week)
|
2117
|
+
# - 1mo (1 calendar month)
|
2118
|
+
# - 1y (1 calendar year)
|
2119
|
+
# - 1i (1 index count)
|
2120
|
+
#
|
2121
|
+
# Or combine them:
|
2122
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2123
|
+
#
|
2124
|
+
# @return [DataFrame]
|
2125
|
+
#
|
2126
|
+
# @example Upsample a DataFrame by a certain interval.
|
2127
|
+
# df = Polars::DataFrame.new(
|
2128
|
+
# {
|
2129
|
+
# "time" => [
|
2130
|
+
# DateTime.new(2021, 2, 1),
|
2131
|
+
# DateTime.new(2021, 4, 1),
|
2132
|
+
# DateTime.new(2021, 5, 1),
|
2133
|
+
# DateTime.new(2021, 6, 1)
|
2134
|
+
# ],
|
2135
|
+
# "groups" => ["A", "B", "A", "B"],
|
2136
|
+
# "values" => [0, 1, 2, 3]
|
2137
|
+
# }
|
2138
|
+
# )
|
2139
|
+
# df.upsample(
|
2140
|
+
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2141
|
+
# ).select(Polars.all.forward_fill)
|
2142
|
+
# # =>
|
2143
|
+
# # shape: (7, 3)
|
2144
|
+
# # ┌─────────────────────┬────────┬────────┐
|
2145
|
+
# # │ time ┆ groups ┆ values │
|
2146
|
+
# # │ --- ┆ --- ┆ --- │
|
2147
|
+
# # │ datetime[ns] ┆ str ┆ i64 │
|
2148
|
+
# # ╞═════════════════════╪════════╪════════╡
|
2149
|
+
# # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
|
2150
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2151
|
+
# # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
|
2152
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2153
|
+
# # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
|
2154
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2155
|
+
# # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
|
2156
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2157
|
+
# # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
|
2158
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2159
|
+
# # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
|
2160
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2161
|
+
# # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
|
2162
|
+
# # └─────────────────────┴────────┴────────┘
|
2163
|
+
def upsample(
|
2164
|
+
time_column:,
|
2165
|
+
every:,
|
2166
|
+
offset: nil,
|
2167
|
+
by: nil,
|
2168
|
+
maintain_order: false
|
2169
|
+
)
|
2170
|
+
if by.nil?
|
2171
|
+
by = []
|
2172
|
+
end
|
2173
|
+
if by.is_a?(String)
|
2174
|
+
by = [by]
|
2175
|
+
end
|
2176
|
+
if offset.nil?
|
2177
|
+
offset = "0ns"
|
2178
|
+
end
|
1558
2179
|
|
1559
|
-
|
1560
|
-
|
2180
|
+
every = Utils._timedelta_to_pl_duration(every)
|
2181
|
+
offset = Utils._timedelta_to_pl_duration(offset)
|
2182
|
+
|
2183
|
+
_from_rbdf(
|
2184
|
+
_df.upsample(by, time_column, every, offset, maintain_order)
|
2185
|
+
)
|
2186
|
+
end
|
2187
|
+
|
2188
|
+
# Perform an asof join.
|
2189
|
+
#
|
2190
|
+
# This is similar to a left-join except that we match on nearest key rather than
|
2191
|
+
# equal keys.
|
2192
|
+
#
|
2193
|
+
# Both DataFrames must be sorted by the asof_join key.
|
2194
|
+
#
|
2195
|
+
# For each row in the left DataFrame:
|
2196
|
+
#
|
2197
|
+
# - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
|
2198
|
+
# - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
|
2199
|
+
#
|
2200
|
+
# The default is "backward".
|
2201
|
+
#
|
2202
|
+
# @param other [DataFrame]
|
2203
|
+
# DataFrame to join with.
|
2204
|
+
# @param left_on [String]
|
2205
|
+
# Join column of the left DataFrame.
|
2206
|
+
# @param right_on [String]
|
2207
|
+
# Join column of the right DataFrame.
|
2208
|
+
# @param on [String]
|
2209
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2210
|
+
# nil.
|
2211
|
+
# @param by [Object]
|
2212
|
+
# join on these columns before doing asof join
|
2213
|
+
# @param by_left [Object]
|
2214
|
+
# join on these columns before doing asof join
|
2215
|
+
# @param by_right [Object]
|
2216
|
+
# join on these columns before doing asof join
|
2217
|
+
# @param strategy ["backward", "forward"]
|
2218
|
+
# Join strategy.
|
2219
|
+
# @param suffix [String]
|
2220
|
+
# Suffix to append to columns with a duplicate name.
|
2221
|
+
# @param tolerance [Object]
|
2222
|
+
# Numeric tolerance. By setting this the join will only be done if the near
|
2223
|
+
# keys are within this distance. If an asof join is done on columns of dtype
|
2224
|
+
# "Date", "Datetime", "Duration" or "Time", use the following string
|
2225
|
+
# language:
|
2226
|
+
#
|
2227
|
+
# - 1ns (1 nanosecond)
|
2228
|
+
# - 1us (1 microsecond)
|
2229
|
+
# - 1ms (1 millisecond)
|
2230
|
+
# - 1s (1 second)
|
2231
|
+
# - 1m (1 minute)
|
2232
|
+
# - 1h (1 hour)
|
2233
|
+
# - 1d (1 day)
|
2234
|
+
# - 1w (1 week)
|
2235
|
+
# - 1mo (1 calendar month)
|
2236
|
+
# - 1y (1 calendar year)
|
2237
|
+
# - 1i (1 index count)
|
2238
|
+
#
|
2239
|
+
# Or combine them:
|
2240
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2241
|
+
#
|
2242
|
+
# @param allow_parallel [Boolean]
|
2243
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
2244
|
+
# DataFrames up to the join in parallel.
|
2245
|
+
# @param force_parallel [Boolean]
|
2246
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
2247
|
+
# the join in parallel.
|
2248
|
+
#
|
2249
|
+
# @return [DataFrame]
|
2250
|
+
#
|
2251
|
+
# @example
|
2252
|
+
# gdp = Polars::DataFrame.new(
|
2253
|
+
# {
|
2254
|
+
# "date" => [
|
2255
|
+
# DateTime.new(2016, 1, 1),
|
2256
|
+
# DateTime.new(2017, 1, 1),
|
2257
|
+
# DateTime.new(2018, 1, 1),
|
2258
|
+
# DateTime.new(2019, 1, 1),
|
2259
|
+
# ], # note record date: Jan 1st (sorted!)
|
2260
|
+
# "gdp" => [4164, 4411, 4566, 4696]
|
2261
|
+
# }
|
2262
|
+
# )
|
2263
|
+
# population = Polars::DataFrame.new(
|
2264
|
+
# {
|
2265
|
+
# "date" => [
|
2266
|
+
# DateTime.new(2016, 5, 12),
|
2267
|
+
# DateTime.new(2017, 5, 12),
|
2268
|
+
# DateTime.new(2018, 5, 12),
|
2269
|
+
# DateTime.new(2019, 5, 12),
|
2270
|
+
# ], # note record date: May 12th (sorted!)
|
2271
|
+
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2272
|
+
# }
|
2273
|
+
# )
|
2274
|
+
# population.join_asof(
|
2275
|
+
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2276
|
+
# )
|
2277
|
+
# # =>
|
2278
|
+
# # shape: (4, 3)
|
2279
|
+
# # ┌─────────────────────┬────────────┬──────┐
|
2280
|
+
# # │ date ┆ population ┆ gdp │
|
2281
|
+
# # │ --- ┆ --- ┆ --- │
|
2282
|
+
# # │ datetime[ns] ┆ f64 ┆ i64 │
|
2283
|
+
# # ╞═════════════════════╪════════════╪══════╡
|
2284
|
+
# # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
|
2285
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2286
|
+
# # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
|
2287
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2288
|
+
# # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
|
2289
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2290
|
+
# # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
|
2291
|
+
# # └─────────────────────┴────────────┴──────┘
|
2292
|
+
def join_asof(
|
2293
|
+
other,
|
2294
|
+
left_on: nil,
|
2295
|
+
right_on: nil,
|
2296
|
+
on: nil,
|
2297
|
+
by_left: nil,
|
2298
|
+
by_right: nil,
|
2299
|
+
by: nil,
|
2300
|
+
strategy: "backward",
|
2301
|
+
suffix: "_right",
|
2302
|
+
tolerance: nil,
|
2303
|
+
allow_parallel: true,
|
2304
|
+
force_parallel: false
|
2305
|
+
)
|
2306
|
+
lazy
|
2307
|
+
.join_asof(
|
2308
|
+
other.lazy,
|
2309
|
+
left_on: left_on,
|
2310
|
+
right_on: right_on,
|
2311
|
+
on: on,
|
2312
|
+
by_left: by_left,
|
2313
|
+
by_right: by_right,
|
2314
|
+
by: by,
|
2315
|
+
strategy: strategy,
|
2316
|
+
suffix: suffix,
|
2317
|
+
tolerance: tolerance,
|
2318
|
+
allow_parallel: allow_parallel,
|
2319
|
+
force_parallel: force_parallel
|
2320
|
+
)
|
2321
|
+
.collect(no_optimization: true)
|
2322
|
+
end
|
1561
2323
|
|
1562
2324
|
# Join in SQL-like fashion.
|
1563
2325
|
#
|
@@ -1675,8 +2437,78 @@ module Polars
|
|
1675
2437
|
.collect(no_optimization: true)
|
1676
2438
|
end
|
1677
2439
|
|
1678
|
-
#
|
1679
|
-
#
|
2440
|
+
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
|
2441
|
+
#
|
2442
|
+
# The UDF will receive each row as a tuple of values: `udf(row)`.
|
2443
|
+
#
|
2444
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
2445
|
+
# slower and more memory intensive than implementing the same logic using
|
2446
|
+
# the native expression API because:
|
2447
|
+
#
|
2448
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
2449
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
2450
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
2451
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
2452
|
+
#
|
2453
|
+
# Wherever possible you should strongly prefer the native expression API
|
2454
|
+
# to achieve the best performance.
|
2455
|
+
#
|
2456
|
+
# @param return_dtype [Symbol]
|
2457
|
+
# Output type of the operation. If none given, Polars tries to infer the type.
|
2458
|
+
# @param inference_size [Integer]
|
2459
|
+
# Only used in the case when the custom function returns rows.
|
2460
|
+
# This uses the first `inference_size` rows to determine the output schema.
|
2461
|
+
#
|
2462
|
+
# @return [Object]
|
2463
|
+
#
|
2464
|
+
# @note
|
2465
|
+
# The frame-level `apply` cannot track column names (as the UDF is a black-box
|
2466
|
+
# that may arbitrarily drop, rearrange, transform, or add new columns); if you
|
2467
|
+
# want to apply a UDF such that column names are preserved, you should use the
|
2468
|
+
# expression-level `apply` syntax instead.
|
2469
|
+
#
|
2470
|
+
# @example
|
2471
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2472
|
+
#
|
2473
|
+
# @example Return a DataFrame by mapping each row to a tuple:
|
2474
|
+
# df.apply { |t| [t[0] * 2, t[1] * 3] }
|
2475
|
+
# # =>
|
2476
|
+
# # shape: (3, 2)
|
2477
|
+
# # ┌──────────┬──────────┐
|
2478
|
+
# # │ column_0 ┆ column_1 │
|
2479
|
+
# # │ --- ┆ --- │
|
2480
|
+
# # │ i64 ┆ i64 │
|
2481
|
+
# # ╞══════════╪══════════╡
|
2482
|
+
# # │ 2 ┆ -3 │
|
2483
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2484
|
+
# # │ 4 ┆ 15 │
|
2485
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2486
|
+
# # │ 6 ┆ 24 │
|
2487
|
+
# # └──────────┴──────────┘
|
2488
|
+
#
|
2489
|
+
# @example Return a Series by mapping each row to a scalar:
|
2490
|
+
# df.apply { |t| t[0] * 2 + t[1] }
|
2491
|
+
# # =>
|
2492
|
+
# # shape: (3, 1)
|
2493
|
+
# # ┌───────┐
|
2494
|
+
# # │ apply │
|
2495
|
+
# # │ --- │
|
2496
|
+
# # │ i64 │
|
2497
|
+
# # ╞═══════╡
|
2498
|
+
# # │ 1 │
|
2499
|
+
# # ├╌╌╌╌╌╌╌┤
|
2500
|
+
# # │ 9 │
|
2501
|
+
# # ├╌╌╌╌╌╌╌┤
|
2502
|
+
# # │ 14 │
|
2503
|
+
# # └───────┘
|
2504
|
+
def apply(return_dtype: nil, inference_size: 256, &f)
|
2505
|
+
out, is_df = _df.apply(f, return_dtype, inference_size)
|
2506
|
+
if is_df
|
2507
|
+
_from_rbdf(out)
|
2508
|
+
else
|
2509
|
+
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2510
|
+
end
|
2511
|
+
end
|
1680
2512
|
|
1681
2513
|
# Return a new DataFrame with the column added or replaced.
|
1682
2514
|
#
|
@@ -2178,17 +3010,404 @@ module Polars
|
|
2178
3010
|
lazy.explode(columns).collect(no_optimization: true)
|
2179
3011
|
end
|
2180
3012
|
|
2181
|
-
#
|
2182
|
-
#
|
3013
|
+
# Create a spreadsheet-style pivot table as a DataFrame.
|
3014
|
+
#
|
3015
|
+
# @param values [Object]
|
3016
|
+
# Column values to aggregate. Can be multiple columns if the *columns*
|
3017
|
+
# argument contains multiple columns as well.
|
3018
|
+
# @param index [Object]
|
3019
|
+
# One or multiple keys to group by
|
3020
|
+
# @param columns [Object]
|
3021
|
+
# Columns whose values will be used as the header of the output DataFrame
|
3022
|
+
# @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
|
3023
|
+
# A predefined aggregate function str or an expression.
|
3024
|
+
# @param maintain_order [Object]
|
3025
|
+
# Sort the grouped keys so that the output order is predictable.
|
3026
|
+
# @param sort_columns [Object]
|
3027
|
+
# Sort the transposed columns by name. Default is by order of discovery.
|
3028
|
+
#
|
3029
|
+
# @return [DataFrame]
|
3030
|
+
#
|
3031
|
+
# @example
|
3032
|
+
# df = Polars::DataFrame.new(
|
3033
|
+
# {
|
3034
|
+
# "foo" => ["one", "one", "one", "two", "two", "two"],
|
3035
|
+
# "bar" => ["A", "B", "C", "A", "B", "C"],
|
3036
|
+
# "baz" => [1, 2, 3, 4, 5, 6]
|
3037
|
+
# }
|
3038
|
+
# )
|
3039
|
+
# df.pivot(values: "baz", index: "foo", columns: "bar")
|
3040
|
+
# # =>
|
3041
|
+
# # shape: (2, 4)
|
3042
|
+
# # ┌─────┬─────┬─────┬─────┐
|
3043
|
+
# # │ foo ┆ A ┆ B ┆ C │
|
3044
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
3045
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
3046
|
+
# # ╞═════╪═════╪═════╪═════╡
|
3047
|
+
# # │ one ┆ 1 ┆ 2 ┆ 3 │
|
3048
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3049
|
+
# # │ two ┆ 4 ┆ 5 ┆ 6 │
|
3050
|
+
# # └─────┴─────┴─────┴─────┘
|
3051
|
+
def pivot(
|
3052
|
+
values:,
|
3053
|
+
index:,
|
3054
|
+
columns:,
|
3055
|
+
aggregate_fn: "first",
|
3056
|
+
maintain_order: true,
|
3057
|
+
sort_columns: false
|
3058
|
+
)
|
3059
|
+
if values.is_a?(String)
|
3060
|
+
values = [values]
|
3061
|
+
end
|
3062
|
+
if index.is_a?(String)
|
3063
|
+
index = [index]
|
3064
|
+
end
|
3065
|
+
if columns.is_a?(String)
|
3066
|
+
columns = [columns]
|
3067
|
+
end
|
2183
3068
|
|
2184
|
-
|
2185
|
-
|
3069
|
+
if aggregate_fn.is_a?(String)
|
3070
|
+
case aggregate_fn
|
3071
|
+
when "first"
|
3072
|
+
aggregate_fn = Polars.element.first
|
3073
|
+
when "sum"
|
3074
|
+
aggregate_fn = Polars.element.sum
|
3075
|
+
when "max"
|
3076
|
+
aggregate_fn = Polars.element.max
|
3077
|
+
when "min"
|
3078
|
+
aggregate_fn = Polars.element.min
|
3079
|
+
when "mean"
|
3080
|
+
aggregate_fn = Polars.element.mean
|
3081
|
+
when "median"
|
3082
|
+
aggregate_fn = Polars.element.median
|
3083
|
+
when "last"
|
3084
|
+
aggregate_fn = Polars.element.last
|
3085
|
+
when "count"
|
3086
|
+
aggregate_fn = Polars.count
|
3087
|
+
else
|
3088
|
+
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3089
|
+
end
|
3090
|
+
end
|
2186
3091
|
|
2187
|
-
|
2188
|
-
|
3092
|
+
_from_rbdf(
|
3093
|
+
_df.pivot_expr(
|
3094
|
+
values,
|
3095
|
+
index,
|
3096
|
+
columns,
|
3097
|
+
aggregate_fn._rbexpr,
|
3098
|
+
maintain_order,
|
3099
|
+
sort_columns
|
3100
|
+
)
|
3101
|
+
)
|
3102
|
+
end
|
2189
3103
|
|
2190
|
-
#
|
2191
|
-
#
|
3104
|
+
# Unpivot a DataFrame from wide to long format.
|
3105
|
+
#
|
3106
|
+
# Optionally leaves identifiers set.
|
3107
|
+
#
|
3108
|
+
# This function is useful to massage a DataFrame into a format where one or more
|
3109
|
+
# columns are identifier variables (id_vars), while all other columns, considered
|
3110
|
+
# measured variables (value_vars), are "unpivoted" to the row axis, leaving just
|
3111
|
+
# two non-identifier columns, 'variable' and 'value'.
|
3112
|
+
#
|
3113
|
+
# @param id_vars [Object]
|
3114
|
+
# Columns to use as identifier variables.
|
3115
|
+
# @param value_vars [Object]
|
3116
|
+
# Columns to use as measured (value) variables, i.e. the columns to unpivot.
|
3117
|
+
# If `value_vars` is empty all columns that are not in `id_vars` will be used.
|
3118
|
+
# @param variable_name [String]
|
3119
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
3120
|
+
# @param value_name [String]
|
3121
|
+
# Name to give to the `value` column. Defaults to "value"
|
3122
|
+
#
|
3123
|
+
# @return [DataFrame]
|
3124
|
+
#
|
3125
|
+
# @example
|
3126
|
+
# df = Polars::DataFrame.new(
|
3127
|
+
# {
|
3128
|
+
# "a" => ["x", "y", "z"],
|
3129
|
+
# "b" => [1, 3, 5],
|
3130
|
+
# "c" => [2, 4, 6]
|
3131
|
+
# }
|
3132
|
+
# )
|
3133
|
+
# df.melt(id_vars: "a", value_vars: ["b", "c"])
|
3134
|
+
# # =>
|
3135
|
+
# # shape: (6, 3)
|
3136
|
+
# # ┌─────┬──────────┬───────┐
|
3137
|
+
# # │ a ┆ variable ┆ value │
|
3138
|
+
# # │ --- ┆ --- ┆ --- │
|
3139
|
+
# # │ str ┆ str ┆ i64 │
|
3140
|
+
# # ╞═════╪══════════╪═══════╡
|
3141
|
+
# # │ x ┆ b ┆ 1 │
|
3142
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3143
|
+
# # │ y ┆ b ┆ 3 │
|
3144
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3145
|
+
# # │ z ┆ b ┆ 5 │
|
3146
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3147
|
+
# # │ x ┆ c ┆ 2 │
|
3148
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3149
|
+
# # │ y ┆ c ┆ 4 │
|
3150
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3151
|
+
# # │ z ┆ c ┆ 6 │
|
3152
|
+
# # └─────┴──────────┴───────┘
|
3153
|
+
def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
|
3154
|
+
if value_vars.is_a?(String)
|
3155
|
+
value_vars = [value_vars]
|
3156
|
+
end
|
3157
|
+
if id_vars.is_a?(String)
|
3158
|
+
id_vars = [id_vars]
|
3159
|
+
end
|
3160
|
+
if value_vars.nil?
|
3161
|
+
value_vars = []
|
3162
|
+
end
|
3163
|
+
if id_vars.nil?
|
3164
|
+
id_vars = []
|
3165
|
+
end
|
3166
|
+
_from_rbdf(
|
3167
|
+
_df.melt(id_vars, value_vars, value_name, variable_name)
|
3168
|
+
)
|
3169
|
+
end
|
3170
|
+
|
3171
|
+
# Unstack a long table to a wide form without doing an aggregation.
|
3172
|
+
#
|
3173
|
+
# This can be much faster than a pivot, because it can skip the grouping phase.
|
3174
|
+
#
|
3175
|
+
# @note
|
3176
|
+
# This functionality is experimental and may be subject to changes
|
3177
|
+
# without it being considered a breaking change.
|
3178
|
+
#
|
3179
|
+
# @param step [Integer]
|
3180
|
+
# Number of rows in the unstacked frame.
|
3181
|
+
# @param how ["vertical", "horizontal"]
|
3182
|
+
# Direction of the unstack.
|
3183
|
+
# @param columns [Object]
|
3184
|
+
# Column to include in the operation.
|
3185
|
+
# @param fill_values [Object]
|
3186
|
+
# Fill values that don't fit the new size with this value.
|
3187
|
+
#
|
3188
|
+
# @return [DataFrame]
|
3189
|
+
#
|
3190
|
+
# @example
|
3191
|
+
# df = Polars::DataFrame.new(
|
3192
|
+
# {
|
3193
|
+
# "col1" => "A".."I",
|
3194
|
+
# "col2" => Polars.arange(0, 9, eager: true)
|
3195
|
+
# }
|
3196
|
+
# )
|
3197
|
+
# # =>
|
3198
|
+
# # shape: (9, 2)
|
3199
|
+
# # ┌──────┬──────┐
|
3200
|
+
# # │ col1 ┆ col2 │
|
3201
|
+
# # │ --- ┆ --- │
|
3202
|
+
# # │ str ┆ i64 │
|
3203
|
+
# # ╞══════╪══════╡
|
3204
|
+
# # │ A ┆ 0 │
|
3205
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3206
|
+
# # │ B ┆ 1 │
|
3207
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3208
|
+
# # │ C ┆ 2 │
|
3209
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3210
|
+
# # │ D ┆ 3 │
|
3211
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3212
|
+
# # │ ... ┆ ... │
|
3213
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3214
|
+
# # │ F ┆ 5 │
|
3215
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3216
|
+
# # │ G ┆ 6 │
|
3217
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3218
|
+
# # │ H ┆ 7 │
|
3219
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3220
|
+
# # │ I ┆ 8 │
|
3221
|
+
# # └──────┴──────┘
|
3222
|
+
#
|
3223
|
+
# @example
|
3224
|
+
# df.unstack(step: 3, how: "vertical")
|
3225
|
+
# # =>
|
3226
|
+
# # shape: (3, 6)
|
3227
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3228
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3229
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3230
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3231
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3232
|
+
# # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
|
3233
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3234
|
+
# # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
|
3235
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3236
|
+
# # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
|
3237
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3238
|
+
#
|
3239
|
+
# @example
|
3240
|
+
# df.unstack(step: 3, how: "horizontal")
|
3241
|
+
# # =>
|
3242
|
+
# # shape: (3, 6)
|
3243
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3244
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3245
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3246
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3247
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3248
|
+
# # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
|
3249
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3250
|
+
# # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
|
3251
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3252
|
+
# # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
|
3253
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3254
|
+
def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
|
3255
|
+
if !columns.nil?
|
3256
|
+
df = select(columns)
|
3257
|
+
else
|
3258
|
+
df = self
|
3259
|
+
end
|
3260
|
+
|
3261
|
+
height = df.height
|
3262
|
+
if how == "vertical"
|
3263
|
+
n_rows = step
|
3264
|
+
n_cols = (height / n_rows.to_f).ceil
|
3265
|
+
else
|
3266
|
+
n_cols = step
|
3267
|
+
n_rows = (height / n_cols.to_f).ceil
|
3268
|
+
end
|
3269
|
+
|
3270
|
+
n_fill = n_cols * n_rows - height
|
3271
|
+
|
3272
|
+
if n_fill > 0
|
3273
|
+
if !fill_values.is_a?(Array)
|
3274
|
+
fill_values = [fill_values] * df.width
|
3275
|
+
end
|
3276
|
+
|
3277
|
+
df = df.select(
|
3278
|
+
df.get_columns.zip(fill_values).map do |s, next_fill|
|
3279
|
+
s.extend_constant(next_fill, n_fill)
|
3280
|
+
end
|
3281
|
+
)
|
3282
|
+
end
|
3283
|
+
|
3284
|
+
if how == "horizontal"
|
3285
|
+
df = (
|
3286
|
+
df.with_column(
|
3287
|
+
(Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
|
3288
|
+
"__sort_order"
|
3289
|
+
)
|
3290
|
+
)
|
3291
|
+
.sort("__sort_order")
|
3292
|
+
.drop("__sort_order")
|
3293
|
+
)
|
3294
|
+
end
|
3295
|
+
|
3296
|
+
zfill_val = Math.log10(n_cols).floor + 1
|
3297
|
+
slices =
|
3298
|
+
df.get_columns.flat_map do |s|
|
3299
|
+
n_cols.times.map do |slice_nbr|
|
3300
|
+
s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
|
3301
|
+
end
|
3302
|
+
end
|
3303
|
+
|
3304
|
+
_from_rbdf(DataFrame.new(slices)._df)
|
3305
|
+
end
|
3306
|
+
|
3307
|
+
# Split into multiple DataFrames partitioned by groups.
|
3308
|
+
#
|
3309
|
+
# @param groups [Object]
|
3310
|
+
# Groups to partition by.
|
3311
|
+
# @param maintain_order [Boolean]
|
3312
|
+
# Keep predictable output order. This is slower as it requires an extra sort
|
3313
|
+
# operation.
|
3314
|
+
# @param as_dict [Boolean]
|
3315
|
+
# If true, return the partitions in a hash keyed by the distinct group
|
3316
|
+
# values instead of a list.
|
3317
|
+
#
|
3318
|
+
# @return [Object]
|
3319
|
+
#
|
3320
|
+
# @example
|
3321
|
+
# df = Polars::DataFrame.new(
|
3322
|
+
# {
|
3323
|
+
# "foo" => ["A", "A", "B", "B", "C"],
|
3324
|
+
# "N" => [1, 2, 2, 4, 2],
|
3325
|
+
# "bar" => ["k", "l", "m", "m", "l"]
|
3326
|
+
# }
|
3327
|
+
# )
|
3328
|
+
# df.partition_by("foo", maintain_order: true)
|
3329
|
+
# # =>
|
3330
|
+
# # [shape: (2, 3)
|
3331
|
+
# # ┌─────┬─────┬─────┐
|
3332
|
+
# # │ foo ┆ N ┆ bar │
|
3333
|
+
# # │ --- ┆ --- ┆ --- │
|
3334
|
+
# # │ str ┆ i64 ┆ str │
|
3335
|
+
# # ╞═════╪═════╪═════╡
|
3336
|
+
# # │ A ┆ 1 ┆ k │
|
3337
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3338
|
+
# # │ A ┆ 2 ┆ l │
|
3339
|
+
# # └─────┴─────┴─────┘, shape: (2, 3)
|
3340
|
+
# # ┌─────┬─────┬─────┐
|
3341
|
+
# # │ foo ┆ N ┆ bar │
|
3342
|
+
# # │ --- ┆ --- ┆ --- │
|
3343
|
+
# # │ str ┆ i64 ┆ str │
|
3344
|
+
# # ╞═════╪═════╪═════╡
|
3345
|
+
# # │ B ┆ 2 ┆ m │
|
3346
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3347
|
+
# # │ B ┆ 4 ┆ m │
|
3348
|
+
# # └─────┴─────┴─────┘, shape: (1, 3)
|
3349
|
+
# # ┌─────┬─────┬─────┐
|
3350
|
+
# # │ foo ┆ N ┆ bar │
|
3351
|
+
# # │ --- ┆ --- ┆ --- │
|
3352
|
+
# # │ str ┆ i64 ┆ str │
|
3353
|
+
# # ╞═════╪═════╪═════╡
|
3354
|
+
# # │ C ┆ 2 ┆ l │
|
3355
|
+
# # └─────┴─────┴─────┘]
|
3356
|
+
#
|
3357
|
+
# @example
|
3358
|
+
# df.partition_by("foo", maintain_order: true, as_dict: true)
|
3359
|
+
# # =>
|
3360
|
+
# # {"A"=>shape: (2, 3)
|
3361
|
+
# # ┌─────┬─────┬─────┐
|
3362
|
+
# # │ foo ┆ N ┆ bar │
|
3363
|
+
# # │ --- ┆ --- ┆ --- │
|
3364
|
+
# # │ str ┆ i64 ┆ str │
|
3365
|
+
# # ╞═════╪═════╪═════╡
|
3366
|
+
# # │ A ┆ 1 ┆ k │
|
3367
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3368
|
+
# # │ A ┆ 2 ┆ l │
|
3369
|
+
# # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
|
3370
|
+
# # ┌─────┬─────┬─────┐
|
3371
|
+
# # │ foo ┆ N ┆ bar │
|
3372
|
+
# # │ --- ┆ --- ┆ --- │
|
3373
|
+
# # │ str ┆ i64 ┆ str │
|
3374
|
+
# # ╞═════╪═════╪═════╡
|
3375
|
+
# # │ B ┆ 2 ┆ m │
|
3376
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3377
|
+
# # │ B ┆ 4 ┆ m │
|
3378
|
+
# # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
|
3379
|
+
# # ┌─────┬─────┬─────┐
|
3380
|
+
# # │ foo ┆ N ┆ bar │
|
3381
|
+
# # │ --- ┆ --- ┆ --- │
|
3382
|
+
# # │ str ┆ i64 ┆ str │
|
3383
|
+
# # ╞═════╪═════╪═════╡
|
3384
|
+
# # │ C ┆ 2 ┆ l │
|
3385
|
+
# # └─────┴─────┴─────┘}
|
3386
|
+
def partition_by(groups, maintain_order: true, as_dict: false)
|
3387
|
+
if groups.is_a?(String)
|
3388
|
+
groups = [groups]
|
3389
|
+
elsif !groups.is_a?(Array)
|
3390
|
+
groups = Array(groups)
|
3391
|
+
end
|
3392
|
+
|
3393
|
+
if as_dict
|
3394
|
+
out = {}
|
3395
|
+
if groups.length == 1
|
3396
|
+
_df.partition_by(groups, maintain_order).each do |df|
|
3397
|
+
df = _from_rbdf(df)
|
3398
|
+
out[df[groups][0, 0]] = df
|
3399
|
+
end
|
3400
|
+
else
|
3401
|
+
_df.partition_by(groups, maintain_order).each do |df|
|
3402
|
+
df = _from_rbdf(df)
|
3403
|
+
out[df[groups].row(0)] = df
|
3404
|
+
end
|
3405
|
+
end
|
3406
|
+
out
|
3407
|
+
else
|
3408
|
+
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3409
|
+
end
|
3410
|
+
end
|
2192
3411
|
|
2193
3412
|
# Shift values by the given period.
|
2194
3413
|
#
|
@@ -3061,8 +4280,93 @@ module Polars
|
|
3061
4280
|
_from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
|
3062
4281
|
end
|
3063
4282
|
|
3064
|
-
#
|
3065
|
-
#
|
4283
|
+
# Apply a horizontal reduction on a DataFrame.
|
4284
|
+
#
|
4285
|
+
# This can be used to effectively determine aggregations on a row level, and can
|
4286
|
+
# be applied to any DataType that can be supercast (cast to a similar parent
|
4287
|
+
# type).
|
4288
|
+
#
|
4289
|
+
# Examples of the supercast rules when applying an arithmetic operation on two
|
4290
|
+
# DataTypes are for instance:
|
4291
|
+
#
|
4292
|
+
# i8 + str = str
|
4293
|
+
# f32 + i64 = f32
|
4294
|
+
# f32 + f64 = f64
|
4295
|
+
#
|
4296
|
+
# @return [Series]
|
4297
|
+
#
|
4298
|
+
# @example A horizontal sum operation:
|
4299
|
+
# df = Polars::DataFrame.new(
|
4300
|
+
# {
|
4301
|
+
# "a" => [2, 1, 3],
|
4302
|
+
# "b" => [1, 2, 3],
|
4303
|
+
# "c" => [1.0, 2.0, 3.0]
|
4304
|
+
# }
|
4305
|
+
# )
|
4306
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4307
|
+
# # =>
|
4308
|
+
# # shape: (3,)
|
4309
|
+
# # Series: 'a' [f64]
|
4310
|
+
# # [
|
4311
|
+
# # 4.0
|
4312
|
+
# # 5.0
|
4313
|
+
# # 9.0
|
4314
|
+
# # ]
|
4315
|
+
#
|
4316
|
+
# @example A horizontal minimum operation:
|
4317
|
+
# df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
|
4318
|
+
# df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
|
4319
|
+
# # =>
|
4320
|
+
# # shape: (3,)
|
4321
|
+
# # Series: 'a' [f64]
|
4322
|
+
# # [
|
4323
|
+
# # 1.0
|
4324
|
+
# # 1.0
|
4325
|
+
# # 3.0
|
4326
|
+
# # ]
|
4327
|
+
#
|
4328
|
+
# @example A horizontal string concatenation:
|
4329
|
+
# df = Polars::DataFrame.new(
|
4330
|
+
# {
|
4331
|
+
# "a" => ["foo", "bar", 2],
|
4332
|
+
# "b" => [1, 2, 3],
|
4333
|
+
# "c" => [1.0, 2.0, 3.0]
|
4334
|
+
# }
|
4335
|
+
# )
|
4336
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4337
|
+
# # =>
|
4338
|
+
# # shape: (3,)
|
4339
|
+
# # Series: 'a' [str]
|
4340
|
+
# # [
|
4341
|
+
# # "foo11.0"
|
4342
|
+
# # "bar22.0"
|
4343
|
+
# # null
|
4344
|
+
# # ]
|
4345
|
+
#
|
4346
|
+
# @example A horizontal boolean or, similar to a row-wise .any():
|
4347
|
+
# df = Polars::DataFrame.new(
|
4348
|
+
# {
|
4349
|
+
# "a" => [false, false, true],
|
4350
|
+
# "b" => [false, true, false]
|
4351
|
+
# }
|
4352
|
+
# )
|
4353
|
+
# df.fold { |s1, s2| s1 | s2 }
|
4354
|
+
# # =>
|
4355
|
+
# # shape: (3,)
|
4356
|
+
# # Series: 'a' [bool]
|
4357
|
+
# # [
|
4358
|
+
# # false
|
4359
|
+
# # true
|
4360
|
+
# # true
|
4361
|
+
# # ]
|
4362
|
+
def fold(&operation)
|
4363
|
+
acc = to_series(0)
|
4364
|
+
|
4365
|
+
1.upto(width - 1) do |i|
|
4366
|
+
acc = operation.call(acc, to_series(i))
|
4367
|
+
end
|
4368
|
+
acc
|
4369
|
+
end
|
3066
4370
|
|
3067
4371
|
# Get a row as tuple, either by index or by predicate.
|
3068
4372
|
#
|
@@ -3171,8 +4475,45 @@ module Polars
|
|
3171
4475
|
select(Utils.col("*").take_every(n))
|
3172
4476
|
end
|
3173
4477
|
|
3174
|
-
#
|
3175
|
-
#
|
4478
|
+
# Hash and combine the rows in this DataFrame.
|
4479
|
+
#
|
4480
|
+
# The hash value is of type `:u64`.
|
4481
|
+
#
|
4482
|
+
# @param seed [Integer]
|
4483
|
+
# Random seed parameter. Defaults to 0.
|
4484
|
+
# @param seed_1 [Integer]
|
4485
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4486
|
+
# @param seed_2 [Integer]
|
4487
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4488
|
+
# @param seed_3 [Integer]
|
4489
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4490
|
+
#
|
4491
|
+
# @return [Series]
|
4492
|
+
#
|
4493
|
+
# @example
|
4494
|
+
# df = Polars::DataFrame.new(
|
4495
|
+
# {
|
4496
|
+
# "foo" => [1, nil, 3, 4],
|
4497
|
+
# "ham" => ["a", "b", nil, "d"]
|
4498
|
+
# }
|
4499
|
+
# )
|
4500
|
+
# df.hash_rows(seed: 42)
|
4501
|
+
# # =>
|
4502
|
+
# # shape: (4,)
|
4503
|
+
# # Series: '' [u64]
|
4504
|
+
# # [
|
4505
|
+
# # 4238614331852490969
|
4506
|
+
# # 17976148875586754089
|
4507
|
+
# # 4702262519505526977
|
4508
|
+
# # 18144177983981041107
|
4509
|
+
# # ]
|
4510
|
+
def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
|
4511
|
+
k0 = seed
|
4512
|
+
k1 = seed_1.nil? ? seed : seed_1
|
4513
|
+
k2 = seed_2.nil? ? seed : seed_2
|
4514
|
+
k3 = seed_3.nil? ? seed : seed_3
|
4515
|
+
Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
|
4516
|
+
end
|
3176
4517
|
|
3177
4518
|
# Interpolate intermediate values. The interpolation method is linear.
|
3178
4519
|
#
|
@@ -3297,7 +4638,19 @@ module Polars
|
|
3297
4638
|
self._df = _df._clone
|
3298
4639
|
end
|
3299
4640
|
|
3300
|
-
def
|
4641
|
+
def _pos_idx(idx, dim)
|
4642
|
+
if idx >= 0
|
4643
|
+
idx
|
4644
|
+
else
|
4645
|
+
shape[dim] + idx
|
4646
|
+
end
|
4647
|
+
end
|
4648
|
+
|
4649
|
+
# def _pos_idxs
|
4650
|
+
# end
|
4651
|
+
|
4652
|
+
# @private
|
4653
|
+
def self.hash_to_rbdf(data, columns: nil)
|
3301
4654
|
if !columns.nil?
|
3302
4655
|
columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
|
3303
4656
|
|
@@ -3313,11 +4666,34 @@ module Polars
|
|
3313
4666
|
RbDataFrame.read_hash(data)
|
3314
4667
|
end
|
3315
4668
|
|
3316
|
-
|
3317
|
-
|
4669
|
+
# @private
|
4670
|
+
def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
|
4671
|
+
if columns.is_a?(Hash)
|
4672
|
+
columns = columns.to_a
|
4673
|
+
end
|
4674
|
+
column_names =
|
4675
|
+
(columns || []).map.with_index do |col, i|
|
4676
|
+
if col.is_a?(String)
|
4677
|
+
col || "column_#{i}"
|
4678
|
+
else
|
4679
|
+
col[0]
|
4680
|
+
end
|
4681
|
+
end
|
4682
|
+
if column_names.empty? && n_expected
|
4683
|
+
column_names = n_expected.times.map { |i| "column_#{i}" }
|
4684
|
+
end
|
4685
|
+
# TODO zip_longest
|
4686
|
+
lookup = column_names.zip(lookup_names || []).to_h
|
4687
|
+
|
4688
|
+
[
|
4689
|
+
column_names,
|
4690
|
+
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4691
|
+
[lookup[col[0]] || col[0], col[1]]
|
4692
|
+
end
|
4693
|
+
]
|
3318
4694
|
end
|
3319
4695
|
|
3320
|
-
def _handle_columns_arg(data, columns: nil)
|
4696
|
+
def self._handle_columns_arg(data, columns: nil)
|
3321
4697
|
if columns.nil?
|
3322
4698
|
data
|
3323
4699
|
else
|
@@ -3335,14 +4711,39 @@ module Polars
|
|
3335
4711
|
end
|
3336
4712
|
end
|
3337
4713
|
|
3338
|
-
|
3339
|
-
|
3340
|
-
|
4714
|
+
# @private
|
4715
|
+
def self.sequence_to_rbdf(data, columns: nil, orient: nil)
|
4716
|
+
if data.length == 0
|
4717
|
+
return hash_to_rbdf({}, columns: columns)
|
4718
|
+
end
|
4719
|
+
|
4720
|
+
if data[0].is_a?(Series)
|
4721
|
+
# series_names = data.map(&:name)
|
4722
|
+
# columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
|
4723
|
+
data_series = []
|
4724
|
+
data.each do |s|
|
4725
|
+
data_series << s._s
|
4726
|
+
end
|
4727
|
+
elsif data[0].is_a?(Array)
|
4728
|
+
if orient.nil? && !columns.nil?
|
4729
|
+
orient = columns.length == data.length ? "col" : "row"
|
4730
|
+
end
|
4731
|
+
|
4732
|
+
if orient == "row"
|
4733
|
+
raise Todo
|
4734
|
+
elsif orient == "col" || orient.nil?
|
4735
|
+
raise Todo
|
4736
|
+
else
|
4737
|
+
raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
|
4738
|
+
end
|
3341
4739
|
end
|
3342
|
-
|
4740
|
+
|
4741
|
+
data_series = _handle_columns_arg(data_series, columns: columns)
|
4742
|
+
RbDataFrame.new(data_series)
|
3343
4743
|
end
|
3344
4744
|
|
3345
|
-
|
4745
|
+
# @private
|
4746
|
+
def self.series_to_rbdf(data, columns: nil)
|
3346
4747
|
if columns
|
3347
4748
|
raise Todo
|
3348
4749
|
end
|