polars-df 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,14 +26,14 @@ module Polars
26
26
  end
27
27
 
28
28
  if data.nil?
29
- self._df = hash_to_rbdf({}, columns: columns)
29
+ self._df = self.class.hash_to_rbdf({}, columns: columns)
30
30
  elsif data.is_a?(Hash)
31
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
32
- self._df = hash_to_rbdf(data, columns: columns)
32
+ self._df = self.class.hash_to_rbdf(data, columns: columns)
33
33
  elsif data.is_a?(Array)
34
- self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
34
+ self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
35
35
  elsif data.is_a?(Series)
36
- self._df = series_to_rbdf(data, columns: columns)
36
+ self._df = self.class.series_to_rbdf(data, columns: columns)
37
37
  else
38
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
39
39
  end
@@ -46,11 +46,16 @@ module Polars
46
46
  df
47
47
  end
48
48
 
49
- # def self._from_hashes
50
- # end
49
+ # @private
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
52
+ _from_rbdf(rbdf)
53
+ end
51
54
 
52
- # def self._from_hash
53
- # end
55
+ # @private
56
+ def self._from_hash(data, columns: nil)
57
+ _from_rbdf(hash_to_rbdf(data, columns: columns))
58
+ end
54
59
 
55
60
  # def self._from_records
56
61
  # end
@@ -186,8 +191,14 @@ module Polars
186
191
  )
187
192
  end
188
193
 
189
- # def self._read_avro
190
- # end
194
+ # @private
195
+ def self._read_avro(file, columns: nil, n_rows: nil)
196
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
197
+ file = Utils.format_path(file)
198
+ end
199
+ projection, columns = Utils.handle_projection_columns(columns)
200
+ _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
201
+ end
191
202
 
192
203
  # @private
193
204
  def self._read_ipc(
@@ -486,12 +497,6 @@ module Polars
486
497
  # def each
487
498
  # end
488
499
 
489
- # def _pos_idx
490
- # end
491
-
492
- # def _pos_idxs
493
- # end
494
-
495
500
  # Returns subset of the DataFrame.
496
501
  #
497
502
  # @return [Object]
@@ -554,19 +559,33 @@ module Polars
554
559
 
555
560
  # df[idx]
556
561
  if item.is_a?(Integer)
557
- return slice(_pos_idx(item, dim: 0), 1)
562
+ return slice(_pos_idx(item, 0), 1)
558
563
  end
559
564
 
560
565
  # df[..]
561
566
  if item.is_a?(Range)
562
567
  return Slice.new(self).apply(item)
563
568
  end
569
+
570
+ if Utils.is_str_sequence(item, allow_str: false)
571
+ # select multiple columns
572
+ # df[["foo", "bar"]]
573
+ return _from_rbdf(_df.select(item))
574
+ end
564
575
  end
565
576
 
566
577
  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
567
578
  end
568
579
 
580
+ # Set item.
581
+ #
582
+ # @return [Object]
569
583
  # def []=(key, value)
584
+ # if key.is_a?(String)
585
+ # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
586
+ # end
587
+
588
+ # raise Todo
570
589
  # end
571
590
 
572
591
  # no to_arrow
@@ -582,8 +601,24 @@ module Polars
582
601
  end
583
602
  end
584
603
 
585
- # def to_hashes / to_a
586
- # end
604
+ # Convert every row to a hash.
605
+ #
606
+ # Note that this is slow.
607
+ #
608
+ # @return [Array]
609
+ #
610
+ # @example
611
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
612
+ # df.to_hashes
613
+ # # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
614
+ def to_hashes
615
+ rbdf = _df
616
+ names = columns
617
+
618
+ height.times.map do |i|
619
+ names.zip(rbdf.row_tuple(i)).to_h
620
+ end
621
+ end
587
622
 
588
623
  # def to_numo
589
624
  # end
@@ -762,8 +797,24 @@ module Polars
762
797
  nil
763
798
  end
764
799
 
765
- # def write_avro
766
- # end
800
+ # Write to Apache Avro file.
801
+ #
802
+ # @param file [String]
803
+ # File path to which the file should be written.
804
+ # @param compression ["uncompressed", "snappy", "deflate"]
805
+ # Compression method. Defaults to "uncompressed".
806
+ #
807
+ # @return [nil]
808
+ def write_avro(file, compression = "uncompressed")
809
+ if compression.nil?
810
+ compression = "uncompressed"
811
+ end
812
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
813
+ file = Utils.format_path(file)
814
+ end
815
+
816
+ _df.write_avro(file, compression)
817
+ end
767
818
 
768
819
  # Write to Arrow IPC binary stream or Feather file.
769
820
  #
@@ -866,8 +917,84 @@ module Polars
866
917
  Utils.scale_bytes(sz, to: unit)
867
918
  end
868
919
 
869
- # def transpose
870
- # end
920
+ # Transpose a DataFrame over the diagonal.
921
+ #
922
+ # @param include_header [Boolean]
923
+ # If set, the column names will be added as first column.
924
+ # @param header_name [String]
925
+ # If `include_header` is set, this determines the name of the column that will
926
+ # be inserted.
927
+ # @param column_names [Array]
928
+ # Optional generator/iterator that yields column names. Will be used to
929
+ # replace the columns in the DataFrame.
930
+ #
931
+ # @return [DataFrame]
932
+ #
933
+ # @note
934
+ # This is a very expensive operation. Perhaps you can do it differently.
935
+ #
936
+ # @example
937
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
938
+ # df.transpose(include_header: true)
939
+ # # =>
940
+ # # shape: (2, 4)
941
+ # # ┌────────┬──────────┬──────────┬──────────┐
942
+ # # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
943
+ # # │ --- ┆ --- ┆ --- ┆ --- │
944
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
945
+ # # ╞════════╪══════════╪══════════╪══════════╡
946
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
947
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
948
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
949
+ # # └────────┴──────────┴──────────┴──────────┘
950
+ #
951
+ # @example Replace the auto-generated column names with a list
952
+ # df.transpose(include_header: false, column_names: ["a", "b", "c"])
953
+ # # =>
954
+ # # shape: (2, 3)
955
+ # # ┌─────┬─────┬─────┐
956
+ # # │ a ┆ b ┆ c │
957
+ # # │ --- ┆ --- ┆ --- │
958
+ # # │ i64 ┆ i64 ┆ i64 │
959
+ # # ╞═════╪═════╪═════╡
960
+ # # │ 1 ┆ 2 ┆ 3 │
961
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
962
+ # # │ 1 ┆ 2 ┆ 3 │
963
+ # # └─────┴─────┴─────┘
964
+ #
965
+ # @example Include the header as a separate column
966
+ # df.transpose(
967
+ # include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
968
+ # )
969
+ # # =>
970
+ # # shape: (2, 4)
971
+ # # ┌─────┬─────┬─────┬─────┐
972
+ # # │ foo ┆ a ┆ b ┆ c │
973
+ # # │ --- ┆ --- ┆ --- ┆ --- │
974
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
975
+ # # ╞═════╪═════╪═════╪═════╡
976
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
977
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
978
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
979
+ # # └─────┴─────┴─────┴─────┘
980
+ def transpose(include_header: false, header_name: "column", column_names: nil)
981
+ df = _from_rbdf(_df.transpose(include_header, header_name))
982
+ if !column_names.nil?
983
+ names = []
984
+ n = df.width
985
+ if include_header
986
+ names << header_name
987
+ n -= 1
988
+ end
989
+
990
+ column_names = column_names.each
991
+ n.times do
992
+ names << column_names.next
993
+ end
994
+ df.columns = names
995
+ end
996
+ df
997
+ end
871
998
 
872
999
  # Reverse the DataFrame.
873
1000
  #
@@ -1462,8 +1589,48 @@ module Polars
1462
1589
  _from_rbdf(_df.drop_nulls(subset))
1463
1590
  end
1464
1591
 
1465
- # def pipe
1466
- # end
1592
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
1593
+ #
1594
+ # @param func [Object]
1595
+ # Callable; will receive the frame as the first parameter,
1596
+ # followed by any given args/kwargs.
1597
+ # @param args [Object]
1598
+ # Arguments to pass to the UDF.
1599
+ # @param kwargs [Object]
1600
+ # Keyword arguments to pass to the UDF.
1601
+ #
1602
+ # @return [Object]
1603
+ #
1604
+ # @note
1605
+ # It is recommended to use LazyFrame when piping operations, in order
1606
+ # to fully take advantage of query optimization and parallelization.
1607
+ # See {#lazy}.
1608
+ #
1609
+ # @example
1610
+ # cast_str_to_int = lambda do |data, col_name:|
1611
+ # data.with_column(Polars.col(col_name).cast(:i64))
1612
+ # end
1613
+ #
1614
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
1615
+ # df.pipe(cast_str_to_int, col_name: "b")
1616
+ # # =>
1617
+ # # shape: (4, 2)
1618
+ # # ┌─────┬─────┐
1619
+ # # │ a ┆ b │
1620
+ # # │ --- ┆ --- │
1621
+ # # │ i64 ┆ i64 │
1622
+ # # ╞═════╪═════╡
1623
+ # # │ 1 ┆ 10 │
1624
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1625
+ # # │ 2 ┆ 20 │
1626
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1627
+ # # │ 3 ┆ 30 │
1628
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1629
+ # # │ 4 ┆ 40 │
1630
+ # # └─────┴─────┘
1631
+ def pipe(func, *args, **kwargs, &block)
1632
+ func.call(self, *args, **kwargs, &block)
1633
+ end
1467
1634
 
1468
1635
  # Add a column at index 0 that counts the rows.
1469
1636
  #
@@ -1547,17 +1714,612 @@ module Polars
1547
1714
  )
1548
1715
  end
1549
1716
 
1550
- # def groupby_rolling
1551
- # end
1717
+ # Create rolling groups based on a time column.
1718
+ #
1719
+ # Also works for index values of type `:i32` or `:i64`.
1720
+ #
1721
+ # Different from a `groupby_dynamic` the windows are now determined by the
1722
+ # individual values and are not of constant intervals. For constant intervals use
1723
+ # *groupby_dynamic*
1724
+ #
1725
+ # The `period` and `offset` arguments are created either from a timedelta, or
1726
+ # by using the following string language:
1727
+ #
1728
+ # - 1ns (1 nanosecond)
1729
+ # - 1us (1 microsecond)
1730
+ # - 1ms (1 millisecond)
1731
+ # - 1s (1 second)
1732
+ # - 1m (1 minute)
1733
+ # - 1h (1 hour)
1734
+ # - 1d (1 day)
1735
+ # - 1w (1 week)
1736
+ # - 1mo (1 calendar month)
1737
+ # - 1y (1 calendar year)
1738
+ # - 1i (1 index count)
1739
+ #
1740
+ # Or combine them:
1741
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1742
+ #
1743
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
1744
+ #
1745
+ # - **"1i" # length 1**
1746
+ # - **"10i" # length 10**
1747
+ #
1748
+ # @param index_column [Object]
1749
+ # Column used to group based on the time window.
1750
+ # Often of type Date/Datetime.
1751
+ # This column must be sorted in ascending order. If not the output will not
1752
+ # make sense.
1753
+ #
1754
+ # In case of a rolling groupby on indices, dtype needs to be one of
1755
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1756
+ # performance matters use an `:i64` column.
1757
+ # @param period [Object]
1758
+ # Length of the window.
1759
+ # @param offset [Object]
1760
+ # Offset of the window. Default is -period.
1761
+ # @param closed ["right", "left", "both", "none"]
1762
+ # Define whether the temporal window interval is closed or not.
1763
+ # @param by [Object]
1764
+ # Also group by this column/these columns.
1765
+ #
1766
+ # @return [RollingGroupBy]
1767
+ #
1768
+ # @example
1769
+ # dates = [
1770
+ # "2020-01-01 13:45:48",
1771
+ # "2020-01-01 16:42:13",
1772
+ # "2020-01-01 16:45:09",
1773
+ # "2020-01-02 18:12:48",
1774
+ # "2020-01-03 19:45:32",
1775
+ # "2020-01-08 23:16:43"
1776
+ # ]
1777
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1778
+ # Polars.col("dt").str.strptime(:datetime)
1779
+ # )
1780
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1781
+ # [
1782
+ # Polars.sum("a").alias("sum_a"),
1783
+ # Polars.min("a").alias("min_a"),
1784
+ # Polars.max("a").alias("max_a")
1785
+ # ]
1786
+ # )
1787
+ # # =>
1788
+ # # shape: (6, 4)
1789
+ # # ┌─────────────────────┬───────┬───────┬───────┐
1790
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
1791
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1792
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
1793
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
1794
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
1795
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1796
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
1797
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1798
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
1799
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1800
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
1801
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1802
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1803
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1804
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1805
+ # # └─────────────────────┴───────┴───────┴───────┘
1806
+ def groupby_rolling(
1807
+ index_column:,
1808
+ period:,
1809
+ offset: nil,
1810
+ closed: "right",
1811
+ by: nil
1812
+ )
1813
+ RollingGroupBy.new(self, index_column, period, offset, closed, by)
1814
+ end
1552
1815
 
1553
- # def groupby_dynamic
1554
- # end
1816
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
1817
+ #
1818
+ # Time windows are calculated and rows are assigned to windows. Different from a
1819
+ # normal groupby is that a row can be member of multiple groups. The time/index
1820
+ # window could be seen as a rolling window, with a window size determined by
1821
+ # dates/times/values instead of slots in the DataFrame.
1822
+ #
1823
+ # A window is defined by:
1824
+ #
1825
+ # - every: interval of the window
1826
+ # - period: length of the window
1827
+ # - offset: offset of the window
1828
+ #
1829
+ # The `every`, `period` and `offset` arguments are created with
1830
+ # the following string language:
1831
+ #
1832
+ # - 1ns (1 nanosecond)
1833
+ # - 1us (1 microsecond)
1834
+ # - 1ms (1 millisecond)
1835
+ # - 1s (1 second)
1836
+ # - 1m (1 minute)
1837
+ # - 1h (1 hour)
1838
+ # - 1d (1 day)
1839
+ # - 1w (1 week)
1840
+ # - 1mo (1 calendar month)
1841
+ # - 1y (1 calendar year)
1842
+ # - 1i (1 index count)
1843
+ #
1844
+ # Or combine them:
1845
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1846
+ #
1847
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
1848
+ #
1849
+ # - "1i" # length 1
1850
+ # - "10i" # length 10
1851
+ #
1852
+ # @param index_column
1853
+ # Column used to group based on the time window.
1854
+ # Often of type Date/Datetime.
1855
+ # This column must be sorted in ascending order. If not the output will not
1856
+ # make sense.
1857
+ #
1858
+ # In case of a dynamic groupby on indices, dtype needs to be one of
1859
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1860
+ # performance matters use an `:i64` column.
1861
+ # @param every
1862
+ # Interval of the window.
1863
+ # @param period
1864
+ # Length of the window; if nil, it is equal to `every`.
1865
+ # @param offset
1866
+ # Offset of the window; if nil and period is nil, it will be equal to negative
1867
+ # `every`.
1868
+ # @param truncate
1869
+ # Truncate the time value to the window lower bound.
1870
+ # @param include_boundaries
1871
+ # Add the lower and upper bound of the window to the "_lower_boundary" and
1872
+ # "_upper_boundary" columns. This will impact performance because it's harder to
1873
+ # parallelize
1874
+ # @param closed ["right", "left", "both", "none"]
1875
+ # Define whether the temporal window interval is closed or not.
1876
+ # @param by
1877
+ # Also group by this column/these columns
1878
+ #
1879
+ # @return [DynamicGroupBy]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "time" => Polars.date_range(
1885
+ # DateTime.new(2021, 12, 16),
1886
+ # DateTime.new(2021, 12, 16, 3),
1887
+ # "30m"
1888
+ # ),
1889
+ # "n" => 0..6
1890
+ # }
1891
+ # )
1892
+ # # =>
1893
+ # # shape: (7, 2)
1894
+ # # ┌─────────────────────┬─────┐
1895
+ # # │ time ┆ n │
1896
+ # # │ --- ┆ --- │
1897
+ # # │ datetime[μs] ┆ i64 │
1898
+ # # ╞═════════════════════╪═════╡
1899
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
1900
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1902
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1903
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1904
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1905
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1906
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1907
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1908
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1909
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1910
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1911
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1912
+ # # └─────────────────────┴─────┘
1913
+ #
1914
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1915
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1916
+ # [
1917
+ # Polars.col("time").min.alias("time_min"),
1918
+ # Polars.col("time").max.alias("time_max")
1919
+ # ]
1920
+ # )
1921
+ # # =>
1922
+ # # shape: (4, 3)
1923
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1924
+ # # │ time ┆ time_min ┆ time_max │
1925
+ # # │ --- ┆ --- ┆ --- │
1926
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1927
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1928
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1929
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1930
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1931
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1932
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1933
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1934
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1935
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1936
+ #
1937
+ # @example The window boundaries can also be added to the aggregation result.
1938
+ # df.groupby_dynamic(
1939
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1940
+ # ).agg([Polars.col("time").count.alias("time_count")])
1941
+ # # =>
1942
+ # # shape: (4, 4)
1943
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1944
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1945
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1946
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1947
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1948
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1949
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1950
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1951
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1952
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1953
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1954
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1955
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1956
+ #
1957
+ # @example When closed="left", should not include right end of interval.
1958
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1959
+ # [
1960
+ # Polars.col("time").count.alias("time_count"),
1961
+ # Polars.col("time").list.alias("time_agg_list")
1962
+ # ]
1963
+ # )
1964
+ # # =>
1965
+ # # shape: (4, 3)
1966
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1967
+ # # │ time ┆ time_count ┆ time_agg_list │
1968
+ # # │ --- ┆ --- ┆ --- │
1969
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1970
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1971
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1972
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1973
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1974
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1975
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1976
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1977
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1978
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1979
+ #
1980
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1981
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1982
+ # [Polars.col("time").count.alias("time_count")]
1983
+ # )
1984
+ # # =>
1985
+ # # shape: (5, 2)
1986
+ # # ┌─────────────────────┬────────────┐
1987
+ # # │ time ┆ time_count │
1988
+ # # │ --- ┆ --- │
1989
+ # # │ datetime[μs] ┆ u32 │
1990
+ # # ╞═════════════════════╪════════════╡
1991
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1992
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1993
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1994
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1995
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1996
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1997
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1998
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1999
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
2000
+ # # └─────────────────────┴────────────┘
2001
+ #
2002
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
2003
+ # df = Polars::DataFrame.new(
2004
+ # {
2005
+ # "time" => Polars.date_range(
2006
+ # DateTime.new(2021, 12, 16),
2007
+ # DateTime.new(2021, 12, 16, 3),
2008
+ # "30m"
2009
+ # ),
2010
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
2011
+ # }
2012
+ # )
2013
+ # df.groupby_dynamic(
2014
+ # "time",
2015
+ # every: "1h",
2016
+ # closed: "both",
2017
+ # by: "groups",
2018
+ # include_boundaries: true
2019
+ # ).agg([Polars.col("time").count.alias("time_count")])
2020
+ # # =>
2021
+ # # shape: (7, 5)
2022
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
2023
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
2024
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2025
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
2026
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
2027
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
2028
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2029
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
2030
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2031
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
2032
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2033
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
2034
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2035
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
2036
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2037
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
2038
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2039
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
2040
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
2041
+ #
2042
+ # @example Dynamic groupby on an index column.
2043
+ # df = Polars::DataFrame.new(
2044
+ # {
2045
+ # "idx" => Polars.arange(0, 6, eager: true),
2046
+ # "A" => ["A", "A", "B", "B", "B", "C"]
2047
+ # }
2048
+ # )
2049
+ # df.groupby_dynamic(
2050
+ # "idx",
2051
+ # every: "2i",
2052
+ # period: "3i",
2053
+ # include_boundaries: true,
2054
+ # closed: "right"
2055
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
2056
+ # # =>
2057
+ # # shape: (3, 4)
2058
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2059
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2060
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2061
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2062
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2063
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2065
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2067
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
2068
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
2069
+ def groupby_dynamic(
2070
+ index_column,
2071
+ every:,
2072
+ period: nil,
2073
+ offset: nil,
2074
+ truncate: true,
2075
+ include_boundaries: false,
2076
+ closed: "left",
2077
+ by: nil
2078
+ )
2079
+ DynamicGroupBy.new(
2080
+ self,
2081
+ index_column,
2082
+ every,
2083
+ period,
2084
+ offset,
2085
+ truncate,
2086
+ include_boundaries,
2087
+ closed,
2088
+ by
2089
+ )
2090
+ end
1555
2091
 
1556
- # def upsample
1557
- # end
2092
+ # Upsample a DataFrame at a regular frequency.
2093
+ #
2094
+ # @param time_column [Object]
2095
+ # time column will be used to determine a date_range.
2096
+ # Note that this column has to be sorted for the output to make sense.
2097
+ # @param every [String]
2098
+ # interval will start 'every' duration
2099
+ # @param offset [String]
2100
+ # change the start of the date_range by this offset.
2101
+ # @param by [Object]
2102
+ # First group by these columns and then upsample for every group
2103
+ # @param maintain_order [Boolean]
2104
+ # Keep the ordering predictable. This is slower.
2105
+ #
2106
+ # The `every` and `offset` arguments are created with
2107
+ # the following string language:
2108
+ #
2109
+ # - 1ns (1 nanosecond)
2110
+ # - 1us (1 microsecond)
2111
+ # - 1ms (1 millisecond)
2112
+ # - 1s (1 second)
2113
+ # - 1m (1 minute)
2114
+ # - 1h (1 hour)
2115
+ # - 1d (1 day)
2116
+ # - 1w (1 week)
2117
+ # - 1mo (1 calendar month)
2118
+ # - 1y (1 calendar year)
2119
+ # - 1i (1 index count)
2120
+ #
2121
+ # Or combine them:
2122
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2123
+ #
2124
+ # @return [DataFrame]
2125
+ #
2126
+ # @example Upsample a DataFrame by a certain interval.
2127
+ # df = Polars::DataFrame.new(
2128
+ # {
2129
+ # "time" => [
2130
+ # DateTime.new(2021, 2, 1),
2131
+ # DateTime.new(2021, 4, 1),
2132
+ # DateTime.new(2021, 5, 1),
2133
+ # DateTime.new(2021, 6, 1)
2134
+ # ],
2135
+ # "groups" => ["A", "B", "A", "B"],
2136
+ # "values" => [0, 1, 2, 3]
2137
+ # }
2138
+ # )
2139
+ # df.upsample(
2140
+ # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2141
+ # ).select(Polars.all.forward_fill)
2142
+ # # =>
2143
+ # # shape: (7, 3)
2144
+ # # ┌─────────────────────┬────────┬────────┐
2145
+ # # │ time ┆ groups ┆ values │
2146
+ # # │ --- ┆ --- ┆ --- │
2147
+ # # │ datetime[ns] ┆ str ┆ i64 │
2148
+ # # ╞═════════════════════╪════════╪════════╡
2149
+ # # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
2150
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2151
+ # # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
2152
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2153
+ # # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
2154
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2155
+ # # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
2156
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2157
+ # # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
2158
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2159
+ # # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
2160
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2161
+ # # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
2162
+ # # └─────────────────────┴────────┴────────┘
2163
+ def upsample(
2164
+ time_column:,
2165
+ every:,
2166
+ offset: nil,
2167
+ by: nil,
2168
+ maintain_order: false
2169
+ )
2170
+ if by.nil?
2171
+ by = []
2172
+ end
2173
+ if by.is_a?(String)
2174
+ by = [by]
2175
+ end
2176
+ if offset.nil?
2177
+ offset = "0ns"
2178
+ end
1558
2179
 
1559
- # def join_asof
1560
- # end
2180
+ every = Utils._timedelta_to_pl_duration(every)
2181
+ offset = Utils._timedelta_to_pl_duration(offset)
2182
+
2183
+ _from_rbdf(
2184
+ _df.upsample(by, time_column, every, offset, maintain_order)
2185
+ )
2186
+ end
2187
+
2188
+ # Perform an asof join.
2189
+ #
2190
+ # This is similar to a left-join except that we match on nearest key rather than
2191
+ # equal keys.
2192
+ #
2193
+ # Both DataFrames must be sorted by the asof_join key.
2194
+ #
2195
+ # For each row in the left DataFrame:
2196
+ #
2197
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
2198
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
2199
+ #
2200
+ # The default is "backward".
2201
+ #
2202
+ # @param other [DataFrame]
2203
+ # DataFrame to join with.
2204
+ # @param left_on [String]
2205
+ # Join column of the left DataFrame.
2206
+ # @param right_on [String]
2207
+ # Join column of the right DataFrame.
2208
+ # @param on [String]
2209
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2210
+ # nil.
2211
+ # @param by [Object]
2212
+ # join on these columns before doing asof join
2213
+ # @param by_left [Object]
2214
+ # join on these columns before doing asof join
2215
+ # @param by_right [Object]
2216
+ # join on these columns before doing asof join
2217
+ # @param strategy ["backward", "forward"]
2218
+ # Join strategy.
2219
+ # @param suffix [String]
2220
+ # Suffix to append to columns with a duplicate name.
2221
+ # @param tolerance [Object]
2222
+ # Numeric tolerance. By setting this the join will only be done if the near
2223
+ # keys are within this distance. If an asof join is done on columns of dtype
2224
+ # "Date", "Datetime", "Duration" or "Time", use the following string
2225
+ # language:
2226
+ #
2227
+ # - 1ns (1 nanosecond)
2228
+ # - 1us (1 microsecond)
2229
+ # - 1ms (1 millisecond)
2230
+ # - 1s (1 second)
2231
+ # - 1m (1 minute)
2232
+ # - 1h (1 hour)
2233
+ # - 1d (1 day)
2234
+ # - 1w (1 week)
2235
+ # - 1mo (1 calendar month)
2236
+ # - 1y (1 calendar year)
2237
+ # - 1i (1 index count)
2238
+ #
2239
+ # Or combine them:
2240
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2241
+ #
2242
+ # @param allow_parallel [Boolean]
2243
+ # Allow the physical plan to optionally evaluate the computation of both
2244
+ # DataFrames up to the join in parallel.
2245
+ # @param force_parallel [Boolean]
2246
+ # Force the physical plan to evaluate the computation of both DataFrames up to
2247
+ # the join in parallel.
2248
+ #
2249
+ # @return [DataFrame]
2250
+ #
2251
+ # @example
2252
+ # gdp = Polars::DataFrame.new(
2253
+ # {
2254
+ # "date" => [
2255
+ # DateTime.new(2016, 1, 1),
2256
+ # DateTime.new(2017, 1, 1),
2257
+ # DateTime.new(2018, 1, 1),
2258
+ # DateTime.new(2019, 1, 1),
2259
+ # ], # note record date: Jan 1st (sorted!)
2260
+ # "gdp" => [4164, 4411, 4566, 4696]
2261
+ # }
2262
+ # )
2263
+ # population = Polars::DataFrame.new(
2264
+ # {
2265
+ # "date" => [
2266
+ # DateTime.new(2016, 5, 12),
2267
+ # DateTime.new(2017, 5, 12),
2268
+ # DateTime.new(2018, 5, 12),
2269
+ # DateTime.new(2019, 5, 12),
2270
+ # ], # note record date: May 12th (sorted!)
2271
+ # "population" => [82.19, 82.66, 83.12, 83.52]
2272
+ # }
2273
+ # )
2274
+ # population.join_asof(
2275
+ # gdp, left_on: "date", right_on: "date", strategy: "backward"
2276
+ # )
2277
+ # # =>
2278
+ # # shape: (4, 3)
2279
+ # # ┌─────────────────────┬────────────┬──────┐
2280
+ # # │ date ┆ population ┆ gdp │
2281
+ # # │ --- ┆ --- ┆ --- │
2282
+ # # │ datetime[ns] ┆ f64 ┆ i64 │
2283
+ # # ╞═════════════════════╪════════════╪══════╡
2284
+ # # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
2285
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2286
+ # # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
2287
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2288
+ # # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
2289
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2290
+ # # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
2291
+ # # └─────────────────────┴────────────┴──────┘
2292
+ def join_asof(
2293
+ other,
2294
+ left_on: nil,
2295
+ right_on: nil,
2296
+ on: nil,
2297
+ by_left: nil,
2298
+ by_right: nil,
2299
+ by: nil,
2300
+ strategy: "backward",
2301
+ suffix: "_right",
2302
+ tolerance: nil,
2303
+ allow_parallel: true,
2304
+ force_parallel: false
2305
+ )
2306
+ lazy
2307
+ .join_asof(
2308
+ other.lazy,
2309
+ left_on: left_on,
2310
+ right_on: right_on,
2311
+ on: on,
2312
+ by_left: by_left,
2313
+ by_right: by_right,
2314
+ by: by,
2315
+ strategy: strategy,
2316
+ suffix: suffix,
2317
+ tolerance: tolerance,
2318
+ allow_parallel: allow_parallel,
2319
+ force_parallel: force_parallel
2320
+ )
2321
+ .collect(no_optimization: true)
2322
+ end
1561
2323
 
1562
2324
  # Join in SQL-like fashion.
1563
2325
  #
@@ -1675,8 +2437,78 @@ module Polars
1675
2437
  .collect(no_optimization: true)
1676
2438
  end
1677
2439
 
1678
- # def apply
1679
- # end
2440
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
2441
+ #
2442
+ # The UDF will receive each row as a tuple of values: `udf(row)`.
2443
+ #
2444
+ # Implementing logic using a Ruby function is almost always _significantly_
2445
+ # slower and more memory intensive than implementing the same logic using
2446
+ # the native expression API because:
2447
+ #
2448
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
2449
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
2450
+ # - Polars-native expressions can be parallelised (UDFs cannot).
2451
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
2452
+ #
2453
+ # Wherever possible you should strongly prefer the native expression API
2454
+ # to achieve the best performance.
2455
+ #
2456
+ # @param return_dtype [Symbol]
2457
+ # Output type of the operation. If none given, Polars tries to infer the type.
2458
+ # @param inference_size [Integer]
2459
+ # Only used in the case when the custom function returns rows.
2460
+ # This uses the first `inference_size` rows to determine the output schema.
2461
+ #
2462
+ # @return [Object]
2463
+ #
2464
+ # @note
2465
+ # The frame-level `apply` cannot track column names (as the UDF is a black-box
2466
+ # that may arbitrarily drop, rearrange, transform, or add new columns); if you
2467
+ # want to apply a UDF such that column names are preserved, you should use the
2468
+ # expression-level `apply` syntax instead.
2469
+ #
2470
+ # @example
2471
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2472
+ #
2473
+ # @example Return a DataFrame by mapping each row to a tuple:
2474
+ # df.apply { |t| [t[0] * 2, t[1] * 3] }
2475
+ # # =>
2476
+ # # shape: (3, 2)
2477
+ # # ┌──────────┬──────────┐
2478
+ # # │ column_0 ┆ column_1 │
2479
+ # # │ ---      ┆ ---      │
2480
+ # # │ i64      ┆ i64      │
2481
+ # # ╞══════════╪══════════╡
2482
+ # # │ 2        ┆ -3       │
2483
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2484
+ # # │ 4        ┆ 15       │
2485
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2486
+ # # │ 6        ┆ 24       │
2487
+ # # └──────────┴──────────┘
2488
+ #
2489
+ # @example Return a Series by mapping each row to a scalar:
2490
+ # df.apply { |t| t[0] * 2 + t[1] }
2491
+ # # =>
2492
+ # # shape: (3, 1)
2493
+ # # ┌───────┐
2494
+ # # │ apply │
2495
+ # # │ ---   │
2496
+ # # │ i64   │
2497
+ # # ╞═══════╡
2498
+ # # │ 1     │
2499
+ # # ├╌╌╌╌╌╌╌┤
2500
+ # # │ 9     │
2501
+ # # ├╌╌╌╌╌╌╌┤
2502
+ # # │ 14    │
2503
+ # # └───────┘
2504
+ def apply(return_dtype: nil, inference_size: 256, &f)
2505
+ out, is_df = _df.apply(f, return_dtype, inference_size)
2506
+ if is_df
2507
+ _from_rbdf(out)
2508
+ else
2509
+ _from_rbdf(Utils.wrap_s(out).to_frame._df)
2510
+ end
2511
+ end
1680
2512
 
1681
2513
  # Return a new DataFrame with the column added or replaced.
1682
2514
  #
@@ -2178,17 +3010,404 @@ module Polars
2178
3010
  lazy.explode(columns).collect(no_optimization: true)
2179
3011
  end
2180
3012
 
2181
- # def pivot
2182
- # end
3013
+ # Create a spreadsheet-style pivot table as a DataFrame.
3014
+ #
3015
+ # @param values [Object]
3016
+ # Column values to aggregate. Can be multiple columns if the *columns*
3017
+ # argument contains multiple columns as well.
3018
+ # @param index [Object]
3019
+ # One or multiple keys to group by
3020
+ # @param columns [Object]
3021
+ # Columns whose values will be used as the header of the output DataFrame
3022
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
3023
+ # The name of a predefined aggregate function (as a String) or an expression.
3024
+ # @param maintain_order [Object]
3025
+ # Sort the grouped keys so that the output order is predictable.
3026
+ # @param sort_columns [Object]
3027
+ # Sort the transposed columns by name. Default is by order of discovery.
3028
+ #
3029
+ # @return [DataFrame]
3030
+ #
3031
+ # @example
3032
+ # df = Polars::DataFrame.new(
3033
+ # {
3034
+ # "foo" => ["one", "one", "one", "two", "two", "two"],
3035
+ # "bar" => ["A", "B", "C", "A", "B", "C"],
3036
+ # "baz" => [1, 2, 3, 4, 5, 6]
3037
+ # }
3038
+ # )
3039
+ # df.pivot(values: "baz", index: "foo", columns: "bar")
3040
+ # # =>
3041
+ # # shape: (2, 4)
3042
+ # # ┌─────┬─────┬─────┬─────┐
3043
+ # # │ foo ┆ A   ┆ B   ┆ C   │
3044
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3045
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
3046
+ # # ╞═════╪═════╪═════╪═════╡
3047
+ # # │ one ┆ 1   ┆ 2   ┆ 3   │
3048
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3049
+ # # │ two ┆ 4   ┆ 5   ┆ 6   │
3050
+ # # └─────┴─────┴─────┴─────┘
3051
+ def pivot(
3052
+ values:,
3053
+ index:,
3054
+ columns:,
3055
+ aggregate_fn: "first",
3056
+ maintain_order: true,
3057
+ sort_columns: false
3058
+ )
3059
+ if values.is_a?(String)
3060
+ values = [values]
3061
+ end
3062
+ if index.is_a?(String)
3063
+ index = [index]
3064
+ end
3065
+ if columns.is_a?(String)
3066
+ columns = [columns]
3067
+ end
2183
3068
 
2184
- # def melt
2185
- # end
3069
+ if aggregate_fn.is_a?(String)
3070
+ case aggregate_fn
3071
+ when "first"
3072
+ aggregate_fn = Polars.element.first
3073
+ when "sum"
3074
+ aggregate_fn = Polars.element.sum
3075
+ when "max"
3076
+ aggregate_fn = Polars.element.max
3077
+ when "min"
3078
+ aggregate_fn = Polars.element.min
3079
+ when "mean"
3080
+ aggregate_fn = Polars.element.mean
3081
+ when "median"
3082
+ aggregate_fn = Polars.element.median
3083
+ when "last"
3084
+ aggregate_fn = Polars.element.last
3085
+ when "count"
3086
+ aggregate_fn = Polars.count
3087
+ else
3088
+ raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3089
+ end
3090
+ end
2186
3091
 
2187
- # def unstack
2188
- # end
3092
+ _from_rbdf(
3093
+ _df.pivot_expr(
3094
+ values,
3095
+ index,
3096
+ columns,
3097
+ aggregate_fn._rbexpr,
3098
+ maintain_order,
3099
+ sort_columns
3100
+ )
3101
+ )
3102
+ end
2189
3103
 
2190
- # def partition_by
2191
- # end
3104
+ # Unpivot a DataFrame from wide to long format.
3105
+ #
3106
+ # Optionally leaves identifiers set.
3107
+ #
3108
+ # This function is useful to massage a DataFrame into a format where one or more
3109
+ # columns are identifier variables (id_vars), while all other columns, considered
3110
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
3111
+ # two non-identifier columns, 'variable' and 'value'.
3112
+ #
3113
+ # @param id_vars [Object]
3114
+ # Columns to use as identifier variables.
3115
+ # @param value_vars [Object]
3116
+ # Columns to use as measured (value) variables, i.e. the columns to unpivot.
3117
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
3118
+ # @param variable_name [String]
3119
+ # Name to give to the `variable` column. Defaults to "variable".
3120
+ # @param value_name [String]
3121
+ # Name to give to the `value` column. Defaults to "value"
3122
+ #
3123
+ # @return [DataFrame]
3124
+ #
3125
+ # @example
3126
+ # df = Polars::DataFrame.new(
3127
+ # {
3128
+ # "a" => ["x", "y", "z"],
3129
+ # "b" => [1, 3, 5],
3130
+ # "c" => [2, 4, 6]
3131
+ # }
3132
+ # )
3133
+ # df.melt(id_vars: "a", value_vars: ["b", "c"])
3134
+ # # =>
3135
+ # # shape: (6, 3)
3136
+ # # ┌─────┬──────────┬───────┐
3137
+ # # │ a   ┆ variable ┆ value │
3138
+ # # │ --- ┆ ---      ┆ ---   │
3139
+ # # │ str ┆ str      ┆ i64   │
3140
+ # # ╞═════╪══════════╪═══════╡
3141
+ # # │ x   ┆ b        ┆ 1     │
3142
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3143
+ # # │ y   ┆ b        ┆ 3     │
3144
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3145
+ # # │ z   ┆ b        ┆ 5     │
3146
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3147
+ # # │ x   ┆ c        ┆ 2     │
3148
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3149
+ # # │ y   ┆ c        ┆ 4     │
3150
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3151
+ # # │ z   ┆ c        ┆ 6     │
3152
+ # # └─────┴──────────┴───────┘
3153
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
3154
+ if value_vars.is_a?(String)
3155
+ value_vars = [value_vars]
3156
+ end
3157
+ if id_vars.is_a?(String)
3158
+ id_vars = [id_vars]
3159
+ end
3160
+ if value_vars.nil?
3161
+ value_vars = []
3162
+ end
3163
+ if id_vars.nil?
3164
+ id_vars = []
3165
+ end
3166
+ _from_rbdf(
3167
+ _df.melt(id_vars, value_vars, value_name, variable_name)
3168
+ )
3169
+ end
3170
+
3171
+ # Unstack a long table to a wide form without doing an aggregation.
3172
+ #
3173
+ # This can be much faster than a pivot, because it can skip the grouping phase.
3174
+ #
3175
+ # @note
3176
+ # This functionality is experimental and may be subject to changes
3177
+ # without it being considered a breaking change.
3178
+ #
3179
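+ # @param step [Integer]
3180
+ # Number of rows in the unstacked frame when `how` is "vertical"; with "horizontal" it is the number of output columns produced per input column.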
3181
+ # @param how ["vertical", "horizontal"]
3182
+ # Direction of the unstack.
3183
+ # @param columns [Object]
3184
+ # Column(s) to include in the operation. If nil, all columns are used.
3185
+ # @param fill_values [Object]
3186
+ # Value used to pad the columns when the unstacked shape needs more rows than the frame has; an Array supplies one fill value per column.
3187
+ #
3188
+ # @return [DataFrame]
3189
+ #
3190
+ # @example
3191
+ # df = Polars::DataFrame.new(
3192
+ # {
3193
+ # "col1" => "A".."I",
3194
+ # "col2" => Polars.arange(0, 9, eager: true)
3195
+ # }
3196
+ # )
3197
+ # # =>
3198
+ # # shape: (9, 2)
3199
+ # # ┌──────┬──────┐
3200
+ # # │ col1 ┆ col2 │
3201
+ # # │ --- ┆ --- │
3202
+ # # │ str ┆ i64 │
3203
+ # # ╞══════╪══════╡
3204
+ # # │ A ┆ 0 │
3205
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3206
+ # # │ B ┆ 1 │
3207
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3208
+ # # │ C ┆ 2 │
3209
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3210
+ # # │ D ┆ 3 │
3211
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3212
+ # # │ ... ┆ ... │
3213
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3214
+ # # │ F ┆ 5 │
3215
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3216
+ # # │ G ┆ 6 │
3217
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3218
+ # # │ H ┆ 7 │
3219
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3220
+ # # │ I ┆ 8 │
3221
+ # # └──────┴──────┘
3222
+ #
3223
+ # @example
3224
+ # df.unstack(step: 3, how: "vertical")
3225
+ # # =>
3226
+ # # shape: (3, 6)
3227
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3228
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3229
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3230
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3231
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3232
+ # # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
3233
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3234
+ # # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
3235
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3236
+ # # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
3237
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3238
+ #
3239
+ # @example
3240
+ # df.unstack(step: 3, how: "horizontal")
3241
+ # # =>
3242
+ # # shape: (3, 6)
3243
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3244
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3245
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3246
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3247
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3248
+ # # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
3249
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3250
+ # # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
3251
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3252
+ # # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
3253
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3254
+ def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
3255
+ if !columns.nil?
3256
+ df = select(columns)
3257
+ else
3258
+ df = self
3259
+ end
3260
+
3261
+ height = df.height
3262
+ if how == "vertical"
3263
+ n_rows = step
3264
+ n_cols = (height / n_rows.to_f).ceil
3265
+ else
3266
+ n_cols = step
3267
+ n_rows = (height / n_cols.to_f).ceil
3268
+ end
3269
+
3270
+ n_fill = n_cols * n_rows - height
3271
+
3272
+ if n_fill > 0
3273
+ if !fill_values.is_a?(Array)
3274
+ fill_values = [fill_values] * df.width
3275
+ end
3276
+
3277
+ df = df.select(
3278
+ df.get_columns.zip(fill_values).map do |s, next_fill|
3279
+ s.extend_constant(next_fill, n_fill)
3280
+ end
3281
+ )
3282
+ end
3283
+
3284
+ if how == "horizontal"
3285
+ df = (
3286
+ df.with_column(
3287
+ (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
3288
+ "__sort_order"
3289
+ )
3290
+ )
3291
+ .sort("__sort_order")
3292
+ .drop("__sort_order")
3293
+ )
3294
+ end
3295
+
3296
+ zfill_val = Math.log10(n_cols).floor + 1
3297
+ slices =
3298
+ df.get_columns.flat_map do |s|
3299
+ n_cols.times.map do |slice_nbr|
3300
+ s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
3301
+ end
3302
+ end
3303
+
3304
+ _from_rbdf(DataFrame.new(slices)._df)
3305
+ end
3306
+
3307
+ # Split into multiple DataFrames partitioned by groups.
3308
+ #
3309
+ # @param groups [Object]
3310
+ # Groups to partition by.
3311
+ # @param maintain_order [Boolean]
3312
+ # Keep predictable output order. This is slower as it requires an extra sort
3313
+ # operation.
3314
+ # @param as_dict [Boolean]
3315
+ # If true, return the partitions in a Hash keyed by the distinct group
3316
+ # values instead of an Array.
3317
+ #
3318
+ # @return [Object]
3319
+ #
3320
+ # @example
3321
+ # df = Polars::DataFrame.new(
3322
+ # {
3323
+ # "foo" => ["A", "A", "B", "B", "C"],
3324
+ # "N" => [1, 2, 2, 4, 2],
3325
+ # "bar" => ["k", "l", "m", "m", "l"]
3326
+ # }
3327
+ # )
3328
+ # df.partition_by("foo", maintain_order: true)
3329
+ # # =>
3330
+ # # [shape: (2, 3)
3331
+ # # ┌─────┬─────┬─────┐
3332
+ # # │ foo ┆ N ┆ bar │
3333
+ # # │ --- ┆ --- ┆ --- │
3334
+ # # │ str ┆ i64 ┆ str │
3335
+ # # ╞═════╪═════╪═════╡
3336
+ # # │ A ┆ 1 ┆ k │
3337
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3338
+ # # │ A ┆ 2 ┆ l │
3339
+ # # └─────┴─────┴─────┘, shape: (2, 3)
3340
+ # # ┌─────┬─────┬─────┐
3341
+ # # │ foo ┆ N ┆ bar │
3342
+ # # │ --- ┆ --- ┆ --- │
3343
+ # # │ str ┆ i64 ┆ str │
3344
+ # # ╞═════╪═════╪═════╡
3345
+ # # │ B ┆ 2 ┆ m │
3346
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3347
+ # # │ B ┆ 4 ┆ m │
3348
+ # # └─────┴─────┴─────┘, shape: (1, 3)
3349
+ # # ┌─────┬─────┬─────┐
3350
+ # # │ foo ┆ N ┆ bar │
3351
+ # # │ --- ┆ --- ┆ --- │
3352
+ # # │ str ┆ i64 ┆ str │
3353
+ # # ╞═════╪═════╪═════╡
3354
+ # # │ C ┆ 2 ┆ l │
3355
+ # # └─────┴─────┴─────┘]
3356
+ #
3357
+ # @example
3358
+ # df.partition_by("foo", maintain_order: true, as_dict: true)
3359
+ # # =>
3360
+ # # {"A"=>shape: (2, 3)
3361
+ # # ┌─────┬─────┬─────┐
3362
+ # # │ foo ┆ N ┆ bar │
3363
+ # # │ --- ┆ --- ┆ --- │
3364
+ # # │ str ┆ i64 ┆ str │
3365
+ # # ╞═════╪═════╪═════╡
3366
+ # # │ A ┆ 1 ┆ k │
3367
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3368
+ # # │ A ┆ 2 ┆ l │
3369
+ # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
3370
+ # # ┌─────┬─────┬─────┐
3371
+ # # │ foo ┆ N ┆ bar │
3372
+ # # │ --- ┆ --- ┆ --- │
3373
+ # # │ str ┆ i64 ┆ str │
3374
+ # # ╞═════╪═════╪═════╡
3375
+ # # │ B ┆ 2 ┆ m │
3376
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3377
+ # # │ B ┆ 4 ┆ m │
3378
+ # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
3379
+ # # ┌─────┬─────┬─────┐
3380
+ # # │ foo ┆ N ┆ bar │
3381
+ # # │ --- ┆ --- ┆ --- │
3382
+ # # │ str ┆ i64 ┆ str │
3383
+ # # ╞═════╪═════╪═════╡
3384
+ # # │ C ┆ 2 ┆ l │
3385
+ # # └─────┴─────┴─────┘}
3386
+ def partition_by(groups, maintain_order: true, as_dict: false)
3387
+ if groups.is_a?(String)
3388
+ groups = [groups]
3389
+ elsif !groups.is_a?(Array)
3390
+ groups = Array(groups)
3391
+ end
3392
+
3393
+ if as_dict
3394
+ out = {}
3395
+ if groups.length == 1
3396
+ _df.partition_by(groups, maintain_order).each do |df|
3397
+ df = _from_rbdf(df)
3398
+ out[df[groups][0, 0]] = df
3399
+ end
3400
+ else
3401
+ _df.partition_by(groups, maintain_order).each do |df|
3402
+ df = _from_rbdf(df)
3403
+ out[df[groups].row(0)] = df
3404
+ end
3405
+ end
3406
+ out
3407
+ else
3408
+ _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
3409
+ end
3410
+ end
2192
3411
 
2193
3412
  # Shift values by the given period.
2194
3413
  #
@@ -3061,8 +4280,93 @@ module Polars
3061
4280
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
3062
4281
  end
3063
4282
 
3064
- # def fold
3065
- # end
4283
+ # Apply a horizontal reduction on a DataFrame.
4284
+ #
4285
+ # This can be used to effectively determine aggregations on a row level, and can
4286
+ # be applied to any DataType that can be supercast (cast to a similar parent
4287
+ # type).
4288
+ #
4289
+ # Examples of the supercast rules when applying an arithmetic operation on two
4290
+ # DataTypes are for instance:
4291
+ #
4292
+ # i8 + str = str
4293
+ # f32 + i64 = f32
4294
+ # f32 + f64 = f64
4295
+ #
4296
+ # @return [Series]
4297
+ #
4298
+ # @example A horizontal sum operation:
4299
+ # df = Polars::DataFrame.new(
4300
+ # {
4301
+ # "a" => [2, 1, 3],
4302
+ # "b" => [1, 2, 3],
4303
+ # "c" => [1.0, 2.0, 3.0]
4304
+ # }
4305
+ # )
4306
+ # df.fold { |s1, s2| s1 + s2 }
4307
+ # # =>
4308
+ # # shape: (3,)
4309
+ # # Series: 'a' [f64]
4310
+ # # [
4311
+ # # 4.0
4312
+ # # 5.0
4313
+ # # 9.0
4314
+ # # ]
4315
+ #
4316
+ # @example A horizontal minimum operation:
4317
+ # df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
4318
+ # df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
4319
+ # # =>
4320
+ # # shape: (3,)
4321
+ # # Series: 'a' [f64]
4322
+ # # [
4323
+ # # 1.0
4324
+ # # 1.0
4325
+ # # 3.0
4326
+ # # ]
4327
+ #
4328
+ # @example A horizontal string concatenation:
4329
+ # df = Polars::DataFrame.new(
4330
+ # {
4331
+ # "a" => ["foo", "bar", 2],
4332
+ # "b" => [1, 2, 3],
4333
+ # "c" => [1.0, 2.0, 3.0]
4334
+ # }
4335
+ # )
4336
+ # df.fold { |s1, s2| s1 + s2 }
4337
+ # # =>
4338
+ # # shape: (3,)
4339
+ # # Series: 'a' [str]
4340
+ # # [
4341
+ # # "foo11.0"
4342
+ # # "bar22.0"
4343
+ # # null
4344
+ # # ]
4345
+ #
4346
+ # @example A horizontal boolean or, similar to a row-wise .any():
4347
+ # df = Polars::DataFrame.new(
4348
+ # {
4349
+ # "a" => [false, false, true],
4350
+ # "b" => [false, true, false]
4351
+ # }
4352
+ # )
4353
+ # df.fold { |s1, s2| s1 | s2 }
4354
+ # # =>
4355
+ # # shape: (3,)
4356
+ # # Series: 'a' [bool]
4357
+ # # [
4358
+ # # false
4359
+ # # true
4360
+ # # true
4361
+ # # ]
4362
+ def fold(&operation)
4363
+ acc = to_series(0)
4364
+
4365
+ 1.upto(width - 1) do |i|
4366
+ acc = operation.call(acc, to_series(i))
4367
+ end
4368
+ acc
4369
+ end
3066
4370
 
3067
4371
  # Get a row as tuple, either by index or by predicate.
3068
4372
  #
@@ -3171,8 +4475,45 @@ module Polars
3171
4475
  select(Utils.col("*").take_every(n))
3172
4476
  end
3173
4477
 
3174
- # def hash_rows
3175
- # end
4478
+ # Hash and combine the rows in this DataFrame.
4479
+ #
4480
+ # The hash value is of type `:u64`.
4481
+ #
4482
+ # @param seed [Integer]
4483
+ # Random seed parameter. Defaults to 0.
4484
+ # @param seed_1 [Integer]
4485
+ # Random seed parameter. Defaults to `seed` if not set.
4486
+ # @param seed_2 [Integer]
4487
+ # Random seed parameter. Defaults to `seed` if not set.
4488
+ # @param seed_3 [Integer]
4489
+ # Random seed parameter. Defaults to `seed` if not set.
4490
+ #
4491
+ # @return [Series]
4492
+ #
4493
+ # @example
4494
+ # df = Polars::DataFrame.new(
4495
+ # {
4496
+ # "foo" => [1, nil, 3, 4],
4497
+ # "ham" => ["a", "b", nil, "d"]
4498
+ # }
4499
+ # )
4500
+ # df.hash_rows(seed: 42)
4501
+ # # =>
4502
+ # # shape: (4,)
4503
+ # # Series: '' [u64]
4504
+ # # [
4505
+ # # 4238614331852490969
4506
+ # # 17976148875586754089
4507
+ # # 4702262519505526977
4508
+ # # 18144177983981041107
4509
+ # # ]
4510
+ def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
4511
+ k0 = seed
4512
+ k1 = seed_1.nil? ? seed : seed_1
4513
+ k2 = seed_2.nil? ? seed : seed_2
4514
+ k3 = seed_3.nil? ? seed : seed_3
4515
+ Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
4516
+ end
3176
4517
 
3177
4518
  # Interpolate intermediate values. The interpolation method is linear.
3178
4519
  #
@@ -3297,7 +4638,19 @@ module Polars
3297
4638
  self._df = _df._clone
3298
4639
  end
3299
4640
 
3300
- def hash_to_rbdf(data, columns: nil)
4641
+ def _pos_idx(idx, dim)
4642
+ if idx >= 0
4643
+ idx
4644
+ else
4645
+ shape[dim] + idx
4646
+ end
4647
+ end
4648
+
4649
+ # def _pos_idxs
4650
+ # end
4651
+
4652
+ # @private
4653
+ def self.hash_to_rbdf(data, columns: nil)
3301
4654
  if !columns.nil?
3302
4655
  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
3303
4656
 
@@ -3313,11 +4666,34 @@ module Polars
3313
4666
  RbDataFrame.read_hash(data)
3314
4667
  end
3315
4668
 
3316
- def _unpack_columns(columns, lookup_names: nil)
3317
- [columns.keys, columns]
4669
+ # @private
4670
+ def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
4671
+ if columns.is_a?(Hash)
4672
+ columns = columns.to_a
4673
+ end
4674
+ column_names =
4675
+ (columns || []).map.with_index do |col, i|
4676
+ if col.is_a?(String)
4677
+ col || "column_#{i}"
4678
+ else
4679
+ col[0]
4680
+ end
4681
+ end
4682
+ if column_names.empty? && n_expected
4683
+ column_names = n_expected.times.map { |i| "column_#{i}" }
4684
+ end
4685
+ # TODO zip_longest
4686
+ lookup = column_names.zip(lookup_names || []).to_h
4687
+
4688
+ [
4689
+ column_names,
4690
+ (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4691
+ [lookup[col[0]] || col[0], col[1]]
4692
+ end
4693
+ ]
3318
4694
  end
3319
4695
 
3320
- def _handle_columns_arg(data, columns: nil)
4696
+ def self._handle_columns_arg(data, columns: nil)
3321
4697
  if columns.nil?
3322
4698
  data
3323
4699
  else
@@ -3335,14 +4711,39 @@ module Polars
3335
4711
  end
3336
4712
  end
3337
4713
 
3338
- def sequence_to_rbdf(data, columns: nil, orient: nil)
3339
- if columns || orient
3340
- raise Todo
4714
+ # @private
4715
+ def self.sequence_to_rbdf(data, columns: nil, orient: nil)
4716
+ if data.length == 0
4717
+ return hash_to_rbdf({}, columns: columns)
4718
+ end
4719
+
4720
+ if data[0].is_a?(Series)
4721
+ # series_names = data.map(&:name)
4722
+ # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
4723
+ data_series = []
4724
+ data.each do |s|
4725
+ data_series << s._s
4726
+ end
4727
+ elsif data[0].is_a?(Array)
4728
+ if orient.nil? && !columns.nil?
4729
+ orient = columns.length == data.length ? "col" : "row"
4730
+ end
4731
+
4732
+ if orient == "row"
4733
+ raise Todo
4734
+ elsif orient == "col" || orient.nil?
4735
+ raise Todo
4736
+ else
4737
+ raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
4738
+ end
3341
4739
  end
3342
- RbDataFrame.new(data.map(&:_s))
4740
+
4741
+ data_series = _handle_columns_arg(data_series, columns: columns)
4742
+ RbDataFrame.new(data_series)
3343
4743
  end
3344
4744
 
3345
- def series_to_rbdf(data, columns: nil)
4745
+ # @private
4746
+ def self.series_to_rbdf(data, columns: nil)
3346
4747
  if columns
3347
4748
  raise Todo
3348
4749
  end