polars-df 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,14 +26,14 @@ module Polars
  end

  if data.nil?
- self._df = hash_to_rbdf({}, columns: columns)
+ self._df = self.class.hash_to_rbdf({}, columns: columns)
  elsif data.is_a?(Hash)
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
- self._df = hash_to_rbdf(data, columns: columns)
+ self._df = self.class.hash_to_rbdf(data, columns: columns)
  elsif data.is_a?(Array)
- self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
+ self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
  elsif data.is_a?(Series)
- self._df = series_to_rbdf(data, columns: columns)
+ self._df = self.class.series_to_rbdf(data, columns: columns)
  else
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
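
This hunk reflects a refactor: the Hash/Array/Series conversion helpers were promoted from private instance methods to class methods (see the `self.hash_to_rbdf` and related definitions near the end of this diff), so `initialize` now reaches them through `self.class`. The accepted input shapes are unchanged; a minimal sketch (column names and values here are illustrative):

    require "polars-df"

    Polars::DataFrame.new                                   # nil data => empty frame
    Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})   # Hash of column => values
    Polars::DataFrame.new([Polars::Series.new("a", [1])])   # Array of Series
    Polars::DataFrame.new(Polars::Series.new("a", [1, 2]))  # single Series
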
@@ -46,11 +46,16 @@ module Polars
  df
  end

- # def self._from_hashes
- # end
+ # @private
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil)
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
+ _from_rbdf(rbdf)
+ end

- # def self._from_hash
- # end
+ # @private
+ def self._from_hash(data, columns: nil)
+ _from_rbdf(hash_to_rbdf(data, columns: columns))
+ end

  # def self._from_records
  # end
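
`_from_hashes` builds a frame from an array of row hashes via the native `read_hashes` binding, inferring the schema from the first `infer_schema_length` rows unless an explicit `schema` is passed. It is tagged `@private`, so the direct call below is only a sketch of the mechanics, not a public contract:

    rows = [
      {"foo" => 1, "bar" => "a"},
      {"foo" => 2, "bar" => "b"}
    ]
    df = Polars::DataFrame._from_hashes(rows)  # private helper; schema inferred from the rows
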
@@ -186,8 +191,14 @@ module Polars
  )
  end

- # def self._read_avro
- # end
+ # @private
+ def self._read_avro(file, columns: nil, n_rows: nil)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ projection, columns = Utils.handle_projection_columns(columns)
+ _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
+ end

  # @private
  def self._read_ipc(
@@ -343,7 +354,7 @@ module Polars
  # }
  # )
  # df.dtypes
- # # => [:i64, :f64, :str]
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
  def dtypes
  _df.dtypes
  end
@@ -361,7 +372,7 @@ module Polars
  # }
  # )
  # df.schema
- # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
  def schema
  columns.zip(dtypes).to_h
  end
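
Taken together, these two documentation hunks record a breaking change in 0.2.0: `dtypes` and `schema` now report data type classes (`Polars::Int64`, `Polars::Float64`, `Polars::Utf8`, ...) instead of the symbols (`:i64`, `:f64`, `:str`) returned by 0.1.x. Callers that matched on symbols need a one-line migration:

    # 0.1.x
    df.schema["foo"] == :i64
    # 0.2.0
    df.schema["foo"] == Polars::Int64
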
@@ -486,12 +497,6 @@ module Polars
  # def each
  # end

- # def _pos_idx
- # end
-
- # def _pos_idxs
- # end
-
  # Returns subset of the DataFrame.
  #
  # @return [Object]
@@ -554,19 +559,33 @@ module Polars

  # df[idx]
  if item.is_a?(Integer)
- return slice(_pos_idx(item, dim: 0), 1)
+ return slice(_pos_idx(item, 0), 1)
  end

  # df[..]
  if item.is_a?(Range)
  return Slice.new(self).apply(item)
  end
+
+ if Utils.is_str_sequence(item, allow_str: false)
+ # select multiple columns
+ # df[["foo", "bar"]]
+ return _from_rbdf(_df.select(item))
+ end
  end

  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
  end

+ # Set item.
+ #
+ # @return [Object]
  # def []=(key, value)
+ # if key.is_a?(String)
+ # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
+ # end
+
+ # raise Todo
  # end

  # no to_arrow
@@ -582,8 +601,24 @@ module Polars
  end
  end

- # def to_hashes / to_a
- # end
+ # Convert every row to a hash.
+ #
+ # Note that this is slow.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ # df.to_hashes
+ # # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
+ def to_hashes
+ rbdf = _df
+ names = columns
+
+ height.times.map do |i|
+ names.zip(rbdf.row_tuple(i)).to_h
+ end
+ end

  # def to_numo
  # end
@@ -762,8 +797,24 @@ module Polars
  nil
  end

- # def write_avro
- # end
+ # Write to Apache Avro file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["uncompressed", "snappy", "deflate"]
+ # Compression method. Defaults to "uncompressed".
+ #
+ # @return [nil]
+ def write_avro(file, compression = "uncompressed")
+ if compression.nil?
+ compression = "uncompressed"
+ end
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ _df.write_avro(file, compression)
+ end

  # Write to Arrow IPC binary stream or Feather file.
  #
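
Together with `_read_avro` earlier in this diff, this completes Avro round-tripping in 0.2.0. Assuming the reader is exposed through a public `Polars.read_avro` wrapper in the same pattern as the other `_read_*` helpers, a sketch:

    df = Polars::DataFrame.new({"a" => [1, 2, 3]})
    df.write_avro("/tmp/example.avro", "deflate")  # or "uncompressed" / "snappy"
    Polars.read_avro("/tmp/example.avro")
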
@@ -866,8 +917,84 @@ module Polars
  Utils.scale_bytes(sz, to: unit)
  end

- # def transpose
- # end
+ # Transpose a DataFrame over the diagonal.
+ #
+ # @param include_header [Boolean]
+ # If set, the column names will be added as first column.
+ # @param header_name [String]
+ # If `include_header` is set, this determines the name of the column that will
+ # be inserted.
+ # @param column_names [Array]
+ # Optional generator/iterator that yields column names. Will be used to
+ # replace the columns in the DataFrame.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This is a very expensive operation. Perhaps you can do it differently.
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
+ # df.transpose(include_header: true)
+ # # =>
+ # # shape: (2, 4)
+ # # ┌────────┬──────────┬──────────┬──────────┐
+ # # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪══════════╪══════════╪══════════╡
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
+ # # └────────┴──────────┴──────────┴──────────┘
+ #
+ # @example Replace the auto-generated column names with a list
+ # df.transpose(include_header: false, column_names: ["a", "b", "c"])
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 1 ┆ 2 ┆ 3 │
+ # # └─────┴─────┴─────┘
+ #
+ # @example Include the header as a separate column
+ # df.transpose(
+ # include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
+ # )
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬─────┐
+ # # │ foo ┆ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╪═════╡
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
+ # # └─────┴─────┴─────┴─────┘
+ def transpose(include_header: false, header_name: "column", column_names: nil)
+ df = _from_rbdf(_df.transpose(include_header, header_name))
+ if !column_names.nil?
+ names = []
+ n = df.width
+ if include_header
+ names << header_name
+ n -= 1
+ end
+
+ column_names = column_names.each
+ n.times do
+ names << column_names.next
+ end
+ df.columns = names
+ end
+ df
+ end

  # Reverse the DataFrame.
  #
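
Note that `column_names` is consumed through `each`/`next`, so any Enumerator works, not just arrays; a sketch with a lazily generated name sequence (names here are illustrative):

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
    names = (0..).lazy.map { |i| "col_#{i}" }.each  # endless enumerator
    df.transpose(column_names: names)
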
@@ -1051,25 +1178,25 @@ module Polars
  # df.describe
  # # =>
  # # shape: (7, 6)
- # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
- # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
- # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
- # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
- # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
- # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
- # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null │
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴──────┘
  def describe
  describe_cast = lambda do |stat|
  columns = []
@@ -1462,8 +1589,48 @@ module Polars
  _from_rbdf(_df.drop_nulls(subset))
  end

- # def pipe
- # end
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+ #
+ # @param func [Object]
+ # Callable; will receive the frame as the first parameter,
+ # followed by any given args/kwargs.
+ # @param args [Object]
+ # Arguments to pass to the UDF.
+ # @param kwargs [Object]
+ # Keyword arguments to pass to the UDF.
+ #
+ # @return [Object]
+ #
+ # @note
+ # It is recommended to use LazyFrame when piping operations, in order
+ # to fully take advantage of query optimization and parallelization.
+ # See {#lazy}.
+ #
+ # @example
+ # cast_str_to_int = lambda do |data, col_name:|
+ # data.with_column(Polars.col(col_name).cast(:i64))
+ # end
+ #
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
+ # df.pipe(cast_str_to_int, col_name: "b")
+ # # =>
+ # # shape: (4, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 10 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 20 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 30 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 4 ┆ 40 │
+ # # └─────┴─────┘
+ def pipe(func, *args, **kwargs, &block)
+ func.call(self, *args, **kwargs, &block)
+ end

  # Add a column at index 0 that counts the rows.
  #
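
Beyond the documented example, the signature also forwards an optional block to the callable, so a UDF can itself yield; and since Ruby lambdas respond to `call`, `pipe` composes naturally with any callable. A hedged sketch (the lambda and column name are illustrative):

    add_col = ->(df, name:) { df.with_column(Polars.lit(1).alias(name)) }
    df.pipe(add_col, name: "one")
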
@@ -1547,17 +1714,614 @@ module Polars
  )
  end

- # def groupby_rolling
- # end
+ # Create rolling groups based on a time column.
+ #
+ # Also works for index values of type `:i32` or `:i64`.
+ #
+ # Different from a `dynamic_groupby`, the windows are now determined by the
+ # individual values and are not of constant intervals. For constant intervals use
+ # *groupby_dynamic*.
+ #
+ # The `period` and `offset` arguments are created either from a timedelta, or
+ # by using the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
+ #
+ # - **"1i" # length 1**
+ # - **"10i" # length 10**
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If not, the output will not
+ # make sense.
+ #
+ # In case of a rolling groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param period [Object]
+ # Length of the window.
+ # @param offset [Object]
+ # Offset of the window. Default is -period.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [RollingGroupBy]
+ #
+ # @example
+ # dates = [
+ # "2020-01-01 13:45:48",
+ # "2020-01-01 16:42:13",
+ # "2020-01-01 16:45:09",
+ # "2020-01-02 18:12:48",
+ # "2020-01-03 19:45:32",
+ # "2020-01-08 23:16:43"
+ # ]
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+ # Polars.col("dt").str.strptime(:datetime)
+ # )
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # [
+ # Polars.sum("a").alias("sum_a"),
+ # Polars.min("a").alias("min_a"),
+ # Polars.max("a").alias("max_a")
+ # ]
+ # )
+ # # =>
+ # # shape: (6, 4)
+ # # ┌─────────────────────┬───────┬───────┬───────┐
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
+ # # └─────────────────────┴───────┴───────┴───────┘
+ def groupby_rolling(
+ index_column:,
+ period:,
+ offset: nil,
+ closed: "right",
+ by: nil
+ )
+ RollingGroupBy.new(self, index_column, period, offset, closed, by)
+ end

- # def groupby_dynamic
- # end
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
+ #
+ # Time windows are calculated and rows are assigned to windows. Different from a
+ # normal groupby is that a row can be a member of multiple groups. The time/index
+ # window could be seen as a rolling window, with a window size determined by
+ # dates/times/values instead of slots in the DataFrame.
+ #
+ # A window is defined by:
+ #
+ # - every: interval of the window
+ # - period: length of the window
+ # - offset: offset of the window
+ #
+ # The `every`, `period` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If not, the output will not
+ # make sense.
+ #
+ # In case of a dynamic groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param every
+ # Interval of the window.
+ # @param period
+ # Length of the window; if nil it is equal to 'every'.
+ # @param offset
+ # Offset of the window; if nil and period is nil it will be equal to negative
+ # `every`.
+ # @param truncate
+ # Truncate the time value to the window lower bound.
+ # @param include_boundaries
+ # Add the lower and upper bound of the window to the "_lower_bound" and
+ # "_upper_bound" columns. This will impact performance because it's harder to
+ # parallelize.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by
+ # Also group by this column/these columns.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "n" => 0..6
+ # }
+ # )
+ # # =>
+ # # shape: (7, 2)
+ # # ┌─────────────────────┬─────┐
+ # # │ time ┆ n │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 │
+ # # ╞═════════════════════╪═════╡
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
+ # # └─────────────────────┴─────┘
+ #
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # [
+ # Polars.col("time").min.alias("time_min"),
+ # Polars.col("time").max.alias("time_max")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+ # # │ time ┆ time_min ┆ time_max │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
+ #
+ # @example The window boundaries can also be added to the aggregation result.
+ # df.groupby_dynamic(
+ # "time", every: "1h", include_boundaries: true, closed: "right"
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example When closed="left", should not include right end of interval.
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # [
+ # Polars.col("time").count.alias("time_count"),
+ # Polars.col("time").list.alias("time_agg_list")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+ # # │ time ┆ time_count ┆ time_agg_list │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
+ #
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # [Polars.col("time").count.alias("time_count")]
+ # )
+ # # =>
+ # # shape: (5, 2)
+ # # ┌─────────────────────┬────────────┐
+ # # │ time ┆ time_count │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
+ # # └─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "time",
+ # every: "1h",
+ # closed: "both",
+ # by: "groups",
+ # include_boundaries: true
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (7, 5)
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupby on an index column.
+ # df = Polars::DataFrame.new(
+ # {
+ # "idx" => Polars.arange(0, 6, eager: true),
+ # "A" => ["A", "A", "B", "B", "B", "C"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "idx",
+ # every: "2i",
+ # period: "3i",
+ # include_boundaries: true,
+ # closed: "right"
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
+ def groupby_dynamic(
+ index_column,
+ every:,
+ period: nil,
+ offset: nil,
+ truncate: true,
+ include_boundaries: false,
+ closed: "left",
+ by: nil,
+ start_by: "window"
+ )
+ DynamicGroupBy.new(
+ self,
+ index_column,
+ every,
+ period,
+ offset,
+ truncate,
+ include_boundaries,
+ closed,
+ by,
+ start_by
+ )
+ end

- # def upsample
- # end
+ # Upsample a DataFrame at a regular frequency.
+ #
+ # @param time_column [Object]
+ # Time column that will be used to determine a date_range.
+ # Note that this column has to be sorted for the output to make sense.
+ # @param every [String]
+ # Interval will start 'every' duration.
+ # @param offset [String]
+ # Change the start of the date_range by this offset.
+ # @param by [Object]
+ # First group by these columns and then upsample for every group.
+ # @param maintain_order [Boolean]
+ # Keep the ordering predictable. This is slower.
+ #
+ # The `every` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @return [DataFrame]
+ #
+ # @example Upsample a DataFrame by a certain interval.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => [
+ # DateTime.new(2021, 2, 1),
+ # DateTime.new(2021, 4, 1),
+ # DateTime.new(2021, 5, 1),
+ # DateTime.new(2021, 6, 1)
+ # ],
+ # "groups" => ["A", "B", "A", "B"],
+ # "values" => [0, 1, 2, 3]
+ # }
+ # )
+ # df.upsample(
+ # time_column: "time", every: "1mo", by: "groups", maintain_order: true
+ # ).select(Polars.all.forward_fill)
+ # # =>
+ # # shape: (7, 3)
+ # # ┌─────────────────────┬────────┬────────┐
+ # # │ time ┆ groups ┆ values │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[ns] ┆ str ┆ i64 │
+ # # ╞═════════════════════╪════════╪════════╡
+ # # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
+ # # └─────────────────────┴────────┴────────┘
+ def upsample(
+ time_column:,
+ every:,
+ offset: nil,
+ by: nil,
+ maintain_order: false
+ )
+ if by.nil?
+ by = []
+ end
+ if by.is_a?(String)
+ by = [by]
+ end
+ if offset.nil?
+ offset = "0ns"
+ end

- # def join_asof
- # end
+ every = Utils._timedelta_to_pl_duration(every)
+ offset = Utils._timedelta_to_pl_duration(offset)
+
+ _from_rbdf(
+ _df.upsample(by, time_column, every, offset, maintain_order)
+ )
+ end
+
+ # Perform an asof join.
+ #
+ # This is similar to a left-join except that we match on nearest key rather than
+ # equal keys.
+ #
+ # Both DataFrames must be sorted by the asof_join key.
+ #
+ # For each row in the left DataFrame:
+ #
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+ #
+ # The default is "backward".
+ #
+ # @param other [DataFrame]
+ # DataFrame to join with.
+ # @param left_on [String]
+ # Join column of the left DataFrame.
+ # @param right_on [String]
+ # Join column of the right DataFrame.
+ # @param on [String]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # nil.
+ # @param by [Object]
+ # Join on these columns before doing the asof join.
+ # @param by_left [Object]
+ # Join on these columns before doing the asof join.
+ # @param by_right [Object]
+ # Join on these columns before doing the asof join.
+ # @param strategy ["backward", "forward"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param tolerance [Object]
+ # Numeric tolerance. By setting this the join will only be done if the near
+ # keys are within this distance. If an asof join is done on columns of dtype
+ # "Date", "Datetime", "Duration" or "Time" you can use the following string
+ # language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # gdp = Polars::DataFrame.new(
+ # {
+ # "date" => [
+ # DateTime.new(2016, 1, 1),
+ # DateTime.new(2017, 1, 1),
+ # DateTime.new(2018, 1, 1),
+ # DateTime.new(2019, 1, 1),
+ # ], # note record date: Jan 1st (sorted!)
+ # "gdp" => [4164, 4411, 4566, 4696]
+ # }
+ # )
+ # population = Polars::DataFrame.new(
+ # {
+ # "date" => [
+ # DateTime.new(2016, 5, 12),
+ # DateTime.new(2017, 5, 12),
+ # DateTime.new(2018, 5, 12),
+ # DateTime.new(2019, 5, 12),
+ # ], # note record date: May 12th (sorted!)
+ # "population" => [82.19, 82.66, 83.12, 83.52]
+ # }
+ # )
+ # population.join_asof(
+ # gdp, left_on: "date", right_on: "date", strategy: "backward"
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬──────┐
+ # # │ date ┆ population ┆ gdp │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[ns] ┆ f64 ┆ i64 │
+ # # ╞═════════════════════╪════════════╪══════╡
+ # # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
+ # # └─────────────────────┴────────────┴──────┘
+ def join_asof(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ by_left: nil,
+ by_right: nil,
+ by: nil,
+ strategy: "backward",
+ suffix: "_right",
+ tolerance: nil,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ lazy
+ .join_asof(
+ other.lazy,
+ left_on: left_on,
+ right_on: right_on,
+ on: on,
+ by_left: by_left,
+ by_right: by_right,
+ by: by,
+ strategy: strategy,
+ suffix: suffix,
+ tolerance: tolerance,
+ allow_parallel: allow_parallel,
+ force_parallel: force_parallel
+ )
+ .collect(no_optimization: true)
+ end

  # Join in SQL-like fashion.
  #
@@ -1675,8 +2439,78 @@ module Polars
  .collect(no_optimization: true)
  end

- # def apply
- # end
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
+ #
+ # The UDF will receive each row as a tuple of values: `udf(row)`.
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+ #
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelised (UDFs cannot).
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @param return_dtype [Symbol]
+ # Output type of the operation. If none given, Polars tries to infer the type.
+ # @param inference_size [Integer]
+ # Only used in the case when the custom function returns rows.
+ # This uses the first `n` rows to determine the output schema.
+ #
+ # @return [Object]
+ #
+ # @note
+ # The frame-level `apply` cannot track column names (as the UDF is a black-box
+ # that may arbitrarily drop, rearrange, transform, or add new columns); if you
+ # want to apply a UDF such that column names are preserved, you should use the
+ # expression-level `apply` syntax instead.
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
+ #
+ # @example Return a DataFrame by mapping each row to a tuple:
+ # df.apply { |t| [t[0] * 2, t[1] * 3] }
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────────┬──────────┐
+ # # │ column_0 ┆ column_1 │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════════╪══════════╡
+ # # │ 2 ┆ -3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 15 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 6 ┆ 24 │
+ # # └──────────┴──────────┘
+ #
+ # @example Return a Series by mapping each row to a scalar:
+ # df.apply { |t| t[0] * 2 + t[1] }
+ # # =>
+ # # shape: (3, 1)
+ # # ┌───────┐
+ # # │ apply │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═══════╡
+ # # │ 1 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 9 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 14 │
+ # # └───────┘
+ def apply(return_dtype: nil, inference_size: 256, &f)
+ out, is_df = _df.apply(f, return_dtype, inference_size)
+ if is_df
+ _from_rbdf(out)
+ else
+ _from_rbdf(Utils.wrap_s(out).to_frame._df)
+ end
+ end

  # Return a new DataFrame with the column added or replaced.
  #
@@ -2178,17 +3012,404 @@ module Polars
  lazy.explode(columns).collect(no_optimization: true)
  end

- # def pivot
- # end
+ # Create a spreadsheet-style pivot table as a DataFrame.
+ #
+ # @param values [Object]
+ # Column values to aggregate. Can be multiple columns if the *columns*
+ # argument contains multiple columns as well.
+ # @param index [Object]
+ # One or multiple keys to group by.
+ # @param columns [Object]
+ # Columns whose values will be used as the header of the output DataFrame.
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
+ # A predefined aggregate function str or an expression.
+ # @param maintain_order [Object]
+ # Sort the grouped keys so that the output order is predictable.
+ # @param sort_columns [Object]
+ # Sort the transposed columns by name. Default is by order of discovery.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => ["one", "one", "one", "two", "two", "two"],
+ # "bar" => ["A", "B", "C", "A", "B", "C"],
+ # "baz" => [1, 2, 3, 4, 5, 6]
+ # }
+ # )
+ # df.pivot(values: "baz", index: "foo", columns: "bar")
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬─────┐
+ # # │ foo ┆ A ┆ B ┆ C │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╪═════╡
+ # # │ one ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ two ┆ 4 ┆ 5 ┆ 6 │
+ # # └─────┴─────┴─────┴─────┘
+ def pivot(
+ values:,
+ index:,
+ columns:,
+ aggregate_fn: "first",
+ maintain_order: true,
+ sort_columns: false
+ )
+ if values.is_a?(String)
+ values = [values]
+ end
+ if index.is_a?(String)
+ index = [index]
+ end
+ if columns.is_a?(String)
+ columns = [columns]
+ end

- # def melt
- # end
+ if aggregate_fn.is_a?(String)
+ case aggregate_fn
+ when "first"
+ aggregate_fn = Polars.element.first
+ when "sum"
+ aggregate_fn = Polars.element.sum
+ when "max"
+ aggregate_fn = Polars.element.max
+ when "min"
+ aggregate_fn = Polars.element.min
+ when "mean"
+ aggregate_fn = Polars.element.mean
+ when "median"
+ aggregate_fn = Polars.element.median
+ when "last"
+ aggregate_fn = Polars.element.last
+ when "count"
+ aggregate_fn = Polars.count
+ else
+ raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
+ end
+ end

- # def unstack
- # end
+ _from_rbdf(
+ _df.pivot_expr(
+ values,
+ index,
+ columns,
+ aggregate_fn._rbexpr,
+ maintain_order,
+ sort_columns
+ )
+ )
+ end

- # def partition_by
- # end
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (id_vars), while all other columns, considered
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param id_vars [Object]
+ # Columns to use as identifier variables.
+ # @param value_vars [Object]
+ # Values to use as value variables.
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
+ # @param variable_name [String]
+ # Name to give to the `variable` column. Defaults to "variable".
+ # @param value_name [String]
+ # Name to give to the `value` column. Defaults to "value".
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # )
+ # df.melt(id_vars: "a", value_vars: ["b", "c"])
+ # # =>
+ # # shape: (6, 3)
+ # # ┌─────┬──────────┬───────┐
+ # # │ a ┆ variable ┆ value │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ i64 │
+ # # ╞═════╪══════════╪═══════╡
+ # # │ x ┆ b ┆ 1 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ b ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ b ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ x ┆ c ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ c ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ c ┆ 6 │
+ # # └─────┴──────────┴───────┘
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+ if value_vars.is_a?(String)
+ value_vars = [value_vars]
+ end
+ if id_vars.is_a?(String)
+ id_vars = [id_vars]
+ end
+ if value_vars.nil?
+ value_vars = []
+ end
+ if id_vars.nil?
+ id_vars = []
+ end
+ _from_rbdf(
+ _df.melt(id_vars, value_vars, value_name, variable_name)
+ )
+ end
+
+ # Unstack a long table to a wide form without doing an aggregation.
+ #
+ # This can be much faster than a pivot, because it can skip the grouping phase.
+ #
+ # @note
+ # This functionality is experimental and may be subject to changes
+ # without it being considered a breaking change.
+ #
+ # @param step [Integer]
+ # Number of rows in the unstacked frame.
+ # @param how ["vertical", "horizontal"]
+ # Direction of the unstack.
+ # @param columns [Object]
+ # Column to include in the operation.
+ # @param fill_values [Object]
+ # Fill values that don't fit the new size with this value.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "col1" => "A".."I",
+ # "col2" => Polars.arange(0, 9, eager: true)
+ # }
+ # )
+ # # =>
+ # # shape: (9, 2)
+ # # ┌──────┬──────┐
+ # # │ col1 ┆ col2 │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ A ┆ 0 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ B ┆ 1 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ C ┆ 2 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ D ┆ 3 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ ... ┆ ... │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ F ┆ 5 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ G ┆ 6 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ H ┆ 7 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ I ┆ 8 │
+ # # └──────┴──────┘
+ #
+ # @example
+ # df.unstack(step: 3, how: "vertical")
+ # # =>
+ # # shape: (3, 6)
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ # # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
+ #
+ # @example
+ # df.unstack(step: 3, how: "horizontal")
+ # # =>
+ # # shape: (3, 6)
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ # # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
+ def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
+ if !columns.nil?
+ df = select(columns)
+ else
+ df = self
+ end
+
+ height = df.height
+ if how == "vertical"
+ n_rows = step
+ n_cols = (height / n_rows.to_f).ceil
+ else
+ n_cols = step
+ n_rows = (height / n_cols.to_f).ceil
+ end
+
+ n_fill = n_cols * n_rows - height
+
+ if n_fill > 0
+ if !fill_values.is_a?(Array)
+ fill_values = [fill_values] * df.width
+ end
+
+ df = df.select(
+ df.get_columns.zip(fill_values).map do |s, next_fill|
+ s.extend_constant(next_fill, n_fill)
+ end
+ )
+ end
+
+ if how == "horizontal"
+ df = (
+ df.with_column(
+ (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
+ "__sort_order"
+ )
+ )
+ .sort("__sort_order")
+ .drop("__sort_order")
+ )
+ end
+
+ zfill_val = Math.log10(n_cols).floor + 1
+ slices =
+ df.get_columns.flat_map do |s|
+ n_cols.times.map do |slice_nbr|
+ s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
+ end
+ end
+
+ _from_rbdf(DataFrame.new(slices)._df)
+ end
+
+ # Split into multiple DataFrames partitioned by groups.
+ #
+ # @param groups [Object]
+ # Groups to partition by.
+ # @param maintain_order [Boolean]
+ # Keep predictable output order. This is slower as it requires an extra sort
+ # operation.
+ # @param as_dict [Boolean]
+ # If true, return the partitions in a hash keyed by the distinct group
+ # values instead of an array.
+ #
+ # @return [Object]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => ["A", "A", "B", "B", "C"],
+ # "N" => [1, 2, 2, 4, 2],
+ # "bar" => ["k", "l", "m", "m", "l"]
+ # }
+ # )
+ # df.partition_by("foo", maintain_order: true)
+ # # =>
+ # # [shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ A ┆ 1 ┆ k │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ A ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘, shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ B ┆ 2 ┆ m │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ B ┆ 4 ┆ m │
+ # # └─────┴─────┴─────┘, shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ C ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘]
+ #
+ # @example
+ # df.partition_by("foo", maintain_order: true, as_dict: true)
+ # # =>
+ # # {"A"=>shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ A ┆ 1 ┆ k │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ A ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ B ┆ 2 ┆ m │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ B ┆ 4 ┆ m │
+ # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ C ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘}
+ def partition_by(groups, maintain_order: true, as_dict: false)
+ if groups.is_a?(String)
+ groups = [groups]
+ elsif !groups.is_a?(Array)
+ groups = Array(groups)
+ end
+
+ if as_dict
+ out = {}
+ if groups.length == 1
+ _df.partition_by(groups, maintain_order).each do |df|
+ df = _from_rbdf(df)
+ out[df[groups][0, 0]] = df
+ end
+ else
+ _df.partition_by(groups, maintain_order).each do |df|
+ df = _from_rbdf(df)
+ out[df[groups].row(0)] = df
+ end
+ end
+ out
+ else
+ _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+ end
+ end

  # Shift values by the given period.
  #
@@ -3061,8 +4282,93 @@ module Polars
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
  end

- # def fold
- # end
+ # Apply a horizontal reduction on a DataFrame.
+ #
+ # This can be used to effectively determine aggregations on a row level, and can
+ # be applied to any DataType that can be supercast (cast to a similar parent
+ # type).
+ #
+ # Examples of the supercast rules when applying an arithmetic operation on
+ # two DataTypes:
+ #
+ # i8 + str = str
+ # f32 + i64 = f32
+ # f32 + f64 = f64
+ #
+ # @return [Series]
+ #
+ # @example A horizontal sum operation:
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [2, 1, 3],
+ # "b" => [1, 2, 3],
+ # "c" => [1.0, 2.0, 3.0]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 + s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [f64]
+ # # [
+ # # 4.0
+ # # 5.0
+ # # 9.0
+ # # ]
+ #
+ # @example A horizontal minimum operation:
+ # df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
+ # df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [f64]
+ # # [
+ # # 1.0
+ # # 1.0
+ # # 3.0
+ # # ]
+ #
+ # @example A horizontal string concatenation:
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["foo", "bar", 2],
+ # "b" => [1, 2, 3],
+ # "c" => [1.0, 2.0, 3.0]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 + s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [str]
+ # # [
+ # # "foo11.0"
+ # # "bar22.0"
+ # # null
+ # # ]
+ #
+ # @example A horizontal boolean or, similar to a row-wise .any():
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [false, false, true],
+ # "b" => [false, true, false]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 | s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [bool]
+ # # [
+ # # false
+ # # true
+ # # true
+ # # ]
+ def fold(&operation)
+ acc = to_series(0)
+
+ 1.upto(width - 1) do |i|
+ acc = operation.call(acc, to_series(i))
+ end
+ acc
+ end

  # Get a row as tuple, either by index or by predicate.
  #
@@ -3171,8 +4477,45 @@ module Polars
  select(Utils.col("*").take_every(n))
  end

- # def hash_rows
- # end
+ # Hash and combine the rows in this DataFrame.
+ #
+ # The hash value is of type `:u64`.
+ #
+ # @param seed [Integer]
+ # Random seed parameter. Defaults to 0.
+ # @param seed_1 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_2 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_3 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ #
+ # @return [Series]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, nil, 3, 4],
+ # "ham" => ["a", "b", nil, "d"]
+ # }
+ # )
+ # df.hash_rows(seed: 42)
+ # # =>
+ # # shape: (4,)
+ # # Series: '' [u64]
+ # # [
+ # # 4238614331852490969
+ # # 17976148875586754089
+ # # 4702262519505526977
+ # # 18144177983981041107
+ # # ]
+ def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
+ k0 = seed
+ k1 = seed_1.nil? ? seed : seed_1
+ k2 = seed_2.nil? ? seed : seed_2
+ k3 = seed_3.nil? ? seed : seed_3
+ Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
+ end

  # Interpolate intermediate values. The interpolation method is linear.
  #
@@ -3297,7 +4640,19 @@ module Polars
  self._df = _df._clone
  end

- def hash_to_rbdf(data, columns: nil)
+ def _pos_idx(idx, dim)
+ if idx >= 0
+ idx
+ else
+ shape[dim] + idx
+ end
+ end
+
+ # def _pos_idxs
+ # end
+
+ # @private
+ def self.hash_to_rbdf(data, columns: nil)
  if !columns.nil?
  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
@@ -3313,11 +4668,34 @@ module Polars
  RbDataFrame.read_hash(data)
  end

- def _unpack_columns(columns, lookup_names: nil)
- [columns.keys, columns]
+ # @private
+ def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
+ if columns.is_a?(Hash)
+ columns = columns.to_a
+ end
+ column_names =
+ (columns || []).map.with_index do |col, i|
+ if col.is_a?(String)
+ col || "column_#{i}"
+ else
+ col[0]
+ end
+ end
+ if column_names.empty? && n_expected
+ column_names = n_expected.times.map { |i| "column_#{i}" }
+ end
+ # TODO zip_longest
+ lookup = column_names.zip(lookup_names || []).to_h
+
+ [
+ column_names,
+ (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
+ [lookup[col[0]] || col[0], col[1]]
+ end
+ ]
  end

- def _handle_columns_arg(data, columns: nil)
+ def self._handle_columns_arg(data, columns: nil)
  if columns.nil?
  data
  else
@@ -3335,14 +4713,39 @@ module Polars
  end
  end

- def sequence_to_rbdf(data, columns: nil, orient: nil)
- if columns || orient
- raise Todo
+ # @private
+ def self.sequence_to_rbdf(data, columns: nil, orient: nil)
+ if data.length == 0
+ return hash_to_rbdf({}, columns: columns)
+ end
+
+ if data[0].is_a?(Series)
+ # series_names = data.map(&:name)
+ # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
+ data_series = []
+ data.each do |s|
+ data_series << s._s
+ end
+ elsif data[0].is_a?(Array)
+ if orient.nil? && !columns.nil?
+ orient = columns.length == data.length ? "col" : "row"
+ end
+
+ if orient == "row"
+ raise Todo
+ elsif orient == "col" || orient.nil?
+ raise Todo
+ else
+ raise ArgumentError, "orient must be one of {'col', 'row', nil}, got #{orient} instead."
+ end
  end
- RbDataFrame.new(data.map(&:_s))
+
+ data_series = _handle_columns_arg(data_series, columns: columns)
+ RbDataFrame.new(data_series)
  end

- def series_to_rbdf(data, columns: nil)
+ # @private
+ def self.series_to_rbdf(data, columns: nil)
  if columns
  raise Todo
  end
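
One behavioral detail worth noting in the rewritten `sequence_to_rbdf`: when the input is an array of arrays and `orient` is not given, it is inferred from the shape — "col" if the number of column names matches the number of input arrays, "row" otherwise — though both of those branches still `raise Todo` in this release; only Array-of-Series input is fully wired up. A sketch of the inference rule in isolation:

    columns = ["a", "b"]
    data = [[1, 2], [3, 4]]
    orient = columns.length == data.length ? "col" : "row"  # => "col"
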