polars-df 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,14 +26,14 @@ module Polars
26
26
  end
27
27
 
28
28
  if data.nil?
29
- self._df = hash_to_rbdf({}, columns: columns)
29
+ self._df = self.class.hash_to_rbdf({}, columns: columns)
30
30
  elsif data.is_a?(Hash)
31
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
32
- self._df = hash_to_rbdf(data, columns: columns)
32
+ self._df = self.class.hash_to_rbdf(data, columns: columns)
33
33
  elsif data.is_a?(Array)
34
- self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
34
+ self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
35
35
  elsif data.is_a?(Series)
36
- self._df = series_to_rbdf(data, columns: columns)
36
+ self._df = self.class.series_to_rbdf(data, columns: columns)
37
37
  else
38
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
39
39
  end
@@ -46,11 +46,16 @@ module Polars
46
46
  df
47
47
  end
48
48
 
49
- # def self._from_hashes
50
- # end
49
+ # @private
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
52
+ _from_rbdf(rbdf)
53
+ end
51
54
 
52
- # def self._from_hash
53
- # end
55
+ # @private
56
+ def self._from_hash(data, columns: nil)
57
+ _from_rbdf(hash_to_rbdf(data, columns: columns))
58
+ end
54
59
 
55
60
  # def self._from_records
56
61
  # end
@@ -186,8 +191,14 @@ module Polars
186
191
  )
187
192
  end
188
193
 
189
- # def self._read_avro
190
- # end
194
+ # @private
195
+ def self._read_avro(file, columns: nil, n_rows: nil)
196
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
197
+ file = Utils.format_path(file)
198
+ end
199
+ projection, columns = Utils.handle_projection_columns(columns)
200
+ _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
201
+ end
191
202
 
192
203
  # @private
193
204
  def self._read_ipc(
@@ -343,7 +354,7 @@ module Polars
343
354
  # }
344
355
  # )
345
356
  # df.dtypes
346
- # # => [:i64, :f64, :str]
357
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
347
358
  def dtypes
348
359
  _df.dtypes
349
360
  end
@@ -361,7 +372,7 @@ module Polars
361
372
  # }
362
373
  # )
363
374
  # df.schema
364
- # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
375
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
365
376
  def schema
366
377
  columns.zip(dtypes).to_h
367
378
  end
@@ -486,12 +497,6 @@ module Polars
486
497
  # def each
487
498
  # end
488
499
 
489
- # def _pos_idx
490
- # end
491
-
492
- # def _pos_idxs
493
- # end
494
-
495
500
  # Returns subset of the DataFrame.
496
501
  #
497
502
  # @return [Object]
@@ -554,19 +559,33 @@ module Polars
554
559
 
555
560
  # df[idx]
556
561
  if item.is_a?(Integer)
557
- return slice(_pos_idx(item, dim: 0), 1)
562
+ return slice(_pos_idx(item, 0), 1)
558
563
  end
559
564
 
560
565
  # df[..]
561
566
  if item.is_a?(Range)
562
567
  return Slice.new(self).apply(item)
563
568
  end
569
+
570
+ if Utils.is_str_sequence(item, allow_str: false)
571
+ # select multiple columns
572
+ # df[["foo", "bar"]]
573
+ return _from_rbdf(_df.select(item))
574
+ end
564
575
  end
565
576
 
566
577
  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
567
578
  end
568
579
 
580
+ # Set item.
581
+ #
582
+ # @return [Object]
569
583
  # def []=(key, value)
584
+ # if key.is_a?(String)
585
+ # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
586
+ # end
587
+
588
+ # raise Todo
570
589
  # end
571
590
 
572
591
  # no to_arrow
@@ -582,8 +601,24 @@ module Polars
582
601
  end
583
602
  end
584
603
 
585
- # def to_hashes / to_a
586
- # end
604
+ # Convert every row to a hash.
605
+ #
606
+ # Note that this is slow.
607
+ #
608
+ # @return [Array]
609
+ #
610
+ # @example
611
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
612
+ # df.to_hashes
613
+ # # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
614
+ def to_hashes
615
+ rbdf = _df
616
+ names = columns
617
+
618
+ height.times.map do |i|
619
+ names.zip(rbdf.row_tuple(i)).to_h
620
+ end
621
+ end
587
622
 
588
623
  # def to_numo
589
624
  # end
@@ -762,8 +797,24 @@ module Polars
762
797
  nil
763
798
  end
764
799
 
765
- # def write_avro
766
- # end
800
+ # Write to Apache Avro file.
801
+ #
802
+ # @param file [String]
803
+ # File path to which the file should be written.
804
+ # @param compression ["uncompressed", "snappy", "deflate"]
805
+ # Compression method. Defaults to "uncompressed".
806
+ #
807
+ # @return [nil]
808
+ def write_avro(file, compression = "uncompressed")
809
+ if compression.nil?
810
+ compression = "uncompressed"
811
+ end
812
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
813
+ file = Utils.format_path(file)
814
+ end
815
+
816
+ _df.write_avro(file, compression)
817
+ end
767
818
 
768
819
  # Write to Arrow IPC binary stream or Feather file.
769
820
  #
@@ -866,8 +917,84 @@ module Polars
866
917
  Utils.scale_bytes(sz, to: unit)
867
918
  end
868
919
 
869
- # def transpose
870
- # end
920
+ # Transpose a DataFrame over the diagonal.
921
+ #
922
+ # @param include_header [Boolean]
923
+ # If set, the column names will be added as first column.
924
+ # @param header_name [String]
925
+ # If `include_header` is set, this determines the name of the column that will
926
+ # be inserted.
927
+ # @param column_names [Array]
928
+ # Optional generator/iterator that yields column names. Will be used to
929
+ # replace the columns in the DataFrame.
930
+ #
931
+ # @return [DataFrame]
932
+ #
933
+ # @note
934
+ # This is a very expensive operation. Perhaps you can do it differently.
935
+ #
936
+ # @example
937
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
938
+ # df.transpose(include_header: true)
939
+ # # =>
940
+ # # shape: (2, 4)
941
+ # # ┌────────┬──────────┬──────────┬──────────┐
942
+ # # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
943
+ # # │ --- ┆ --- ┆ --- ┆ --- │
944
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
945
+ # # ╞════════╪══════════╪══════════╪══════════╡
946
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
947
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
948
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
949
+ # # └────────┴──────────┴──────────┴──────────┘
950
+ #
951
+ # @example Replace the auto-generated column names with a list
952
+ # df.transpose(include_header: false, column_names: ["a", "b", "c"])
953
+ # # =>
954
+ # # shape: (2, 3)
955
+ # # ┌─────┬─────┬─────┐
956
+ # # │ a ┆ b ┆ c │
957
+ # # │ --- ┆ --- ┆ --- │
958
+ # # │ i64 ┆ i64 ┆ i64 │
959
+ # # ╞═════╪═════╪═════╡
960
+ # # │ 1 ┆ 2 ┆ 3 │
961
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
962
+ # # │ 1 ┆ 2 ┆ 3 │
963
+ # # └─────┴─────┴─────┘
964
+ #
965
+ # @example Include the header as a separate column
966
+ # df.transpose(
967
+ # include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
968
+ # )
969
+ # # =>
970
+ # # shape: (2, 4)
971
+ # # ┌─────┬─────┬─────┬─────┐
972
+ # # │ foo ┆ a ┆ b ┆ c │
973
+ # # │ --- ┆ --- ┆ --- ┆ --- │
974
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
975
+ # # ╞═════╪═════╪═════╪═════╡
976
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
977
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
978
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
979
+ # # └─────┴─────┴─────┴─────┘
980
+ def transpose(include_header: false, header_name: "column", column_names: nil)
981
+ df = _from_rbdf(_df.transpose(include_header, header_name))
982
+ if !column_names.nil?
983
+ names = []
984
+ n = df.width
985
+ if include_header
986
+ names << header_name
987
+ n -= 1
988
+ end
989
+
990
+ column_names = column_names.each
991
+ n.times do
992
+ names << column_names.next
993
+ end
994
+ df.columns = names
995
+ end
996
+ df
997
+ end
871
998
 
872
999
  # Reverse the DataFrame.
873
1000
  #
@@ -1051,25 +1178,25 @@ module Polars
1051
1178
  # df.describe
1052
1179
  # # =>
1053
1180
  # # shape: (7, 6)
1054
- # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
1055
- # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1056
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1057
- # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1058
- # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
1059
- # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1060
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1061
- # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1062
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1063
- # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
1064
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1065
- # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
1066
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1067
- # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1068
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1069
- # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1070
- # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1071
- # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
1072
- # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
1181
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
1182
+ # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
1183
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1184
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
1185
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
1186
+ # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
1187
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1188
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
1189
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1190
+ # # │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null │
1191
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1192
+ # # │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null │
1193
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1194
+ # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
1195
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1196
+ # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
1197
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1198
+ # # │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null │
1199
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴──────┘
1073
1200
  def describe
1074
1201
  describe_cast = lambda do |stat|
1075
1202
  columns = []
@@ -1462,8 +1589,48 @@ module Polars
1462
1589
  _from_rbdf(_df.drop_nulls(subset))
1463
1590
  end
1464
1591
 
1465
- # def pipe
1466
- # end
1592
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
1593
+ #
1594
+ # @param func [Object]
1595
+ # Callable; will receive the frame as the first parameter,
1596
+ # followed by any given args/kwargs.
1597
+ # @param args [Object]
1598
+ # Arguments to pass to the UDF.
1599
+ # @param kwargs [Object]
1600
+ # Keyword arguments to pass to the UDF.
1601
+ #
1602
+ # @return [Object]
1603
+ #
1604
+ # @note
1605
+ # It is recommended to use LazyFrame when piping operations, in order
1606
+ # to fully take advantage of query optimization and parallelization.
1607
+ # See {#lazy}.
1608
+ #
1609
+ # @example
1610
+ # cast_str_to_int = lambda do |data, col_name:|
1611
+ # data.with_column(Polars.col(col_name).cast(:i64))
1612
+ # end
1613
+ #
1614
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
1615
+ # df.pipe(cast_str_to_int, col_name: "b")
1616
+ # # =>
1617
+ # # shape: (4, 2)
1618
+ # # ┌─────┬─────┐
1619
+ # # │ a ┆ b │
1620
+ # # │ --- ┆ --- │
1621
+ # # │ i64 ┆ i64 │
1622
+ # # ╞═════╪═════╡
1623
+ # # │ 1 ┆ 10 │
1624
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1625
+ # # │ 2 ┆ 20 │
1626
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1627
+ # # │ 3 ┆ 30 │
1628
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
1629
+ # # │ 4 ┆ 40 │
1630
+ # # └─────┴─────┘
1631
+ def pipe(func, *args, **kwargs, &block)
1632
+ func.call(self, *args, **kwargs, &block)
1633
+ end
1467
1634
 
1468
1635
  # Add a column at index 0 that counts the rows.
1469
1636
  #
@@ -1547,17 +1714,614 @@ module Polars
1547
1714
  )
1548
1715
  end
1549
1716
 
1550
- # def groupby_rolling
1551
- # end
1717
+ # Create rolling groups based on a time column.
1718
+ #
1719
+ # Also works for index values of type `:i32` or `:i64`.
1720
+ #
1721
+ # Different from a `dynamic_groupby`, the windows are now determined by the
1722
+ # individual values and are not of constant intervals. For constant intervals use
1723
+ # *groupby_dynamic*
1724
+ #
1725
+ # The `period` and `offset` arguments are created either from a timedelta, or
1726
+ # by using the following string language:
1727
+ #
1728
+ # - 1ns (1 nanosecond)
1729
+ # - 1us (1 microsecond)
1730
+ # - 1ms (1 millisecond)
1731
+ # - 1s (1 second)
1732
+ # - 1m (1 minute)
1733
+ # - 1h (1 hour)
1734
+ # - 1d (1 day)
1735
+ # - 1w (1 week)
1736
+ # - 1mo (1 calendar month)
1737
+ # - 1y (1 calendar year)
1738
+ # - 1i (1 index count)
1739
+ #
1740
+ # Or combine them:
1741
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1742
+ #
1743
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
1744
+ #
1745
+ # - **"1i" # length 1**
1746
+ # - **"10i" # length 10**
1747
+ #
1748
+ # @param index_column [Object]
1749
+ # Column used to group based on the time window.
1750
+ # Often of type Date/Datetime
1751
+ # This column must be sorted in ascending order. If not the output will not
1752
+ # make sense.
1753
+ #
1754
+ # In case of a rolling groupby on indices, dtype needs to be one of
1755
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1756
+ # performance matters use an `:i64` column.
1757
+ # @param period [Object]
1758
+ # Length of the window.
1759
+ # @param offset [Object]
1760
+ # Offset of the window. Default is -period.
1761
+ # @param closed ["right", "left", "both", "none"]
1762
+ # Define whether the temporal window interval is closed or not.
1763
+ # @param by [Object]
1764
+ # Also group by this column/these columns.
1765
+ #
1766
+ # @return [RollingGroupBy]
1767
+ #
1768
+ # @example
1769
+ # dates = [
1770
+ # "2020-01-01 13:45:48",
1771
+ # "2020-01-01 16:42:13",
1772
+ # "2020-01-01 16:45:09",
1773
+ # "2020-01-02 18:12:48",
1774
+ # "2020-01-03 19:45:32",
1775
+ # "2020-01-08 23:16:43"
1776
+ # ]
1777
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1778
+ # Polars.col("dt").str.strptime(:datetime)
1779
+ # )
1780
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1781
+ # [
1782
+ # Polars.sum("a").alias("sum_a"),
1783
+ # Polars.min("a").alias("min_a"),
1784
+ # Polars.max("a").alias("max_a")
1785
+ # ]
1786
+ # )
1787
+ # # =>
1788
+ # # shape: (6, 4)
1789
+ # # ┌─────────────────────┬───────┬───────┬───────┐
1790
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
1791
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1792
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
1793
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
1794
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
1795
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1796
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
1797
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1798
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
1799
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1800
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
1801
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1802
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1803
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
1804
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1805
+ # # └─────────────────────┴───────┴───────┴───────┘
1806
+ def groupby_rolling(
1807
+ index_column:,
1808
+ period:,
1809
+ offset: nil,
1810
+ closed: "right",
1811
+ by: nil
1812
+ )
1813
+ RollingGroupBy.new(self, index_column, period, offset, closed, by)
1814
+ end
1552
1815
 
1553
- # def groupby_dynamic
1554
- # end
1816
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
1817
+ #
1818
+ # Time windows are calculated and rows are assigned to windows. Different from a
1819
+ # normal groupby is that a row can be member of multiple groups. The time/index
1820
+ # window could be seen as a rolling window, with a window size determined by
1821
+ # dates/times/values instead of slots in the DataFrame.
1822
+ #
1823
+ # A window is defined by:
1824
+ #
1825
+ # - every: interval of the window
1826
+ # - period: length of the window
1827
+ # - offset: offset of the window
1828
+ #
1829
+ # The `every`, `period` and `offset` arguments are created with
1830
+ # the following string language:
1831
+ #
1832
+ # - 1ns (1 nanosecond)
1833
+ # - 1us (1 microsecond)
1834
+ # - 1ms (1 millisecond)
1835
+ # - 1s (1 second)
1836
+ # - 1m (1 minute)
1837
+ # - 1h (1 hour)
1838
+ # - 1d (1 day)
1839
+ # - 1w (1 week)
1840
+ # - 1mo (1 calendar month)
1841
+ # - 1y (1 calendar year)
1842
+ # - 1i (1 index count)
1843
+ #
1844
+ # Or combine them:
1845
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1846
+ #
1847
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
1848
+ #
1849
+ # - "1i" # length 1
1850
+ # - "10i" # length 10
1851
+ #
1852
+ # @param index_column
1853
+ # Column used to group based on the time window.
1854
+ # Often of type Date/Datetime
1855
+ # This column must be sorted in ascending order. If not the output will not
1856
+ # make sense.
1857
+ #
1858
+ # In case of a dynamic groupby on indices, dtype needs to be one of
1859
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1860
+ # performance matters use an `:i64` column.
1861
+ # @param every
1862
+ # Interval of the window.
1863
+ # @param period
1864
+ # Length of the window, if nil it is equal to 'every'.
1865
+ # @param offset
1866
+ # Offset of the window if nil and period is nil it will be equal to negative
1867
+ # `every`.
1868
+ # @param truncate
1869
+ # Truncate the time value to the window lower bound.
1870
+ # @param include_boundaries
1871
+ # Add the lower and upper bound of the window to the "_lower_bound" and
1872
+ # "_upper_bound" columns. This will impact performance because it's harder to
1873
+ # parallelize
1874
+ # @param closed ["right", "left", "both", "none"]
1875
+ # Define whether the temporal window interval is closed or not.
1876
+ # @param by
1877
+ # Also group by this column/these columns
1878
+ #
1879
+ # @return [DataFrame]
1880
+ #
1881
+ # @example
1882
+ # df = Polars::DataFrame.new(
1883
+ # {
1884
+ # "time" => Polars.date_range(
1885
+ # DateTime.new(2021, 12, 16),
1886
+ # DateTime.new(2021, 12, 16, 3),
1887
+ # "30m"
1888
+ # ),
1889
+ # "n" => 0..6
1890
+ # }
1891
+ # )
1892
+ # # =>
1893
+ # # shape: (7, 2)
1894
+ # # ┌─────────────────────┬─────┐
1895
+ # # │ time ┆ n │
1896
+ # # │ --- ┆ --- │
1897
+ # # │ datetime[μs] ┆ i64 │
1898
+ # # ╞═════════════════════╪═════╡
1899
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
1900
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1901
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1902
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1903
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1904
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1905
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1906
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1907
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1908
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1909
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1910
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1911
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1912
+ # # └─────────────────────┴─────┘
1913
+ #
1914
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1915
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1916
+ # [
1917
+ # Polars.col("time").min.alias("time_min"),
1918
+ # Polars.col("time").max.alias("time_max")
1919
+ # ]
1920
+ # )
1921
+ # # =>
1922
+ # # shape: (4, 3)
1923
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1924
+ # # │ time ┆ time_min ┆ time_max │
1925
+ # # │ --- ┆ --- ┆ --- │
1926
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1927
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1928
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1929
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1930
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1931
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1932
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1933
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1934
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1935
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1936
+ #
1937
+ # @example The window boundaries can also be added to the aggregation result.
1938
+ # df.groupby_dynamic(
1939
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1940
+ # ).agg([Polars.col("time").count.alias("time_count")])
1941
+ # # =>
1942
+ # # shape: (4, 4)
1943
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1944
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1945
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1946
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1947
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1948
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1949
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1950
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1951
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1952
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1953
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1954
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1955
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1956
+ #
1957
+ # @example When closed="left", should not include right end of interval.
1958
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1959
+ # [
1960
+ # Polars.col("time").count.alias("time_count"),
1961
+ # Polars.col("time").list.alias("time_agg_list")
1962
+ # ]
1963
+ # )
1964
+ # # =>
1965
+ # # shape: (4, 3)
1966
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1967
+ # # │ time ┆ time_count ┆ time_agg_list │
1968
+ # # │ --- ┆ --- ┆ --- │
1969
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1970
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1971
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1972
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1973
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1974
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1975
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1976
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1977
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1978
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1979
+ #
1980
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1981
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1982
+ # [Polars.col("time").count.alias("time_count")]
1983
+ # )
1984
+ # # =>
1985
+ # # shape: (5, 2)
1986
+ # # ┌─────────────────────┬────────────┐
1987
+ # # │ time ┆ time_count │
1988
+ # # │ --- ┆ --- │
1989
+ # # │ datetime[μs] ┆ u32 │
1990
+ # # ╞═════════════════════╪════════════╡
1991
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1992
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1993
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1994
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1995
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1996
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1997
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1998
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1999
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
2000
+ # # └─────────────────────┴────────────┘
2001
+ #
2002
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
2003
+ # df = Polars::DataFrame.new(
2004
+ # {
2005
+ # "time" => Polars.date_range(
2006
+ # DateTime.new(2021, 12, 16),
2007
+ # DateTime.new(2021, 12, 16, 3),
2008
+ # "30m"
2009
+ # ),
2010
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
2011
+ # }
2012
+ # )
2013
+ # df.groupby_dynamic(
2014
+ # "time",
2015
+ # every: "1h",
2016
+ # closed: "both",
2017
+ # by: "groups",
2018
+ # include_boundaries: true
2019
+ # ).agg([Polars.col("time").count.alias("time_count")])
2020
+ # # =>
2021
+ # # shape: (7, 5)
2022
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
2023
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
2024
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2025
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
2026
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
2027
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
2028
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2029
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
2030
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2031
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
2032
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2033
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
2034
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2035
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
2036
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2037
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
2038
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
2039
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
2040
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
2041
+ #
2042
+ # @example Dynamic groupby on an index column.
2043
+ # df = Polars::DataFrame.new(
2044
+ # {
2045
+ # "idx" => Polars.arange(0, 6, eager: true),
2046
+ # "A" => ["A", "A", "B", "B", "B", "C"]
2047
+ # }
2048
+ # )
2049
+ # df.groupby_dynamic(
2050
+ # "idx",
2051
+ # every: "2i",
2052
+ # period: "3i",
2053
+ # include_boundaries: true,
2054
+ # closed: "right"
2055
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
2056
+ # # =>
2057
+ # # shape: (3, 4)
2058
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2059
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2060
+ # # │ --- ┆ --- ┆ --- ┆ --- │
2061
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2062
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2063
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2064
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2065
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2066
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
2067
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
2068
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
2069
+ def groupby_dynamic(
2070
+ index_column,
2071
+ every:,
2072
+ period: nil,
2073
+ offset: nil,
2074
+ truncate: true,
2075
+ include_boundaries: false,
2076
+ closed: "left",
2077
+ by: nil,
2078
+ start_by: "window"
2079
+ )
2080
+ DynamicGroupBy.new(
2081
+ self,
2082
+ index_column,
2083
+ every,
2084
+ period,
2085
+ offset,
2086
+ truncate,
2087
+ include_boundaries,
2088
+ closed,
2089
+ by,
2090
+ start_by
2091
+ )
2092
+ end
1555
2093
 
1556
- # def upsample
1557
- # end
2094
+ # Upsample a DataFrame at a regular frequency.
2095
+ #
2096
+ # @param time_column [Object]
2097
+ # time column will be used to determine a date_range.
2098
+ # Note that this column has to be sorted for the output to make sense.
2099
+ # @param every [String]
2100
+ # interval will start 'every' duration
2101
+ # @param offset [String]
2102
+ # change the start of the date_range by this offset.
2103
+ # @param by [Object]
2104
+ # First group by these columns and then upsample for every group
2105
+ # @param maintain_order [Boolean]
2106
+ # Keep the ordering predictable. This is slower.
2107
+ #
2108
+ # The `every` and `offset` arguments are created with
2109
+ # the following string language:
2110
+ #
2111
+ # - 1ns (1 nanosecond)
2112
+ # - 1us (1 microsecond)
2113
+ # - 1ms (1 millisecond)
2114
+ # - 1s (1 second)
2115
+ # - 1m (1 minute)
2116
+ # - 1h (1 hour)
2117
+ # - 1d (1 day)
2118
+ # - 1w (1 week)
2119
+ # - 1mo (1 calendar month)
2120
+ # - 1y (1 calendar year)
2121
+ # - 1i (1 index count)
2122
+ #
2123
+ # Or combine them:
2124
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2125
+ #
2126
+ # @return [DataFrame]
2127
+ #
2128
+ # @example Upsample a DataFrame by a certain interval.
2129
+ # df = Polars::DataFrame.new(
2130
+ # {
2131
+ # "time" => [
2132
+ # DateTime.new(2021, 2, 1),
2133
+ # DateTime.new(2021, 4, 1),
2134
+ # DateTime.new(2021, 5, 1),
2135
+ # DateTime.new(2021, 6, 1)
2136
+ # ],
2137
+ # "groups" => ["A", "B", "A", "B"],
2138
+ # "values" => [0, 1, 2, 3]
2139
+ # }
2140
+ # )
2141
+ # df.upsample(
2142
+ # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2143
+ # ).select(Polars.all.forward_fill)
2144
+ # # =>
2145
+ # # shape: (7, 3)
2146
+ # # ┌─────────────────────┬────────┬────────┐
2147
+ # # │ time ┆ groups ┆ values │
2148
+ # # │ --- ┆ --- ┆ --- │
2149
+ # # │ datetime[ns] ┆ str ┆ i64 │
2150
+ # # ╞═════════════════════╪════════╪════════╡
2151
+ # # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
2152
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2153
+ # # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
2154
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2155
+ # # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
2156
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2157
+ # # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
2158
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2159
+ # # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
2160
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2161
+ # # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
2162
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
2163
+ # # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
2164
+ # # └─────────────────────┴────────┴────────┘
2165
+ def upsample(
2166
+ time_column:,
2167
+ every:,
2168
+ offset: nil,
2169
+ by: nil,
2170
+ maintain_order: false
2171
+ )
2172
+ if by.nil?
2173
+ by = []
2174
+ end
2175
+ if by.is_a?(String)
2176
+ by = [by]
2177
+ end
2178
+ if offset.nil?
2179
+ offset = "0ns"
2180
+ end
1558
2181
 
1559
- # def join_asof
1560
- # end
2182
+ every = Utils._timedelta_to_pl_duration(every)
2183
+ offset = Utils._timedelta_to_pl_duration(offset)
2184
+
2185
+ _from_rbdf(
2186
+ _df.upsample(by, time_column, every, offset, maintain_order)
2187
+ )
2188
+ end
2189
+
2190
+ # Perform an asof join.
2191
+ #
2192
+ # This is similar to a left-join except that we match on nearest key rather than
2193
+ # equal keys.
2194
+ #
2195
+ # Both DataFrames must be sorted by the asof_join key.
2196
+ #
2197
+ # For each row in the left DataFrame:
2198
+ #
2199
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
2200
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
2201
+ #
2202
+ # The default is "backward".
2203
+ #
2204
+ # @param other [DataFrame]
2205
+ # DataFrame to join with.
2206
+ # @param left_on [String]
2207
+ # Join column of the left DataFrame.
2208
+ # @param right_on [String]
2209
+ # Join column of the right DataFrame.
2210
+ # @param on [String]
2211
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2212
+ # None.
2213
+ # @param by [Object]
2214
+ # join on these columns before doing asof join
2215
+ # @param by_left [Object]
2216
+ # join on these columns before doing asof join
2217
+ # @param by_right [Object]
2218
+ # join on these columns before doing asof join
2219
+ # @param strategy ["backward", "forward"]
2220
+ # Join strategy.
2221
+ # @param suffix [String]
2222
+ # Suffix to append to columns with a duplicate name.
2223
+ # @param tolerance [Object]
2224
+ # Numeric tolerance. By setting this the join will only be done if the near
2225
+ # keys are within this distance. If an asof join is done on columns of dtype
2226
+ # "Date", "Datetime", "Duration" or "Time" you use the following string
2227
+ # language:
2228
+ #
2229
+ # - 1ns (1 nanosecond)
2230
+ # - 1us (1 microsecond)
2231
+ # - 1ms (1 millisecond)
2232
+ # - 1s (1 second)
2233
+ # - 1m (1 minute)
2234
+ # - 1h (1 hour)
2235
+ # - 1d (1 day)
2236
+ # - 1w (1 week)
2237
+ # - 1mo (1 calendar month)
2238
+ # - 1y (1 calendar year)
2239
+ # - 1i (1 index count)
2240
+ #
2241
+ # Or combine them:
2242
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
2243
+ #
2244
+ # @param allow_parallel [Boolean]
2245
+ # Allow the physical plan to optionally evaluate the computation of both
2246
+ # DataFrames up to the join in parallel.
2247
+ # @param force_parallel [Boolean]
2248
+ # Force the physical plan to evaluate the computation of both DataFrames up to
2249
+ # the join in parallel.
2250
+ #
2251
+ # @return [DataFrame]
2252
+ #
2253
+ # @example
2254
+ # gdp = Polars::DataFrame.new(
2255
+ # {
2256
+ # "date" => [
2257
+ # DateTime.new(2016, 1, 1),
2258
+ # DateTime.new(2017, 1, 1),
2259
+ # DateTime.new(2018, 1, 1),
2260
+ # DateTime.new(2019, 1, 1),
2261
+ # ], # note record date: Jan 1st (sorted!)
2262
+ # "gdp" => [4164, 4411, 4566, 4696]
2263
+ # }
2264
+ # )
2265
+ # population = Polars::DataFrame.new(
2266
+ # {
2267
+ # "date" => [
2268
+ # DateTime.new(2016, 5, 12),
2269
+ # DateTime.new(2017, 5, 12),
2270
+ # DateTime.new(2018, 5, 12),
2271
+ # DateTime.new(2019, 5, 12),
2272
+ # ], # note record date: May 12th (sorted!)
2273
+ # "population" => [82.19, 82.66, 83.12, 83.52]
2274
+ # }
2275
+ # )
2276
+ # population.join_asof(
2277
+ # gdp, left_on: "date", right_on: "date", strategy: "backward"
2278
+ # )
2279
+ # # =>
2280
+ # # shape: (4, 3)
2281
+ # # ┌─────────────────────┬────────────┬──────┐
2282
+ # # │ date ┆ population ┆ gdp │
2283
+ # # │ --- ┆ --- ┆ --- │
2284
+ # # │ datetime[ns] ┆ f64 ┆ i64 │
2285
+ # # ╞═════════════════════╪════════════╪══════╡
2286
+ # # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
2287
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2288
+ # # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
2289
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2290
+ # # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
2291
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
2292
+ # # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
2293
+ # # └─────────────────────┴────────────┴──────┘
2294
+ def join_asof(
2295
+ other,
2296
+ left_on: nil,
2297
+ right_on: nil,
2298
+ on: nil,
2299
+ by_left: nil,
2300
+ by_right: nil,
2301
+ by: nil,
2302
+ strategy: "backward",
2303
+ suffix: "_right",
2304
+ tolerance: nil,
2305
+ allow_parallel: true,
2306
+ force_parallel: false
2307
+ )
2308
+ lazy
2309
+ .join_asof(
2310
+ other.lazy,
2311
+ left_on: left_on,
2312
+ right_on: right_on,
2313
+ on: on,
2314
+ by_left: by_left,
2315
+ by_right: by_right,
2316
+ by: by,
2317
+ strategy: strategy,
2318
+ suffix: suffix,
2319
+ tolerance: tolerance,
2320
+ allow_parallel: allow_parallel,
2321
+ force_parallel: force_parallel
2322
+ )
2323
+ .collect(no_optimization: true)
2324
+ end
1561
2325
 
1562
2326
  # Join in SQL-like fashion.
1563
2327
  #
@@ -1675,8 +2439,78 @@ module Polars
1675
2439
  .collect(no_optimization: true)
1676
2440
  end
1677
2441
 
1678
- # def apply
1679
- # end
2442
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
2443
+ #
2444
+ # The UDF will receive each row as a tuple of values: `udf(row)`.
2445
+ #
2446
+ # Implementing logic using a Ruby function is almost always _significantly_
2447
+ # slower and more memory intensive than implementing the same logic using
2448
+ # the native expression API because:
2449
+ #
2450
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
2451
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
2452
+ # - Polars-native expressions can be parallelised (UDFs cannot).
2453
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
2454
+ #
2455
+ # Wherever possible you should strongly prefer the native expression API
2456
+ # to achieve the best performance.
2457
+ #
2458
+ # @param return_dtype [Symbol]
2459
+ # Output type of the operation. If none given, Polars tries to infer the type.
2460
+ # @param inference_size [Integer]
2461
+ # Only used in the case when the custom function returns rows.
2462
+ # This uses the first `n` rows to determine the output schema
2463
+ #
2464
+ # @return [Object]
2465
+ #
2466
+ # @note
2467
+ # The frame-level `apply` cannot track column names (as the UDF is a black-box
2468
+ # that may arbitrarily drop, rearrange, transform, or add new columns); if you
2469
+ # want to apply a UDF such that column names are preserved, you should use the
2470
+ # expression-level `apply` syntax instead.
2471
+ #
2472
+ # @example
2473
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2474
+ #
2475
+ # @example Return a DataFrame by mapping each row to a tuple:
2476
+ # df.apply { |t| [t[0] * 2, t[1] * 3] }
2477
+ # # =>
2478
+ # # shape: (3, 2)
2479
+ # # ┌──────────┬──────────┐
2480
+ # # │ column_0 ┆ column_1 │
2481
+ # # │ --- ┆ --- │
2482
+ # # │ i64 ┆ i64 │
2483
+ # # ╞══════════╪══════════╡
2484
+ # # │ 2 ┆ -3 │
2485
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2486
+ # # │ 4 ┆ 15 │
2487
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
2488
+ # # │ 6 ┆ 24 │
2489
+ # # └──────────┴──────────┘
2490
+ #
2491
+ # @example Return a Series by mapping each row to a scalar:
2492
+ # df.apply { |t| t[0] * 2 + t[1] }
2493
+ # # =>
2494
+ # # shape: (3, 1)
2495
+ # # ┌───────┐
2496
+ # # │ apply │
2497
+ # # │ --- │
2498
+ # # │ i64 │
2499
+ # # ╞═══════╡
2500
+ # # │ 1 │
2501
+ # # ├╌╌╌╌╌╌╌┤
2502
+ # # │ 9 │
2503
+ # # ├╌╌╌╌╌╌╌┤
2504
+ # # │ 14 │
2505
+ # # └───────┘
2506
+ def apply(return_dtype: nil, inference_size: 256, &f)
2507
+ out, is_df = _df.apply(f, return_dtype, inference_size)
2508
+ if is_df
2509
+ _from_rbdf(out)
2510
+ else
2511
+ _from_rbdf(Utils.wrap_s(out).to_frame._df)
2512
+ end
2513
+ end
1680
2514
 
1681
2515
  # Return a new DataFrame with the column added or replaced.
1682
2516
  #
@@ -2178,17 +3012,404 @@ module Polars
2178
3012
  lazy.explode(columns).collect(no_optimization: true)
2179
3013
  end
2180
3014
 
2181
- # def pivot
2182
- # end
3015
+ # Create a spreadsheet-style pivot table as a DataFrame.
3016
+ #
3017
+ # @param values [Object]
3018
+ # Column values to aggregate. Can be multiple columns if the *columns*
3019
+ # arguments contains multiple columns as well
3020
+ # @param index [Object]
3021
+ # One or multiple keys to group by
3022
+ # @param columns [Object]
3023
+ # Columns whose values will be used as the header of the output DataFrame
3024
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
3025
+ # A predefined aggregate function str or an expression.
3026
+ # @param maintain_order [Object]
3027
+ # Sort the grouped keys so that the output order is predictable.
3028
+ # @param sort_columns [Object]
3029
+ # Sort the transposed columns by name. Default is by order of discovery.
3030
+ #
3031
+ # @return [DataFrame]
3032
+ #
3033
+ # @example
3034
+ # df = Polars::DataFrame.new(
3035
+ # {
3036
+ # "foo" => ["one", "one", "one", "two", "two", "two"],
3037
+ # "bar" => ["A", "B", "C", "A", "B", "C"],
3038
+ # "baz" => [1, 2, 3, 4, 5, 6]
3039
+ # }
3040
+ # )
3041
+ # df.pivot(values: "baz", index: "foo", columns: "bar")
3042
+ # # =>
3043
+ # # shape: (2, 4)
3044
+ # # ┌─────┬─────┬─────┬─────┐
3045
+ # # │ foo ┆ A ┆ B ┆ C │
3046
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3047
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
3048
+ # # ╞═════╪═════╪═════╪═════╡
3049
+ # # │ one ┆ 1 ┆ 2 ┆ 3 │
3050
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3051
+ # # │ two ┆ 4 ┆ 5 ┆ 6 │
3052
+ # # └─────┴─────┴─────┴─────┘
3053
+ def pivot(
3054
+ values:,
3055
+ index:,
3056
+ columns:,
3057
+ aggregate_fn: "first",
3058
+ maintain_order: true,
3059
+ sort_columns: false
3060
+ )
3061
+ if values.is_a?(String)
3062
+ values = [values]
3063
+ end
3064
+ if index.is_a?(String)
3065
+ index = [index]
3066
+ end
3067
+ if columns.is_a?(String)
3068
+ columns = [columns]
3069
+ end
2183
3070
 
2184
- # def melt
2185
- # end
3071
+ if aggregate_fn.is_a?(String)
3072
+ case aggregate_fn
3073
+ when "first"
3074
+ aggregate_fn = Polars.element.first
3075
+ when "sum"
3076
+ aggregate_fn = Polars.element.sum
3077
+ when "max"
3078
+ aggregate_fn = Polars.element.max
3079
+ when "min"
3080
+ aggregate_fn = Polars.element.min
3081
+ when "mean"
3082
+ aggregate_fn = Polars.element.mean
3083
+ when "median"
3084
+ aggregate_fn = Polars.element.median
3085
+ when "last"
3086
+ aggregate_fn = Polars.element.last
3087
+ when "count"
3088
+ aggregate_fn = Polars.count
3089
+ else
3090
+ raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3091
+ end
3092
+ end
2186
3093
 
2187
- # def unstack
2188
- # end
3094
+ _from_rbdf(
3095
+ _df.pivot_expr(
3096
+ values,
3097
+ index,
3098
+ columns,
3099
+ aggregate_fn._rbexpr,
3100
+ maintain_order,
3101
+ sort_columns
3102
+ )
3103
+ )
3104
+ end
2189
3105
 
2190
- # def partition_by
2191
- # end
3106
+ # Unpivot a DataFrame from wide to long format.
3107
+ #
3108
+ # Optionally leaves identifiers set.
3109
+ #
3110
+ # This function is useful to massage a DataFrame into a format where one or more
3111
+ # columns are identifier variables (id_vars), while all other columns, considered
3112
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
3113
+ # two non-identifier columns, 'variable' and 'value'.
3114
+ #
3115
+ # @param id_vars [Object]
3116
+ # Columns to use as identifier variables.
3117
+ # @param value_vars [Object]
3118
+ # Values to use as identifier variables.
3119
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
3120
+ # @param variable_name [String]
3121
+ # Name to give to the `value` column. Defaults to "variable"
3122
+ # @param value_name [String]
3123
+ # Name to give to the `value` column. Defaults to "value"
3124
+ #
3125
+ # @return [DataFrame]
3126
+ #
3127
+ # @example
3128
+ # df = Polars::DataFrame.new(
3129
+ # {
3130
+ # "a" => ["x", "y", "z"],
3131
+ # "b" => [1, 3, 5],
3132
+ # "c" => [2, 4, 6]
3133
+ # }
3134
+ # )
3135
+ # df.melt(id_vars: "a", value_vars: ["b", "c"])
3136
+ # # =>
3137
+ # # shape: (6, 3)
3138
+ # # ┌─────┬──────────┬───────┐
3139
+ # # │ a ┆ variable ┆ value │
3140
+ # # │ --- ┆ --- ┆ --- │
3141
+ # # │ str ┆ str ┆ i64 │
3142
+ # # ╞═════╪══════════╪═══════╡
3143
+ # # │ x ┆ b ┆ 1 │
3144
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3145
+ # # │ y ┆ b ┆ 3 │
3146
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3147
+ # # │ z ┆ b ┆ 5 │
3148
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3149
+ # # │ x ┆ c ┆ 2 │
3150
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3151
+ # # │ y ┆ c ┆ 4 │
3152
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
3153
+ # # │ z ┆ c ┆ 6 │
3154
+ # # └─────┴──────────┴───────┘
3155
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
3156
+ if value_vars.is_a?(String)
3157
+ value_vars = [value_vars]
3158
+ end
3159
+ if id_vars.is_a?(String)
3160
+ id_vars = [id_vars]
3161
+ end
3162
+ if value_vars.nil?
3163
+ value_vars = []
3164
+ end
3165
+ if id_vars.nil?
3166
+ id_vars = []
3167
+ end
3168
+ _from_rbdf(
3169
+ _df.melt(id_vars, value_vars, value_name, variable_name)
3170
+ )
3171
+ end
3172
+
3173
+ # Unstack a long table to a wide form without doing an aggregation.
3174
+ #
3175
+ # This can be much faster than a pivot, because it can skip the grouping phase.
3176
+ #
3177
+ # @note
3178
+ # This functionality is experimental and may be subject to changes
3179
+ # without it being considered a breaking change.
3180
+ #
3181
+ # @param step Integer
3182
+ # Number of rows in the unstacked frame.
3183
+ # @param how ["vertical", "horizontal"]
3184
+ # Direction of the unstack.
3185
+ # @param columns [Object]
3186
+ # Column to include in the operation.
3187
+ # @param fill_values [Object]
3188
+ # Fill values that don't fit the new size with this value.
3189
+ #
3190
+ # @return [DataFrame]
3191
+ #
3192
+ # @example
3193
+ # df = Polars::DataFrame.new(
3194
+ # {
3195
+ # "col1" => "A".."I",
3196
+ # "col2" => Polars.arange(0, 9, eager: true)
3197
+ # }
3198
+ # )
3199
+ # # =>
3200
+ # # shape: (9, 2)
3201
+ # # ┌──────┬──────┐
3202
+ # # │ col1 ┆ col2 │
3203
+ # # │ --- ┆ --- │
3204
+ # # │ str ┆ i64 │
3205
+ # # ╞══════╪══════╡
3206
+ # # │ A ┆ 0 │
3207
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3208
+ # # │ B ┆ 1 │
3209
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3210
+ # # │ C ┆ 2 │
3211
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3212
+ # # │ D ┆ 3 │
3213
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3214
+ # # │ ... ┆ ... │
3215
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3216
+ # # │ F ┆ 5 │
3217
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3218
+ # # │ G ┆ 6 │
3219
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3220
+ # # │ H ┆ 7 │
3221
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
3222
+ # # │ I ┆ 8 │
3223
+ # # └──────┴──────┘
3224
+ #
3225
+ # @example
3226
+ # df.unstack(step: 3, how: "vertical")
3227
+ # # =>
3228
+ # # shape: (3, 6)
3229
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3230
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3231
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3232
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3233
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3234
+ # # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
3235
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3236
+ # # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
3237
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3238
+ # # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
3239
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3240
+ #
3241
+ # @example
3242
+ # df.unstack(step: 3, how: "horizontal")
3243
+ # # =>
3244
+ # # shape: (3, 6)
3245
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
3246
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
3247
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3248
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
3249
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
3250
+ # # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
3251
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3252
+ # # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
3253
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
3254
+ # # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
3255
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
3256
+ def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
3257
+ if !columns.nil?
3258
+ df = select(columns)
3259
+ else
3260
+ df = self
3261
+ end
3262
+
3263
+ height = df.height
3264
+ if how == "vertical"
3265
+ n_rows = step
3266
+ n_cols = (height / n_rows.to_f).ceil
3267
+ else
3268
+ n_cols = step
3269
+ n_rows = (height / n_cols.to_f).ceil
3270
+ end
3271
+
3272
+ n_fill = n_cols * n_rows - height
3273
+
3274
+ if n_fill > 0
3275
+ if !fill_values.is_a?(Array)
3276
+ fill_values = [fill_values] * df.width
3277
+ end
3278
+
3279
+ df = df.select(
3280
+ df.get_columns.zip(fill_values).map do |s, next_fill|
3281
+ s.extend_constant(next_fill, n_fill)
3282
+ end
3283
+ )
3284
+ end
3285
+
3286
+ if how == "horizontal"
3287
+ df = (
3288
+ df.with_column(
3289
+ (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
3290
+ "__sort_order"
3291
+ )
3292
+ )
3293
+ .sort("__sort_order")
3294
+ .drop("__sort_order")
3295
+ )
3296
+ end
3297
+
3298
+ zfill_val = Math.log10(n_cols).floor + 1
3299
+ slices =
3300
+ df.get_columns.flat_map do |s|
3301
+ n_cols.times.map do |slice_nbr|
3302
+ s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
3303
+ end
3304
+ end
3305
+
3306
+ _from_rbdf(DataFrame.new(slices)._df)
3307
+ end
3308
+
3309
+ # Split into multiple DataFrames partitioned by groups.
3310
+ #
3311
+ # @param groups [Object]
3312
+ # Groups to partition by.
3313
+ # @param maintain_order [Boolean]
3314
+ # Keep predictable output order. This is slower as it requires an extra sort
3315
+ # operation.
3316
+ # @param as_dict [Boolean]
3317
+ # If true, return the partitions in a dictionary keyed by the distinct group
3318
+ # values instead of a list.
3319
+ #
3320
+ # @return [Object]
3321
+ #
3322
+ # @example
3323
+ # df = Polars::DataFrame.new(
3324
+ # {
3325
+ # "foo" => ["A", "A", "B", "B", "C"],
3326
+ # "N" => [1, 2, 2, 4, 2],
3327
+ # "bar" => ["k", "l", "m", "m", "l"]
3328
+ # }
3329
+ # )
3330
+ # df.partition_by("foo", maintain_order: true)
3331
+ # # =>
3332
+ # # [shape: (2, 3)
3333
+ # # ┌─────┬─────┬─────┐
3334
+ # # │ foo ┆ N ┆ bar │
3335
+ # # │ --- ┆ --- ┆ --- │
3336
+ # # │ str ┆ i64 ┆ str │
3337
+ # # ╞═════╪═════╪═════╡
3338
+ # # │ A ┆ 1 ┆ k │
3339
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3340
+ # # │ A ┆ 2 ┆ l │
3341
+ # # └─────┴─────┴─────┘, shape: (2, 3)
3342
+ # # ┌─────┬─────┬─────┐
3343
+ # # │ foo ┆ N ┆ bar │
3344
+ # # │ --- ┆ --- ┆ --- │
3345
+ # # │ str ┆ i64 ┆ str │
3346
+ # # ╞═════╪═════╪═════╡
3347
+ # # │ B ┆ 2 ┆ m │
3348
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3349
+ # # │ B ┆ 4 ┆ m │
3350
+ # # └─────┴─────┴─────┘, shape: (1, 3)
3351
+ # # ┌─────┬─────┬─────┐
3352
+ # # │ foo ┆ N ┆ bar │
3353
+ # # │ --- ┆ --- ┆ --- │
3354
+ # # │ str ┆ i64 ┆ str │
3355
+ # # ╞═════╪═════╪═════╡
3356
+ # # │ C ┆ 2 ┆ l │
3357
+ # # └─────┴─────┴─────┘]
3358
+ #
3359
+ # @example
3360
+ # df.partition_by("foo", maintain_order: true, as_dict: true)
3361
+ # # =>
3362
+ # # {"A"=>shape: (2, 3)
3363
+ # # ┌─────┬─────┬─────┐
3364
+ # # │ foo ┆ N ┆ bar │
3365
+ # # │ --- ┆ --- ┆ --- │
3366
+ # # │ str ┆ i64 ┆ str │
3367
+ # # ╞═════╪═════╪═════╡
3368
+ # # │ A ┆ 1 ┆ k │
3369
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3370
+ # # │ A ┆ 2 ┆ l │
3371
+ # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
3372
+ # # ┌─────┬─────┬─────┐
3373
+ # # │ foo ┆ N ┆ bar │
3374
+ # # │ --- ┆ --- ┆ --- │
3375
+ # # │ str ┆ i64 ┆ str │
3376
+ # # ╞═════╪═════╪═════╡
3377
+ # # │ B ┆ 2 ┆ m │
3378
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
3379
+ # # │ B ┆ 4 ┆ m │
3380
+ # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
3381
+ # # ┌─────┬─────┬─────┐
3382
+ # # │ foo ┆ N ┆ bar │
3383
+ # # │ --- ┆ --- ┆ --- │
3384
+ # # │ str ┆ i64 ┆ str │
3385
+ # # ╞═════╪═════╪═════╡
3386
+ # # │ C ┆ 2 ┆ l │
3387
+ # # └─────┴─────┴─────┘}
3388
+ def partition_by(groups, maintain_order: true, as_dict: false)
3389
+ if groups.is_a?(String)
3390
+ groups = [groups]
3391
+ elsif !groups.is_a?(Array)
3392
+ groups = Array(groups)
3393
+ end
3394
+
3395
+ if as_dict
3396
+ out = {}
3397
+ if groups.length == 1
3398
+ _df.partition_by(groups, maintain_order).each do |df|
3399
+ df = _from_rbdf(df)
3400
+ out[df[groups][0, 0]] = df
3401
+ end
3402
+ else
3403
+ _df.partition_by(groups, maintain_order).each do |df|
3404
+ df = _from_rbdf(df)
3405
+ out[df[groups].row(0)] = df
3406
+ end
3407
+ end
3408
+ out
3409
+ else
3410
+ _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
3411
+ end
3412
+ end
2192
3413
 
2193
3414
  # Shift values by the given period.
2194
3415
  #
@@ -3061,8 +4282,93 @@ module Polars
3061
4282
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
3062
4283
  end
3063
4284
 
3064
- # def fold
3065
- # end
4285
+ # Apply a horizontal reduction on a DataFrame.
4286
+ #
4287
+ # This can be used to effectively determine aggregations on a row level, and can
4288
+ # be applied to any DataType that can be supercasted (casted to a similar parent
4289
+ # type).
4290
+ #
4291
+ # An example of the supercast rules when applying an arithmetic operation on two
4292
+ # DataTypes are for instance:
4293
+ #
4294
+ # i8 + str = str
4295
+ # f32 + i64 = f32
4296
+ # f32 + f64 = f64
4297
+ #
4298
+ # @return [Series]
4299
+ #
4300
+ # @example A horizontal sum operation:
4301
+ # df = Polars::DataFrame.new(
4302
+ # {
4303
+ # "a" => [2, 1, 3],
4304
+ # "b" => [1, 2, 3],
4305
+ # "c" => [1.0, 2.0, 3.0]
4306
+ # }
4307
+ # )
4308
+ # df.fold { |s1, s2| s1 + s2 }
4309
+ # # =>
4310
+ # # shape: (3,)
4311
+ # # Series: 'a' [f64]
4312
+ # # [
4313
+ # # 4.0
4314
+ # # 5.0
4315
+ # # 9.0
4316
+ # # ]
4317
+ #
4318
+ # @example A horizontal minimum operation:
4319
+ # df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
4320
+ # df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
4321
+ # # =>
4322
+ # # shape: (3,)
4323
+ # # Series: 'a' [f64]
4324
+ # # [
4325
+ # # 1.0
4326
+ # # 1.0
4327
+ # # 3.0
4328
+ # # ]
4329
+ #
4330
+ # @example A horizontal string concatenation:
4331
+ # df = Polars::DataFrame.new(
4332
+ # {
4333
+ # "a" => ["foo", "bar", 2],
4334
+ # "b" => [1, 2, 3],
4335
+ # "c" => [1.0, 2.0, 3.0]
4336
+ # }
4337
+ # )
4338
+ # df.fold { |s1, s2| s1 + s2 }
4339
+ # # =>
4340
+ # # shape: (3,)
4341
+ # # Series: 'a' [str]
4342
+ # # [
4343
+ # # "foo11.0"
4344
+ # # "bar22.0"
4345
+ # # null
4346
+ # # ]
4347
+ #
4348
+ # @example A horizontal boolean or, similar to a row-wise .any():
4349
+ # df = Polars::DataFrame.new(
4350
+ # {
4351
+ # "a" => [false, false, true],
4352
+ # "b" => [false, true, false]
4353
+ # }
4354
+ # )
4355
+ # df.fold { |s1, s2| s1 | s2 }
4356
+ # # =>
4357
+ # # shape: (3,)
4358
+ # # Series: 'a' [bool]
4359
+ # # [
4360
+ # # false
4361
+ # # true
4362
+ # # true
4363
+ # # ]
4364
+ def fold(&operation)
4365
+ acc = to_series(0)
4366
+
4367
+ 1.upto(width - 1) do |i|
4368
+ acc = operation.call(acc, to_series(i))
4369
+ end
4370
+ acc
4371
+ end
3066
4372
 
3067
4373
  # Get a row as tuple, either by index or by predicate.
3068
4374
  #
@@ -3171,8 +4477,45 @@ module Polars
3171
4477
  select(Utils.col("*").take_every(n))
3172
4478
  end
3173
4479
 
3174
- # def hash_rows
3175
- # end
4480
+ # Hash and combine the rows in this DataFrame.
4481
+ #
4482
+ # The hash value is of type `:u64`.
4483
+ #
4484
+ # @param seed [Integer]
4485
+ # Random seed parameter. Defaults to 0.
4486
+ # @param seed_1 [Integer]
4487
+ # Random seed parameter. Defaults to `seed` if not set.
4488
+ # @param seed_2 [Integer]
4489
+ # Random seed parameter. Defaults to `seed` if not set.
4490
+ # @param seed_3 [Integer]
4491
+ # Random seed parameter. Defaults to `seed` if not set.
4492
+ #
4493
+ # @return [Series]
4494
+ #
4495
+ # @example
4496
+ # df = Polars::DataFrame.new(
4497
+ # {
4498
+ # "foo" => [1, nil, 3, 4],
4499
+ # "ham" => ["a", "b", nil, "d"]
4500
+ # }
4501
+ # )
4502
+ # df.hash_rows(seed: 42)
4503
+ # # =>
4504
+ # # shape: (4,)
4505
+ # # Series: '' [u64]
4506
+ # # [
4507
+ # # 4238614331852490969
4508
+ # # 17976148875586754089
4509
+ # # 4702262519505526977
4510
+ # # 18144177983981041107
4511
+ # # ]
4512
+ def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
4513
+ k0 = seed
4514
+ k1 = seed_1.nil? ? seed : seed_1
4515
+ k2 = seed_2.nil? ? seed : seed_2
4516
+ k3 = seed_3.nil? ? seed : seed_3
4517
+ Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
4518
+ end
3176
4519
 
3177
4520
  # Interpolate intermediate values. The interpolation method is linear.
3178
4521
  #
@@ -3297,7 +4640,19 @@ module Polars
3297
4640
  self._df = _df._clone
3298
4641
  end
3299
4642
 
3300
- def hash_to_rbdf(data, columns: nil)
4643
+ def _pos_idx(idx, dim)
4644
+ if idx >= 0
4645
+ idx
4646
+ else
4647
+ shape[dim] + idx
4648
+ end
4649
+ end
4650
+
4651
+ # def _pos_idxs
4652
+ # end
4653
+
4654
+ # @private
4655
+ def self.hash_to_rbdf(data, columns: nil)
3301
4656
  if !columns.nil?
3302
4657
  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
3303
4658
 
@@ -3313,11 +4668,34 @@ module Polars
3313
4668
  RbDataFrame.read_hash(data)
3314
4669
  end
3315
4670
 
3316
- def _unpack_columns(columns, lookup_names: nil)
3317
- [columns.keys, columns]
4671
+ # @private
4672
+ def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
4673
+ if columns.is_a?(Hash)
4674
+ columns = columns.to_a
4675
+ end
4676
+ column_names =
4677
+ (columns || []).map.with_index do |col, i|
4678
+ if col.is_a?(String)
4679
+ col || "column_#{i}"
4680
+ else
4681
+ col[0]
4682
+ end
4683
+ end
4684
+ if column_names.empty? && n_expected
4685
+ column_names = n_expected.times.map { |i| "column_#{i}" }
4686
+ end
4687
+ # TODO zip_longest
4688
+ lookup = column_names.zip(lookup_names || []).to_h
4689
+
4690
+ [
4691
+ column_names,
4692
+ (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4693
+ [lookup[col[0]] || col[0], col[1]]
4694
+ end
4695
+ ]
3318
4696
  end
3319
4697
 
3320
- def _handle_columns_arg(data, columns: nil)
4698
+ def self._handle_columns_arg(data, columns: nil)
3321
4699
  if columns.nil?
3322
4700
  data
3323
4701
  else
@@ -3335,14 +4713,39 @@ module Polars
3335
4713
  end
3336
4714
  end
3337
4715
 
3338
- def sequence_to_rbdf(data, columns: nil, orient: nil)
3339
- if columns || orient
3340
- raise Todo
4716
+ # @private
4717
+ def self.sequence_to_rbdf(data, columns: nil, orient: nil)
4718
+ if data.length == 0
4719
+ return hash_to_rbdf({}, columns: columns)
4720
+ end
4721
+
4722
+ if data[0].is_a?(Series)
4723
+ # series_names = data.map(&:name)
4724
+ # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
4725
+ data_series = []
4726
+ data.each do |s|
4727
+ data_series << s._s
4728
+ end
4729
+ elsif data[0].is_a?(Array)
4730
+ if orient.nil? && !columns.nil?
4731
+ orient = columns.length == data.length ? "col" : "row"
4732
+ end
4733
+
4734
+ if orient == "row"
4735
+ raise Todo
4736
+ elsif orient == "col" || orient.nil?
4737
+ raise Todo
4738
+ else
4739
+ raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
4740
+ end
3341
4741
  end
3342
- RbDataFrame.new(data.map(&:_s))
4742
+
4743
+ data_series = _handle_columns_arg(data_series, columns: columns)
4744
+ RbDataFrame.new(data_series)
3343
4745
  end
3344
4746
 
3345
- def series_to_rbdf(data, columns: nil)
4747
+ # @private
4748
+ def self.series_to_rbdf(data, columns: nil)
3346
4749
  if columns
3347
4750
  raise Todo
3348
4751
  end