polars-df 0.5.0-arm64-darwin → 0.7.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,15 +20,9 @@ module Polars
20
20
  # this does not yield conclusive results, column orientation is used.
21
21
  def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
22
22
  schema ||= columns
23
- raise Todo if schema_overrides
24
23
 
25
- # TODO deprecate in favor of read_sql
26
24
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
27
- result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
28
- data = {}
29
- result.columns.each_with_index do |k, i|
30
- data[k] = result.rows.map { |r| r[i] }
31
- end
25
+ raise ArgumentError, "Use read_database instead"
32
26
  end
33
27
 
34
28
  if data.nil?
@@ -36,7 +30,7 @@ module Polars
36
30
  elsif data.is_a?(Hash)
37
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
38
32
  self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
39
- elsif data.is_a?(Array)
33
+ elsif data.is_a?(::Array)
40
34
  self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
41
35
  elsif data.is_a?(Series)
42
36
  self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
@@ -116,7 +110,7 @@ module Polars
116
110
  dtypes.each do|k, v|
117
111
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
118
112
  end
119
- elsif dtypes.is_a?(Array)
113
+ elsif dtypes.is_a?(::Array)
120
114
  dtype_slice = dtypes
121
115
  else
122
116
  raise ArgumentError, "dtype arg should be list or dict"
@@ -590,7 +584,7 @@ module Polars
590
584
 
591
585
  # df[2, ..] (select row as df)
592
586
  if row_selection.is_a?(Integer)
593
- if col_selection.is_a?(Array)
587
+ if col_selection.is_a?(::Array)
594
588
  df = self[0.., col_selection]
595
589
  return df.slice(row_selection, 1)
596
590
  end
@@ -611,7 +605,7 @@ module Polars
611
605
  return series[row_selection]
612
606
  end
613
607
 
614
- if col_selection.is_a?(Array)
608
+ if col_selection.is_a?(::Array)
615
609
  # df[.., [1, 2]]
616
610
  if Utils.is_int_sequence(col_selection)
617
611
  series_list = col_selection.map { |i| to_series(i) }
@@ -641,7 +635,7 @@ module Polars
641
635
  return Slice.new(self).apply(item)
642
636
  end
643
637
 
644
- if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
638
+ if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
645
639
  # select multiple columns
646
640
  # df[["foo", "bar"]]
647
641
  return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -684,13 +678,13 @@ module Polars
684
678
  end
685
679
 
686
680
  if Utils.strlike?(key)
687
- if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
681
+ if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
688
682
  value = Series.new(value)
689
683
  elsif !value.is_a?(Series)
690
684
  value = Polars.lit(value)
691
685
  end
692
686
  self._df = with_column(value.alias(key.to_s))._df
693
- elsif key.is_a?(Array)
687
+ elsif key.is_a?(::Array)
694
688
  row_selection, col_selection = key
695
689
 
696
690
  if Utils.strlike?(col_selection)
@@ -905,6 +899,7 @@ module Polars
905
899
  def write_csv(
906
900
  file = nil,
907
901
  has_header: true,
902
+ include_header: nil,
908
903
  sep: ",",
909
904
  quote: '"',
910
905
  batch_size: 1024,
@@ -914,6 +909,8 @@ module Polars
914
909
  float_precision: nil,
915
910
  null_value: nil
916
911
  )
912
+ include_header = has_header if include_header.nil?
913
+
917
914
  if sep.length > 1
918
915
  raise ArgumentError, "only single byte separator is allowed"
919
916
  elsif quote.length > 1
@@ -927,7 +924,7 @@ module Polars
927
924
  buffer.set_encoding(Encoding::BINARY)
928
925
  _df.write_csv(
929
926
  buffer,
930
- has_header,
927
+ include_header,
931
928
  sep.ord,
932
929
  quote.ord,
933
930
  batch_size,
@@ -946,7 +943,7 @@ module Polars
946
943
 
947
944
  _df.write_csv(
948
945
  file,
949
- has_header,
946
+ include_header,
950
947
  sep.ord,
951
948
  quote.ord,
952
949
  batch_size,
@@ -994,14 +991,21 @@ module Polars
994
991
  #
995
992
  # @return [nil]
996
993
  def write_ipc(file, compression: "uncompressed")
997
- if compression.nil?
998
- compression = "uncompressed"
994
+ return_bytes = file.nil?
995
+ if return_bytes
996
+ file = StringIO.new
997
+ file.set_encoding(Encoding::BINARY)
999
998
  end
1000
999
  if Utils.pathlike?(file)
1001
1000
  file = Utils.normalise_filepath(file)
1002
1001
  end
1003
1002
 
1003
+ if compression.nil?
1004
+ compression = "uncompressed"
1005
+ end
1006
+
1004
1007
  _df.write_ipc(file, compression)
1008
+ return_bytes ? file.string : nil
1005
1009
  end
1006
1010
 
1007
1011
  # Write to Apache Parquet file.
@@ -1144,22 +1148,8 @@ module Polars
1144
1148
  # # │ b ┆ 1 ┆ 2 ┆ 3 │
1145
1149
  # # └─────┴─────┴─────┴─────┘
1146
1150
  def transpose(include_header: false, header_name: "column", column_names: nil)
1147
- df = _from_rbdf(_df.transpose(include_header, header_name))
1148
- if !column_names.nil?
1149
- names = []
1150
- n = df.width
1151
- if include_header
1152
- names << header_name
1153
- n -= 1
1154
- end
1155
-
1156
- column_names = column_names.each
1157
- n.times do
1158
- names << column_names.next
1159
- end
1160
- df.columns = names
1161
- end
1162
- df
1151
+ keep_names_as = include_header ? header_name : nil
1152
+ _from_rbdf(_df.transpose(keep_names_as, column_names))
1163
1153
  end
1164
1154
 
1165
1155
  # Reverse the DataFrame.
@@ -1491,13 +1481,9 @@ module Polars
1491
1481
  # # │ 1 ┆ 6.0 ┆ a │
1492
1482
  # # └─────┴─────┴─────┘
1493
1483
  def sort(by, reverse: false, nulls_last: false)
1494
- if by.is_a?(Array) || by.is_a?(Expr)
1495
- lazy
1496
- .sort(by, reverse: reverse, nulls_last: nulls_last)
1497
- .collect(no_optimization: true, string_cache: false)
1498
- else
1499
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1500
- end
1484
+ lazy
1485
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1486
+ .collect(no_optimization: true)
1501
1487
  end
1502
1488
 
1503
1489
  # Sort the DataFrame by column in-place.
@@ -1808,13 +1794,13 @@ module Polars
1808
1794
  _from_rbdf(_df.with_row_count(name, offset))
1809
1795
  end
1810
1796
 
1811
- # Start a groupby operation.
1797
+ # Start a group by operation.
1812
1798
  #
1813
1799
  # @param by [Object]
1814
1800
  # Column(s) to group by.
1815
1801
  # @param maintain_order [Boolean]
1816
1802
  # Make sure that the order of the groups remain consistent. This is more
1817
- # expensive than a default groupby. Note that this only works in expression
1803
+ # expensive than a default group by. Note that this only works in expression
1818
1804
  # aggregations.
1819
1805
  #
1820
1806
  # @return [GroupBy]
@@ -1827,7 +1813,7 @@ module Polars
1827
1813
  # "c" => [6, 5, 4, 3, 2, 1]
1828
1814
  # }
1829
1815
  # )
1830
- # df.groupby("a").agg(Polars.col("b").sum).sort("a")
1816
+ # df.group_by("a").agg(Polars.col("b").sum).sort("a")
1831
1817
  # # =>
1832
1818
  # # shape: (3, 2)
1833
1819
  # # ┌─────┬─────┐
@@ -1839,25 +1825,26 @@ module Polars
1839
1825
  # # │ b ┆ 11 │
1840
1826
  # # │ c ┆ 6 │
1841
1827
  # # └─────┴─────┘
1842
- def groupby(by, maintain_order: false)
1828
+ def group_by(by, maintain_order: false)
1843
1829
  if !Utils.bool?(maintain_order)
1844
- raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
1830
+ raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
1845
1831
  end
1846
1832
  GroupBy.new(
1847
- _df,
1833
+ self,
1848
1834
  by,
1849
- self.class,
1850
1835
  maintain_order: maintain_order
1851
1836
  )
1852
1837
  end
1838
+ alias_method :groupby, :group_by
1839
+ alias_method :group, :group_by
1853
1840
 
1854
1841
  # Create rolling groups based on a time column.
1855
1842
  #
1856
1843
  # Also works for index values of type `:i32` or `:i64`.
1857
1844
  #
1858
- # Different from a `dynamic_groupby` the windows are now determined by the
1845
+ # Different from a `dynamic_group_by` the windows are now determined by the
1859
1846
  # individual values and are not of constant intervals. For constant intervals use
1860
- # *groupby_dynamic*
1847
+ # *group_by_dynamic*
1861
1848
  #
1862
1849
  # The `period` and `offset` arguments are created either from a timedelta, or
1863
1850
  # by using the following string language:
@@ -1877,7 +1864,7 @@ module Polars
1877
1864
  # Or combine them:
1878
1865
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1879
1866
  #
1880
- # In case of a groupby_rolling on an integer column, the windows are defined by:
1867
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
1881
1868
  #
1882
1869
  # - **"1i" # length 1**
1883
1870
  # - **"10i" # length 10**
@@ -1888,7 +1875,7 @@ module Polars
1888
1875
  # This column must be sorted in ascending order. If not the output will not
1889
1876
  # make sense.
1890
1877
  #
1891
- # In case of a rolling groupby on indices, dtype needs to be one of
1878
+ # In case of a rolling group by on indices, dtype needs to be one of
1892
1879
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1893
1880
  # performance matters use an `:i64` column.
1894
1881
  # @param period [Object]
@@ -1899,6 +1886,12 @@ module Polars
1899
1886
  # Define whether the temporal window interval is closed or not.
1900
1887
  # @param by [Object]
1901
1888
  # Also group by this column/these columns.
1889
+ # @param check_sorted [Boolean]
1890
+ # When the `by` argument is given, polars can not check sortedness
1891
+ # by the metadata and has to do a full scan on the index column to
1892
+ # verify data is sorted. This is expensive. If you are sure the
1893
+ # data within the by groups is sorted, you can set this to `false`.
1894
+ # Doing so incorrectly will lead to incorrect output
1902
1895
  #
1903
1896
  # @return [RollingGroupBy]
1904
1897
  #
@@ -1912,9 +1905,9 @@ module Polars
1912
1905
  # "2020-01-08 23:16:43"
1913
1906
  # ]
1914
1907
  # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1915
- # Polars.col("dt").str.strptime(Polars::Datetime)
1908
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1916
1909
  # )
1917
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1910
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
1918
1911
  # [
1919
1912
  # Polars.sum("a").alias("sum_a"),
1920
1913
  # Polars.min("a").alias("min_a"),
@@ -1935,20 +1928,22 @@ module Polars
1935
1928
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1936
1929
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1937
1930
  # # └─────────────────────┴───────┴───────┴───────┘
1938
- def groupby_rolling(
1931
+ def group_by_rolling(
1939
1932
  index_column:,
1940
1933
  period:,
1941
1934
  offset: nil,
1942
1935
  closed: "right",
1943
- by: nil
1936
+ by: nil,
1937
+ check_sorted: true
1944
1938
  )
1945
- RollingGroupBy.new(self, index_column, period, offset, closed, by)
1939
+ RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
1946
1940
  end
1941
+ alias_method :groupby_rolling, :group_by_rolling
1947
1942
 
1948
1943
  # Group based on a time value (or index value of type `:i32`, `:i64`).
1949
1944
  #
1950
1945
  # Time windows are calculated and rows are assigned to windows. Different from a
1951
- # normal groupby is that a row can be member of multiple groups. The time/index
1946
+ # normal group by is that a row can be member of multiple groups. The time/index
1952
1947
  # window could be seen as a rolling window, with a window size determined by
1953
1948
  # dates/times/values instead of slots in the DataFrame.
1954
1949
  #
@@ -1976,7 +1971,7 @@ module Polars
1976
1971
  # Or combine them:
1977
1972
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1978
1973
  #
1979
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
1974
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
1980
1975
  #
1981
1976
  # - "1i" # length 1
1982
1977
  # - "10i" # length 10
@@ -1987,7 +1982,7 @@ module Polars
1987
1982
  # This column must be sorted in ascending order. If not the output will not
1988
1983
  # make sense.
1989
1984
  #
1990
- # In case of a dynamic groupby on indices, dtype needs to be one of
1985
+ # In case of a dynamic group by on indices, dtype needs to be one of
1991
1986
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1992
1987
  # performance matters use an `:i64` column.
1993
1988
  # @param every
@@ -2038,7 +2033,7 @@ module Polars
2038
2033
  # # └─────────────────────┴─────┘
2039
2034
  #
2040
2035
  # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
2041
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
2036
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
2042
2037
  # [
2043
2038
  # Polars.col("time").min.alias("time_min"),
2044
2039
  # Polars.col("time").max.alias("time_max")
@@ -2058,7 +2053,7 @@ module Polars
2058
2053
  # # └─────────────────────┴─────────────────────┴─────────────────────┘
2059
2054
  #
2060
2055
  # @example The window boundaries can also be added to the aggregation result.
2061
- # df.groupby_dynamic(
2056
+ # df.group_by_dynamic(
2062
2057
  # "time", every: "1h", include_boundaries: true, closed: "right"
2063
2058
  # ).agg([Polars.col("time").count.alias("time_count")])
2064
2059
  # # =>
@@ -2075,27 +2070,27 @@ module Polars
2075
2070
  # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
2076
2071
  #
2077
2072
  # @example When closed="left", should not include right end of interval.
2078
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
2073
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
2079
2074
  # [
2080
2075
  # Polars.col("time").count.alias("time_count"),
2081
- # Polars.col("time").list.alias("time_agg_list")
2076
+ # Polars.col("time").alias("time_agg_list")
2082
2077
  # ]
2083
2078
  # )
2084
2079
  # # =>
2085
2080
  # # shape: (4, 3)
2086
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
2087
- # # │ time ┆ time_count ┆ time_agg_list
2088
- # # │ --- ┆ --- ┆ ---
2089
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
2090
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
2091
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16...
2092
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16...
2093
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16...
2094
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
2095
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
2081
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2082
+ # # │ time ┆ time_count ┆ time_agg_list
2083
+ # # │ --- ┆ --- ┆ ---
2084
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
2085
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2086
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
2087
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
2088
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
2089
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
2090
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
2096
2091
  #
2097
2092
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2098
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
2093
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
2099
2094
  # [Polars.col("time").count.alias("time_count")]
2100
2095
  # )
2101
2096
  # # =>
@@ -2112,7 +2107,7 @@ module Polars
2112
2107
  # # │ 2021-12-16 03:00:00 ┆ 1 │
2113
2108
  # # └─────────────────────┴────────────┘
2114
2109
  #
2115
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
2110
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
2116
2111
  # df = Polars::DataFrame.new(
2117
2112
  # {
2118
2113
  # "time" => Polars.date_range(
@@ -2123,7 +2118,7 @@ module Polars
2123
2118
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
2124
2119
  # }
2125
2120
  # )
2126
- # df.groupby_dynamic(
2121
+ # df.group_by_dynamic(
2127
2122
  # "time",
2128
2123
  # every: "1h",
2129
2124
  # closed: "both",
@@ -2146,20 +2141,20 @@ module Polars
2146
2141
  # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
2147
2142
  # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
2148
2143
  #
2149
- # @example Dynamic groupby on an index column.
2144
+ # @example Dynamic group by on an index column.
2150
2145
  # df = Polars::DataFrame.new(
2151
2146
  # {
2152
2147
  # "idx" => Polars.arange(0, 6, eager: true),
2153
2148
  # "A" => ["A", "A", "B", "B", "B", "C"]
2154
2149
  # }
2155
2150
  # )
2156
- # df.groupby_dynamic(
2151
+ # df.group_by_dynamic(
2157
2152
  # "idx",
2158
2153
  # every: "2i",
2159
2154
  # period: "3i",
2160
2155
  # include_boundaries: true,
2161
2156
  # closed: "right"
2162
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
2157
+ # ).agg(Polars.col("A").alias("A_agg_list"))
2163
2158
  # # =>
2164
2159
  # # shape: (3, 4)
2165
2160
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2171,7 +2166,7 @@ module Polars
2171
2166
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2172
2167
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
2173
2168
  # # └─────────────────┴─────────────────┴─────┴─────────────────┘
2174
- def groupby_dynamic(
2169
+ def group_by_dynamic(
2175
2170
  index_column,
2176
2171
  every:,
2177
2172
  period: nil,
@@ -2195,6 +2190,7 @@ module Polars
2195
2190
  start_by
2196
2191
  )
2197
2192
  end
2193
+ alias_method :groupby_dynamic, :group_by_dynamic
2198
2194
 
2199
2195
  # Upsample a DataFrame at a regular frequency.
2200
2196
  #
@@ -2242,7 +2238,7 @@ module Polars
2242
2238
  # "groups" => ["A", "B", "A", "B"],
2243
2239
  # "values" => [0, 1, 2, 3]
2244
2240
  # }
2245
- # )
2241
+ # ).set_sorted("time")
2246
2242
  # df.upsample(
2247
2243
  # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2248
2244
  # ).select(Polars.all.forward_fill)
@@ -2360,7 +2356,7 @@ module Polars
2360
2356
  # ], # note record date: Jan 1st (sorted!)
2361
2357
  # "gdp" => [4164, 4411, 4566, 4696]
2362
2358
  # }
2363
- # )
2359
+ # ).set_sorted("date")
2364
2360
  # population = Polars::DataFrame.new(
2365
2361
  # {
2366
2362
  # "date" => [
@@ -2371,7 +2367,7 @@ module Polars
2371
2367
  # ], # note record date: May 12th (sorted!)
2372
2368
  # "population" => [82.19, 82.66, 83.12, 83.52]
2373
2369
  # }
2374
- # )
2370
+ # ).set_sorted("date")
2375
2371
  # population.join_asof(
2376
2372
  # gdp, left_on: "date", right_on: "date", strategy: "backward"
2377
2373
  # )
@@ -2674,7 +2670,7 @@ module Polars
2674
2670
  # # │ 3 ┆ 8 ┆ c ┆ 30 │
2675
2671
  # # └─────┴─────┴─────┴───────┘
2676
2672
  def hstack(columns, in_place: false)
2677
- if !columns.is_a?(Array)
2673
+ if !columns.is_a?(::Array)
2678
2674
  columns = columns.get_columns
2679
2675
  end
2680
2676
  if in_place
@@ -2804,7 +2800,7 @@ module Polars
2804
2800
  # # │ 3 ┆ 8.0 │
2805
2801
  # # └─────┴─────┘
2806
2802
  def drop(columns)
2807
- if columns.is_a?(Array)
2803
+ if columns.is_a?(::Array)
2808
2804
  df = clone
2809
2805
  columns.each do |n|
2810
2806
  df._df.drop_in_place(n)
@@ -3317,7 +3313,7 @@ module Polars
3317
3313
  n_fill = n_cols * n_rows - height
3318
3314
 
3319
3315
  if n_fill > 0
3320
- if !fill_values.is_a?(Array)
3316
+ if !fill_values.is_a?(::Array)
3321
3317
  fill_values = [fill_values] * df.width
3322
3318
  end
3323
3319
 
@@ -3426,36 +3422,38 @@ module Polars
3426
3422
  # # ╞═════╪═════╪═════╡
3427
3423
  # # │ C ┆ 2 ┆ l │
3428
3424
  # # └─────┴─────┴─────┘}
3429
- def partition_by(groups, maintain_order: true, as_dict: false)
3425
+ def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
3430
3426
  if groups.is_a?(String)
3431
3427
  groups = [groups]
3432
- elsif !groups.is_a?(Array)
3428
+ elsif !groups.is_a?(::Array)
3433
3429
  groups = Array(groups)
3434
3430
  end
3435
3431
 
3436
3432
  if as_dict
3437
3433
  out = {}
3438
3434
  if groups.length == 1
3439
- _df.partition_by(groups, maintain_order).each do |df|
3435
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3440
3436
  df = _from_rbdf(df)
3441
3437
  out[df[groups][0, 0]] = df
3442
3438
  end
3443
3439
  else
3444
- _df.partition_by(groups, maintain_order).each do |df|
3440
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3445
3441
  df = _from_rbdf(df)
3446
3442
  out[df[groups].row(0)] = df
3447
3443
  end
3448
3444
  end
3449
3445
  out
3450
3446
  else
3451
- _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
3447
+ _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
3452
3448
  end
3453
3449
  end
3454
3450
 
3455
3451
  # Shift values by the given period.
3456
3452
  #
3457
- # @param periods [Integer]
3453
+ # @param n [Integer]
3458
3454
  # Number of places to shift (may be negative).
3455
+ # @param fill_value [Object]
3456
+ # Fill the resulting null values with this value.
3459
3457
  #
3460
3458
  # @return [DataFrame]
3461
3459
  #
@@ -3493,8 +3491,8 @@ module Polars
3493
3491
  # # │ 3 ┆ 8 ┆ c │
3494
3492
  # # │ null ┆ null ┆ null │
3495
3493
  # # └──────┴──────┴──────┘
3496
- def shift(periods)
3497
- _from_rbdf(_df.shift(periods))
3494
+ def shift(n, fill_value: nil)
3495
+ lazy.shift(n, fill_value: fill_value).collect(_eager: true)
3498
3496
  end
3499
3497
 
3500
3498
  # Shift the values by a given period and fill the resulting null values.
@@ -3527,9 +3525,7 @@ module Polars
3527
3525
  # # │ 2 ┆ 7 ┆ b │
3528
3526
  # # └─────┴─────┴─────┘
3529
3527
  def shift_and_fill(periods, fill_value)
3530
- lazy
3531
- .shift_and_fill(periods, fill_value)
3532
- .collect(no_optimization: true, string_cache: false)
3528
+ shift(periods, fill_value: fill_value)
3533
3529
  end
3534
3530
 
3535
3531
  # Get a mask of all duplicated rows in this DataFrame.
@@ -3716,7 +3712,7 @@ module Polars
3716
3712
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3717
3713
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3718
3714
  def with_columns(exprs)
3719
- if !exprs.nil? && !exprs.is_a?(Array)
3715
+ if !exprs.nil? && !exprs.is_a?(::Array)
3720
3716
  exprs = [exprs]
3721
3717
  end
3722
3718
  lazy
@@ -3780,7 +3776,7 @@ module Polars
3780
3776
  if axis == 0
3781
3777
  _from_rbdf(_df.max)
3782
3778
  elsif axis == 1
3783
- Utils.wrap_s(_df.hmax)
3779
+ Utils.wrap_s(_df.max_horizontal)
3784
3780
  else
3785
3781
  raise ArgumentError, "Axis should be 0 or 1."
3786
3782
  end
@@ -3812,7 +3808,7 @@ module Polars
3812
3808
  if axis == 0
3813
3809
  _from_rbdf(_df.min)
3814
3810
  elsif axis == 1
3815
- Utils.wrap_s(_df.hmin)
3811
+ Utils.wrap_s(_df.min_horizontal)
3816
3812
  else
3817
3813
  raise ArgumentError, "Axis should be 0 or 1."
3818
3814
  end
@@ -3861,7 +3857,7 @@ module Polars
3861
3857
  when 0
3862
3858
  _from_rbdf(_df.sum)
3863
3859
  when 1
3864
- Utils.wrap_s(_df.hsum(null_strategy))
3860
+ Utils.wrap_s(_df.sum_horizontal(null_strategy))
3865
3861
  else
3866
3862
  raise ArgumentError, "Axis should be 0 or 1."
3867
3863
  end
@@ -3899,7 +3895,7 @@ module Polars
3899
3895
  when 0
3900
3896
  _from_rbdf(_df.mean)
3901
3897
  when 1
3902
- Utils.wrap_s(_df.hmean(null_strategy))
3898
+ Utils.wrap_s(_df.mean_horizontal(null_strategy))
3903
3899
  else
3904
3900
  raise ArgumentError, "Axis should be 0 or 1."
3905
3901
  end
@@ -4097,11 +4093,11 @@ module Polars
4097
4093
  # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
4098
4094
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4099
4095
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
4100
- def to_dummies(columns: nil, separator: "_")
4096
+ def to_dummies(columns: nil, separator: "_", drop_first: false)
4101
4097
  if columns.is_a?(String)
4102
4098
  columns = [columns]
4103
4099
  end
4104
- _from_rbdf(_df.to_dummies(columns, separator))
4100
+ _from_rbdf(_df.to_dummies(columns, separator, drop_first))
4105
4101
  end
4106
4102
 
4107
4103
  # Drop duplicate rows from this DataFrame.
@@ -4189,7 +4185,7 @@ module Polars
4189
4185
  subset = [subset]
4190
4186
  end
4191
4187
 
4192
- if subset.is_a?(Array) && subset.length == 1
4188
+ if subset.is_a?(::Array) && subset.length == 1
4193
4189
  expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
4194
4190
  else
4195
4191
  struct_fields = subset.nil? ? Polars.all : subset
@@ -4284,15 +4280,20 @@ module Polars
4284
4280
  end
4285
4281
 
4286
4282
  if n.nil? && !frac.nil?
4283
+ frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
4284
+
4287
4285
  _from_rbdf(
4288
- _df.sample_frac(frac, with_replacement, shuffle, seed)
4286
+ _df.sample_frac(frac._s, with_replacement, shuffle, seed)
4289
4287
  )
4290
4288
  end
4291
4289
 
4292
4290
  if n.nil?
4293
4291
  n = 1
4294
4292
  end
4295
- _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
4293
+
4294
+ n = Series.new("", [n]) unless n.is_a?(Series)
4295
+
4296
+ _from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
4296
4297
  end
4297
4298
 
4298
4299
  # Apply a horizontal reduction on a DataFrame.
@@ -4591,7 +4592,7 @@ module Polars
4591
4592
  #
4592
4593
  # @example
4593
4594
  # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
4594
- # s.take_every(2)
4595
+ # s.gather_every(2)
4595
4596
  # # =>
4596
4597
  # # shape: (2, 2)
4597
4598
  # # ┌─────┬─────┐
@@ -4602,9 +4603,10 @@ module Polars
4602
4603
  # # │ 1 ┆ 5 │
4603
4604
  # # │ 3 ┆ 7 │
4604
4605
  # # └─────┴─────┘
4605
- def take_every(n)
4606
- select(Utils.col("*").take_every(n))
4606
+ def gather_every(n)
4607
+ select(Utils.col("*").gather_every(n))
4607
4608
  end
4609
+ alias_method :take_every, :gather_every
4608
4610
 
4609
4611
  # Hash and combine the rows in this DataFrame.
4610
4612
  #
@@ -4661,16 +4663,16 @@ module Polars
4661
4663
  # df.interpolate
4662
4664
  # # =>
4663
4665
  # # shape: (4, 3)
4664
- # # ┌─────┬──────┬─────┐
4665
- # # │ foo ┆ bar ┆ baz
4666
- # # │ --- ┆ --- ┆ ---
4667
- # # │ i64 i64i64
4668
- # # ╞═════╪══════╪═════╡
4669
- # # │ 1 ┆ 6 ┆ 1
4670
- # # │ 5 ┆ 7 ┆ 3
4671
- # # │ 9 ┆ 9 ┆ 6
4672
- # # │ 10 ┆ null ┆ 9
4673
- # # └─────┴──────┴─────┘
4666
+ # # ┌──────┬──────┬──────────┐
4667
+ # # │ foo ┆ bar ┆ baz
4668
+ # # │ --- ┆ --- ┆ ---
4669
+ # # │ f64 f64f64
4670
+ # # ╞══════╪══════╪══════════╡
4671
+ # # │ 1.0 ┆ 6.0 ┆ 1.0
4672
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667
4673
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333
4674
+ # # │ 10.0 ┆ null ┆ 9.0
4675
+ # # └──────┴──────┴──────────┘
4674
4676
  def interpolate
4675
4677
  select(Utils.col("*").interpolate)
4676
4678
  end
@@ -4758,6 +4760,38 @@ module Polars
4758
4760
  _from_rbdf(_df.unnest(names))
4759
4761
  end
4760
4762
 
4763
+ # TODO
4764
+ # def corr
4765
+ # end
4766
+
4767
+ # TODO
4768
+ # def merge_sorted
4769
+ # end
4770
+
4771
+ # Indicate that one or multiple columns are sorted.
4772
+ #
4773
+ # @param column [Object]
4774
+ # Columns that are sorted
4775
+ # @param more_columns [Object]
4776
+ # Additional columns that are sorted, specified as positional arguments.
4777
+ # @param descending [Boolean]
4778
+ # Whether the columns are sorted in descending order.
4779
+ #
4780
+ # @return [DataFrame]
4781
+ def set_sorted(
4782
+ column,
4783
+ *more_columns,
4784
+ descending: false
4785
+ )
4786
+ lazy
4787
+ .set_sorted(column, *more_columns, descending: descending)
4788
+ .collect(no_optimization: true)
4789
+ end
4790
+
4791
+ # TODO
4792
+ # def update
4793
+ # end
4794
+
4761
4795
  private
4762
4796
 
4763
4797
  def initialize_copy(other)
@@ -4910,8 +4944,8 @@ module Polars
4910
4944
  [lookup[col[0]] || col[0], col[1]]
4911
4945
  end
4912
4946
 
4913
- if schema_overrides
4914
- raise Todo
4947
+ if schema_overrides && schema_overrides.any?
4948
+ column_dtypes.merge!(schema_overrides)
4915
4949
  end
4916
4950
 
4917
4951
  column_dtypes.each do |col, dtype|
@@ -4967,7 +5001,7 @@ module Polars
4967
5001
  columns.each do |col, i|
4968
5002
  if dtypes[col] == Categorical # != rbdf_dtypes[i]
4969
5003
  column_casts << Polars.col(col).cast(Categorical)._rbexpr
4970
- elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
5004
+ elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
4971
5005
  column_casts << Polars.col(col).cast(structs[col])._rbexpr
4972
5006
  elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
4973
5007
  column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -5012,15 +5046,56 @@ module Polars
5012
5046
  rbdf = _post_apply_columns(rbdf, column_names)
5013
5047
  end
5014
5048
  return rbdf
5015
- elsif data[0].is_a?(Array)
5049
+ elsif data[0].is_a?(::Array)
5016
5050
  if orient.nil? && !columns.nil?
5017
- orient = columns.length == data.length ? "col" : "row"
5051
+ first_element = data[0]
5052
+ row_types = first_element.filter_map { |value| value.class }.uniq
5053
+ if row_types.include?(Integer) && row_types.include?(Float)
5054
+ row_types.delete(Integer)
5055
+ end
5056
+ orient = row_types.length == 1 ? "col" : "row"
5018
5057
  end
5019
5058
 
5020
5059
  if orient == "row"
5021
- raise Todo
5060
+ column_names, schema_overrides = _unpack_schema(
5061
+ schema, schema_overrides: schema_overrides, n_expected: first_element.length
5062
+ )
5063
+ local_schema_override = (
5064
+ schema_overrides.any? ? (raise Todo) : {}
5065
+ )
5066
+ if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
5067
+ raise ArgumentError, "the row data does not match the number of columns"
5068
+ end
5069
+
5070
+ unpack_nested = false
5071
+ local_schema_override.each do |col, tp|
5072
+ raise Todo
5073
+ end
5074
+
5075
+ if unpack_nested
5076
+ raise Todo
5077
+ else
5078
+ rbdf = RbDataFrame.read_rows(
5079
+ data,
5080
+ infer_schema_length,
5081
+ local_schema_override.any? ? local_schema_override : nil
5082
+ )
5083
+ end
5084
+ if column_names.any? || schema_overrides.any?
5085
+ rbdf = _post_apply_columns(
5086
+ rbdf, column_names, schema_overrides: schema_overrides
5087
+ )
5088
+ end
5089
+ return rbdf
5022
5090
  elsif orient == "col" || orient.nil?
5023
- raise Todo
5091
+ column_names, schema_overrides = _unpack_schema(
5092
+ schema, schema_overrides: schema_overrides, n_expected: data.length
5093
+ )
5094
+ data_series =
5095
+ data.map.with_index do |element, i|
5096
+ Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
5097
+ end
5098
+ return RbDataFrame.new(data_series)
5024
5099
  else
5025
5100
  raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
5026
5101
  end
@@ -5066,10 +5141,10 @@ module Polars
5066
5141
 
5067
5142
  def _compare_to_other_df(other, op)
5068
5143
  if columns != other.columns
5069
- raise ArgmentError, "DataFrame columns do not match"
5144
+ raise ArgumentError, "DataFrame columns do not match"
5070
5145
  end
5071
5146
  if shape != other.shape
5072
- raise ArgmentError, "DataFrame dimensions do not match"
5147
+ raise ArgumentError, "DataFrame dimensions do not match"
5073
5148
  end
5074
5149
 
5075
5150
  suffix = "__POLARS_CMP_OTHER"
@@ -5117,7 +5192,7 @@ module Polars
5117
5192
 
5118
5193
  def _prepare_other_arg(other)
5119
5194
  if !other.is_a?(Series)
5120
- if other.is_a?(Array)
5195
+ if other.is_a?(::Array)
5121
5196
  raise ArgumentError, "Operation not supported."
5122
5197
  end
5123
5198