polars-df 0.5.0-aarch64-linux → 0.7.0-aarch64-linux

This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -20,15 +20,9 @@ module Polars
     # this does not yield conclusive results, column orientation is used.
     def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
       schema ||= columns
-      raise Todo if schema_overrides
 
-      # TODO deprecate in favor of read_sql
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
-        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
-        data = {}
-        result.columns.each_with_index do |k, i|
-          data[k] = result.rows.map { |r| r[i] }
-        end
+        raise ArgumentError, "Use read_database instead"
       end
 
       if data.nil?
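
Constructing a DataFrame directly from an ActiveRecord relation or result now raises and points at the reader API instead. A minimal sketch of the replacement call, assuming `Polars.read_database` accepts a relation or a SQL string as the error message implies:

    # previously: Polars::DataFrame.new(User.all)
    df = Polars.read_database(User.all)
    df = Polars.read_database("SELECT * FROM users")
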
@@ -36,7 +30,7 @@ module Polars
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
         self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
-      elsif data.is_a?(Array)
+      elsif data.is_a?(::Array)
         self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
         self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
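
A change repeated throughout this file is `is_a?(Array)` becoming `is_a?(::Array)`. A plausible motive (an assumption, not stated in the diff) is that 0.7 introduces an `Array` dtype class under the `Polars` namespace, so an unqualified constant inside `module Polars` would resolve to the dtype rather than Ruby's core class:

    # hypothetical illustration of the constant-lookup hazard
    data.is_a?(Array)    # inside module Polars, may hit a Polars::Array dtype
    data.is_a?(::Array)  # always Ruby's core Array
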
@@ -116,7 +110,7 @@ module Polars
         dtypes.each do |k, v|
           dtype_list << [k, Utils.rb_type_to_dtype(v)]
         end
-      elsif dtypes.is_a?(Array)
+      elsif dtypes.is_a?(::Array)
         dtype_slice = dtypes
       else
         raise ArgumentError, "dtype arg should be list or dict"
@@ -590,7 +584,7 @@ module Polars
 
       # df[2, ..] (select row as df)
       if row_selection.is_a?(Integer)
-        if col_selection.is_a?(Array)
+        if col_selection.is_a?(::Array)
           df = self[0.., col_selection]
           return df.slice(row_selection, 1)
         end
@@ -611,7 +605,7 @@ module Polars
           return series[row_selection]
         end
 
-        if col_selection.is_a?(Array)
+        if col_selection.is_a?(::Array)
           # df[.., [1, 2]]
           if Utils.is_int_sequence(col_selection)
             series_list = col_selection.map { |i| to_series(i) }
@@ -641,7 +635,7 @@ module Polars
         return Slice.new(self).apply(item)
       end
 
-      if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
+      if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
         # select multiple columns
         # df[["foo", "bar"]]
         return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -684,13 +678,13 @@ module Polars
       end
 
       if Utils.strlike?(key)
-        if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
+        if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
          value = Series.new(value)
         elsif !value.is_a?(Series)
           value = Polars.lit(value)
         end
         self._df = with_column(value.alias(key.to_s))._df
-      elsif key.is_a?(Array)
+      elsif key.is_a?(::Array)
         row_selection, col_selection = key
 
         if Utils.strlike?(col_selection)
@@ -905,6 +899,7 @@ module Polars
     def write_csv(
       file = nil,
       has_header: true,
+      include_header: nil,
       sep: ",",
       quote: '"',
       batch_size: 1024,
@@ -914,6 +909,8 @@ module Polars
       float_precision: nil,
       null_value: nil
     )
+      include_header = has_header if include_header.nil?
+
       if sep.length > 1
         raise ArgumentError, "only single byte separator is allowed"
       elsif quote.length > 1
@@ -927,7 +924,7 @@ module Polars
         buffer.set_encoding(Encoding::BINARY)
         _df.write_csv(
           buffer,
-          has_header,
+          include_header,
           sep.ord,
           quote.ord,
           batch_size,
@@ -946,7 +943,7 @@ module Polars
 
       _df.write_csv(
         file,
-        has_header,
+        include_header,
         sep.ord,
         quote.ord,
         batch_size,
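
`write_csv` gains an `include_header:` keyword that takes precedence over the legacy `has_header:`; when it is nil, the old flag's value is used, so existing calls behave the same. Sketch:

    df = Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]})
    df.write_csv("out.csv", include_header: false)  # new keyword
    df.write_csv("out.csv", has_header: false)      # legacy keyword, still honored
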
@@ -994,14 +991,21 @@ module Polars
     #
     # @return [nil]
     def write_ipc(file, compression: "uncompressed")
-      if compression.nil?
-        compression = "uncompressed"
+      return_bytes = file.nil?
+      if return_bytes
+        file = StringIO.new
+        file.set_encoding(Encoding::BINARY)
       end
       if Utils.pathlike?(file)
         file = Utils.normalise_filepath(file)
       end
 
+      if compression.nil?
+        compression = "uncompressed"
+      end
+
       _df.write_ipc(file, compression)
+      return_bytes ? file.string : nil
     end
 
     # Write to Apache Parquet file.
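
When `file` is nil, `write_ipc` now buffers into a binary StringIO and returns its contents instead of nil (the `@return [nil]` tag above was evidently not updated). Sketch:

    bytes = df.write_ipc(nil)            # IPC stream as a binary String
    File.binwrite("frame.arrow", bytes)  # or ship it elsewhere
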
@@ -1144,22 +1148,8 @@ module Polars
     #   # │ b   ┆ 1   ┆ 2   ┆ 3   │
     #   # └─────┴─────┴─────┴─────┘
     def transpose(include_header: false, header_name: "column", column_names: nil)
-      df = _from_rbdf(_df.transpose(include_header, header_name))
-      if !column_names.nil?
-        names = []
-        n = df.width
-        if include_header
-          names << header_name
-          n -= 1
-        end
-
-        column_names = column_names.each
-        n.times do
-          names << column_names.next
-        end
-        df.columns = names
-      end
-      df
+      keep_names_as = include_header ? header_name : nil
+      _from_rbdf(_df.transpose(keep_names_as, column_names))
     end
 
     # Reverse the DataFrame.
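
Header handling and column renaming now happen inside the native `transpose(keep_names_as, column_names)` call rather than in a Ruby-side loop. The public signature is unchanged; a quick sketch:

    df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
    df.transpose(include_header: true, column_names: ["x", "y"])
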
@@ -1491,13 +1481,9 @@ module Polars
     #   # │ 1   ┆ 6.0 ┆ a   │
     #   # └─────┴─────┴─────┘
     def sort(by, reverse: false, nulls_last: false)
-      if by.is_a?(Array) || by.is_a?(Expr)
-        lazy
-          .sort(by, reverse: reverse, nulls_last: nulls_last)
-          .collect(no_optimization: true, string_cache: false)
-      else
-        _from_rbdf(_df.sort(by, reverse, nulls_last))
-      end
+      lazy
+        .sort(by, reverse: reverse, nulls_last: nulls_last)
+        .collect(no_optimization: true)
     end
 
     # Sort the DataFrame by column in-place.
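
`sort` drops the eager fast path for plain column names; strings, expressions, and arrays of either now all take the same lazy route:

    df.sort("a")
    df.sort([Polars.col("a"), Polars.col("b")], nulls_last: true)
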
@@ -1808,13 +1794,13 @@ module Polars
       _from_rbdf(_df.with_row_count(name, offset))
     end
 
-    # Start a groupby operation.
+    # Start a group by operation.
     #
     # @param by [Object]
     #   Column(s) to group by.
     # @param maintain_order [Boolean]
     #   Make sure that the order of the groups remain consistent. This is more
-    #   expensive than a default groupby. Note that this only works in expression
+    #   expensive than a default group by. Note that this only works in expression
     #   aggregations.
     #
     # @return [GroupBy]
@@ -1827,7 +1813,7 @@ module Polars
     #       "c" => [6, 5, 4, 3, 2, 1]
     #     }
     #   )
-    #   df.groupby("a").agg(Polars.col("b").sum).sort("a")
+    #   df.group_by("a").agg(Polars.col("b").sum).sort("a")
     #   # =>
     #   # shape: (3, 2)
     #   # ┌─────┬─────┐
@@ -1839,25 +1825,26 @@ module Polars
     #   # │ b   ┆ 11  │
     #   # │ c   ┆ 6   │
     #   # └─────┴─────┘
-    def groupby(by, maintain_order: false)
+    def group_by(by, maintain_order: false)
       if !Utils.bool?(maintain_order)
-        raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
+        raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
       end
       GroupBy.new(
-        _df,
+        self,
         by,
-        self.class,
         maintain_order: maintain_order
       )
     end
+    alias_method :groupby, :group_by
+    alias_method :group, :group_by
 
     # Create rolling groups based on a time column.
     #
     # Also works for index values of type `:i32` or `:i64`.
     #
-    # Different from a `dynamic_groupby` the windows are now determined by the
+    # Different from a `dynamic_group_by` the windows are now determined by the
     # individual values and are not of constant intervals. For constant intervals use
-    # *groupby_dynamic*
+    # *group_by_dynamic*
     #
     # The `period` and `offset` arguments are created either from a timedelta, or
     # by using the following string language:
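
`groupby` becomes `group_by`, with `groupby` and `group` kept as aliases, and `GroupBy.new` now receives the DataFrame itself instead of the raw `_df` handle plus a class. Call sites need no change beyond the optional rename:

    df.group_by("a", maintain_order: true).agg(Polars.col("b").sum)
    df.groupby("a")  # still works via alias_method
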
@@ -1877,7 +1864,7 @@ module Polars
     # Or combine them:
     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
     #
-    # In case of a groupby_rolling on an integer column, the windows are defined by:
+    # In case of a group_by_rolling on an integer column, the windows are defined by:
     #
     # - **"1i" # length 1**
     # - **"10i" # length 10**
@@ -1888,7 +1875,7 @@ module Polars
     # This column must be sorted in ascending order. If not the output will not
     # make sense.
     #
-    # In case of a rolling groupby on indices, dtype needs to be one of
+    # In case of a rolling group by on indices, dtype needs to be one of
     # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
     # performance matters use an `:i64` column.
     # @param period [Object]
@@ -1899,6 +1886,12 @@ module Polars
     #   Define whether the temporal window interval is closed or not.
     # @param by [Object]
     #   Also group by this column/these columns.
+    # @param check_sorted [Boolean]
+    #   When the `by` argument is given, polars can not check sortedness
+    #   by the metadata and has to do a full scan on the index column to
+    #   verify data is sorted. This is expensive. If you are sure the
+    #   data within the by groups is sorted, you can set this to `false`.
+    #   Doing so incorrectly will lead to incorrect output
     #
     # @return [RollingGroupBy]
     #
@@ -1912,9 +1905,9 @@ module Polars
     #     "2020-01-08 23:16:43"
     #   ]
     #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
-    #     Polars.col("dt").str.strptime(Polars::Datetime)
+    #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
-    #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
+    #   df.group_by_rolling(index_column: "dt", period: "2d").agg(
     #     [
     #       Polars.sum("a").alias("sum_a"),
     #       Polars.min("a").alias("min_a"),
@@ -1935,20 +1928,22 @@ module Polars
     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
     #   # └─────────────────────┴───────┴───────┴───────┘
-    def groupby_rolling(
+    def group_by_rolling(
       index_column:,
       period:,
       offset: nil,
       closed: "right",
-      by: nil
+      by: nil,
+      check_sorted: true
     )
-      RollingGroupBy.new(self, index_column, period, offset, closed, by)
+      RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
     end
+    alias_method :groupby_rolling, :group_by_rolling
 
     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
     # Time windows are calculated and rows are assigned to windows. Different from a
-    # normal groupby is that a row can be member of multiple groups. The time/index
+    # normal group by is that a row can be member of multiple groups. The time/index
     # window could be seen as a rolling window, with a window size determined by
     # dates/times/values instead of slots in the DataFrame.
     #
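
`group_by_rolling` (previously `groupby_rolling`, kept as an alias) forwards the new `check_sorted:` flag to `RollingGroupBy`. Per the doc text above, `false` skips the full scan that verifies the index column is sorted within each `by` group; a sketch:

    df.group_by_rolling(index_column: "dt", period: "2d", by: "groups", check_sorted: false)
      .agg(Polars.sum("a").alias("sum_a"))
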
@@ -1976,7 +1971,7 @@ module Polars
     # Or combine them:
     # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
     #
-    # In case of a groupby_dynamic on an integer column, the windows are defined by:
+    # In case of a group_by_dynamic on an integer column, the windows are defined by:
     #
     # - "1i" # length 1
     # - "10i" # length 10
@@ -1987,7 +1982,7 @@ module Polars
     # This column must be sorted in ascending order. If not the output will not
     # make sense.
     #
-    # In case of a dynamic groupby on indices, dtype needs to be one of
+    # In case of a dynamic group by on indices, dtype needs to be one of
     # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
     # performance matters use an `:i64` column.
     # @param every
@@ -2038,7 +2033,7 @@ module Polars
     #   # └─────────────────────┴─────┘
     #
     # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
-    #   df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "right").agg(
     #     [
     #       Polars.col("time").min.alias("time_min"),
     #       Polars.col("time").max.alias("time_max")
@@ -2058,7 +2053,7 @@ module Polars
     #   # └─────────────────────┴─────────────────────┴─────────────────────┘
     #
     # @example The window boundaries can also be added to the aggregation result.
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "time", every: "1h", include_boundaries: true, closed: "right"
     #   ).agg([Polars.col("time").count.alias("time_count")])
     #   # =>
@@ -2075,27 +2070,27 @@ module Polars
     #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
     #
     # @example When closed="left", should not include right end of interval.
-    #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "left").agg(
     #     [
     #       Polars.col("time").count.alias("time_count"),
-    #       Polars.col("time").list.alias("time_agg_list")
+    #       Polars.col("time").alias("time_agg_list")
     #     ]
     #   )
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
-    #   # │ time                ┆ time_count ┆ time_agg_list                       │
-    #   # │ ---                 ┆ ---        ┆ ---                                 │
-    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                  │
-    #   # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
-    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16... │
-    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]               │
-    #   # └─────────────────────┴────────────┴─────────────────────────────────────┘
+    #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                     │
+    #   # │ ---                 ┆ ---        ┆ ---                               │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
+    #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
+    #   # └─────────────────────┴────────────┴───────────────────────────────────┘
     #
     # @example When closed="both" the time values at the window boundaries belong to 2 groups.
-    #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+    #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
     #     [Polars.col("time").count.alias("time_count")]
     #   )
     #   # =>
@@ -2112,7 +2107,7 @@ module Polars
     #   # │ 2021-12-16 03:00:00 ┆ 1          │
     #   # └─────────────────────┴────────────┘
     #
-    # @example Dynamic groupbys can also be combined with grouping on normal keys.
+    # @example Dynamic group bys can also be combined with grouping on normal keys.
     #   df = Polars::DataFrame.new(
     #     {
     #       "time" => Polars.date_range(
@@ -2123,7 +2118,7 @@ module Polars
     #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
     #     }
     #   )
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "time",
     #     every: "1h",
     #     closed: "both",
@@ -2146,20 +2141,20 @@ module Polars
     #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
     #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
     #
-    # @example Dynamic groupby on an index column.
+    # @example Dynamic group by on an index column.
     #   df = Polars::DataFrame.new(
     #     {
     #       "idx" => Polars.arange(0, 6, eager: true),
     #       "A" => ["A", "A", "B", "B", "B", "C"]
     #     }
     #   )
-    #   df.groupby_dynamic(
+    #   df.group_by_dynamic(
     #     "idx",
     #     every: "2i",
     #     period: "3i",
     #     include_boundaries: true,
     #     closed: "right"
-    #   ).agg(Polars.col("A").list.alias("A_agg_list"))
+    #   ).agg(Polars.col("A").alias("A_agg_list"))
     #   # =>
     #   # shape: (3, 4)
     #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2171,7 +2166,7 @@ module Polars
     #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
     #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
     #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
-    def groupby_dynamic(
+    def group_by_dynamic(
       index_column,
       every:,
       period: nil,
@@ -2195,6 +2190,7 @@ module Polars
         start_by
       )
     end
+    alias_method :groupby_dynamic, :group_by_dynamic
 
     # Upsample a DataFrame at a regular frequency.
     #
@@ -2242,7 +2238,7 @@ module Polars
     #       "groups" => ["A", "B", "A", "B"],
     #       "values" => [0, 1, 2, 3]
     #     }
-    #   )
+    #   ).set_sorted("time")
     #   df.upsample(
     #     time_column: "time", every: "1mo", by: "groups", maintain_order: true
     #   ).select(Polars.all.forward_fill)
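
Note the `.set_sorted("time")` added to this example (and to the `join_asof` examples below): the docs now flag the time column as sorted up front, presumably because the engine relies on the sorted flag for these operations. A runnable sketch under that assumption:

    require "date"
    df = Polars::DataFrame.new({
      "time" => [DateTime.new(2021, 2, 1), DateTime.new(2021, 5, 1)],
      "values" => [0, 3]
    }).set_sorted("time")
    df.upsample(time_column: "time", every: "1mo").select(Polars.all.forward_fill)
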
@@ -2360,7 +2356,7 @@ module Polars
     #       ], # note record date: Jan 1st (sorted!)
     #       "gdp" => [4164, 4411, 4566, 4696]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population = Polars::DataFrame.new(
     #     {
     #       "date" => [
@@ -2371,7 +2367,7 @@ module Polars
     #       ], # note record date: May 12th (sorted!)
     #       "population" => [82.19, 82.66, 83.12, 83.52]
     #     }
-    #   )
+    #   ).set_sorted("date")
     #   population.join_asof(
     #     gdp, left_on: "date", right_on: "date", strategy: "backward"
     #   )
@@ -2674,7 +2670,7 @@ module Polars
     #   # │ 3   ┆ 8   ┆ c   ┆ 30    │
     #   # └─────┴─────┴─────┴───────┘
     def hstack(columns, in_place: false)
-      if !columns.is_a?(Array)
+      if !columns.is_a?(::Array)
        columns = columns.get_columns
       end
       if in_place
@@ -2804,7 +2800,7 @@ module Polars
     #   # │ 3   ┆ 8.0 │
     #   # └─────┴─────┘
     def drop(columns)
-      if columns.is_a?(Array)
+      if columns.is_a?(::Array)
        df = clone
        columns.each do |n|
          df._df.drop_in_place(n)
@@ -3317,7 +3313,7 @@ module Polars
       n_fill = n_cols * n_rows - height
 
       if n_fill > 0
-        if !fill_values.is_a?(Array)
+        if !fill_values.is_a?(::Array)
          fill_values = [fill_values] * df.width
        end
 
@@ -3426,36 +3422,38 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ C   ┆ 2   ┆ l   │
     #   # └─────┴─────┴─────┘}
-    def partition_by(groups, maintain_order: true, as_dict: false)
+    def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
       if groups.is_a?(String)
         groups = [groups]
-      elsif !groups.is_a?(Array)
+      elsif !groups.is_a?(::Array)
         groups = Array(groups)
       end
 
       if as_dict
         out = {}
         if groups.length == 1
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
            df = _from_rbdf(df)
            out[df[groups][0, 0]] = df
          end
         else
-          _df.partition_by(groups, maintain_order).each do |df|
+          _df.partition_by(groups, maintain_order, include_key).each do |df|
            df = _from_rbdf(df)
            out[df[groups].row(0)] = df
          end
         end
         out
       else
-        _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+        _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
       end
     end
 
     # Shift values by the given period.
     #
-    # @param periods [Integer]
+    # @param n [Integer]
     #   Number of places to shift (may be negative).
+    # @param fill_value [Object]
+    #   Fill the resulting null values with this value.
     #
     # @return [DataFrame]
     #
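
`partition_by` passes the new `include_key:` flag straight to the native call; it defaults to true, so behavior is unchanged unless you opt out:

    df.partition_by("groups")                      # key column kept in each piece
    df.partition_by("groups", include_key: false)  # key column dropped
    df.partition_by("groups", as_dict: true)       # {"A" => df_a, "B" => df_b, ...}
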
@@ -3493,8 +3491,8 @@ module Polars
     #   # │ 3    ┆ 8    ┆ c    │
     #   # │ null ┆ null ┆ null │
     #   # └──────┴──────┴──────┘
-    def shift(periods)
-      _from_rbdf(_df.shift(periods))
+    def shift(n, fill_value: nil)
+      lazy.shift(n, fill_value: fill_value).collect(_eager: true)
     end
 
     # Shift the values by a given period and fill the resulting null values.
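
`shift` is rebuilt on the lazy engine and takes `fill_value:` directly (the `periods` parameter is renamed `n`); `shift_and_fill` below becomes a thin wrapper over it:

    df.shift(1)                  # introduces nulls at the top
    df.shift(-1, fill_value: 0)  # fills the vacated rows instead
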
@@ -3527,9 +3525,7 @@ module Polars
     #   # │ 2   ┆ 7   ┆ b   │
     #   # └─────┴─────┴─────┘
     def shift_and_fill(periods, fill_value)
-      lazy
-        .shift_and_fill(periods, fill_value)
-        .collect(no_optimization: true, string_cache: false)
+      shift(periods, fill_value: fill_value)
     end
 
     # Get a mask of all duplicated rows in this DataFrame.
@@ -3716,7 +3712,7 @@ module Polars
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
     def with_columns(exprs)
-      if !exprs.nil? && !exprs.is_a?(Array)
+      if !exprs.nil? && !exprs.is_a?(::Array)
        exprs = [exprs]
       end
       lazy
@@ -3780,7 +3776,7 @@ module Polars
       if axis == 0
         _from_rbdf(_df.max)
       elsif axis == 1
-        Utils.wrap_s(_df.hmax)
+        Utils.wrap_s(_df.max_horizontal)
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3812,7 +3808,7 @@ module Polars
       if axis == 0
         _from_rbdf(_df.min)
       elsif axis == 1
-        Utils.wrap_s(_df.hmin)
+        Utils.wrap_s(_df.min_horizontal)
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3861,7 +3857,7 @@ module Polars
       when 0
         _from_rbdf(_df.sum)
       when 1
-        Utils.wrap_s(_df.hsum(null_strategy))
+        Utils.wrap_s(_df.sum_horizontal(null_strategy))
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
@@ -3899,7 +3895,7 @@ module Polars
       when 0
         _from_rbdf(_df.mean)
       when 1
-        Utils.wrap_s(_df.hmean(null_strategy))
+        Utils.wrap_s(_df.mean_horizontal(null_strategy))
       else
         raise ArgumentError, "Axis should be 0 or 1."
       end
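
The native horizontal reductions were renamed (`hmax`/`hmin`/`hsum`/`hmean` to their `*_horizontal` counterparts); the Ruby-level axis API is unchanged:

    df = Polars::DataFrame.new({"a" => [1, 2], "b" => [10, 20]})
    df.sum(axis: 1)                            # row-wise sums as a Series
    df.mean(axis: 1, null_strategy: "ignore")  # row-wise means
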
@@ -4097,11 +4093,11 @@ module Polars
     #   # │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     │
     #   # │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     │
     #   # └───────┴───────┴───────┴───────┴───────┴───────┘
-    def to_dummies(columns: nil, separator: "_")
+    def to_dummies(columns: nil, separator: "_", drop_first: false)
       if columns.is_a?(String)
         columns = [columns]
       end
-      _from_rbdf(_df.to_dummies(columns, separator))
+      _from_rbdf(_df.to_dummies(columns, separator, drop_first))
     end
 
     # Drop duplicate rows from this DataFrame.
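
`to_dummies` exposes the native `drop_first` option for one-hot encoding without the redundant first level:

    df = Polars::DataFrame.new({"color" => ["red", "green", "red"]})
    df.to_dummies                    # one indicator column per category
    df.to_dummies(drop_first: true)  # first category level omitted per column
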
@@ -4189,7 +4185,7 @@ module Polars
         subset = [subset]
       end
 
-      if subset.is_a?(Array) && subset.length == 1
+      if subset.is_a?(::Array) && subset.length == 1
         expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
       else
         struct_fields = subset.nil? ? Polars.all : subset
@@ -4284,15 +4280,20 @@ module Polars
       end
 
       if n.nil? && !frac.nil?
+        frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
+
         _from_rbdf(
-          _df.sample_frac(frac, with_replacement, shuffle, seed)
+          _df.sample_frac(frac._s, with_replacement, shuffle, seed)
         )
       end
 
       if n.nil?
         n = 1
       end
-      _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
+
+      n = Series.new("", [n]) unless n.is_a?(Series)
+
+      _from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
     end
 
     # Apply a horizontal reduction on a DataFrame.
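
The native `sample_n`/`sample_frac` now expect a Series argument, so the wrapper boxes scalar `n` and `frac` values before the call; the public interface is untouched:

    df.sample(n: 2, seed: 42)
    df.sample(frac: 0.5, with_replacement: true, seed: 42)
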
@@ -4591,7 +4592,7 @@ module Polars
     #
     # @example
     #   s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
-    #   s.take_every(2)
+    #   s.gather_every(2)
     #   # =>
     #   # shape: (2, 2)
     #   # ┌─────┬─────┐
@@ -4602,9 +4603,10 @@ module Polars
     #   # │ 1   ┆ 5   │
     #   # │ 3   ┆ 7   │
     #   # └─────┴─────┘
-    def take_every(n)
-      select(Utils.col("*").take_every(n))
+    def gather_every(n)
+      select(Utils.col("*").gather_every(n))
     end
+    alias_method :take_every, :gather_every
 
     # Hash and combine the rows in this DataFrame.
     #
@@ -4661,16 +4663,16 @@ module Polars
     #   df.interpolate
     #   # =>
     #   # shape: (4, 3)
-    #   # ┌─────┬──────┬─────┐
-    #   # │ foo ┆ bar  ┆ baz │
-    #   # │ --- ┆ ---  ┆ --- │
-    #   # │ i64 ┆ i64  ┆ i64 │
-    #   # ╞═════╪══════╪═════╡
-    #   # │ 1   ┆ 6    ┆ 1   │
-    #   # │ 5   ┆ 7    ┆ 3   │
-    #   # │ 9   ┆ 9    ┆ 6   │
-    #   # │ 10  ┆ null ┆ 9   │
-    #   # └─────┴──────┴─────┘
+    #   # ┌──────┬──────┬──────────┐
+    #   # │ foo  ┆ bar  ┆ baz      │
+    #   # │ ---  ┆ ---  ┆ ---      │
+    #   # │ f64  ┆ f64  ┆ f64      │
+    #   # ╞══════╪══════╪══════════╡
+    #   # │ 1.0  ┆ 6.0  ┆ 1.0      │
+    #   # │ 5.0  ┆ 7.0  ┆ 3.666667 │
+    #   # │ 9.0  ┆ 9.0  ┆ 6.333333 │
+    #   # │ 10.0 ┆ null ┆ 9.0      │
+    #   # └──────┴──────┴──────────┘
     def interpolate
       select(Utils.col("*").interpolate)
     end
@@ -4758,6 +4760,38 @@ module Polars
       _from_rbdf(_df.unnest(names))
     end
 
+    # TODO
+    # def corr
+    # end
+
+    # TODO
+    # def merge_sorted
+    # end
+
+    # Indicate that one or multiple columns are sorted.
+    #
+    # @param column [Object]
+    #   Columns that are sorted
+    # @param more_columns [Object]
+    #   Additional columns that are sorted, specified as positional arguments.
+    # @param descending [Boolean]
+    #   Whether the columns are sorted in descending order.
+    #
+    # @return [DataFrame]
+    def set_sorted(
+      column,
+      *more_columns,
+      descending: false
+    )
+      lazy
+        .set_sorted(column, *more_columns, descending: descending)
+        .collect(no_optimization: true)
+    end
+
+    # TODO
+    # def update
+    # end
+
     private
 
     def initialize_copy(other)
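
The new `set_sorted` only sets the sorted flag (through a no-optimization lazy round trip); it does not sort or validate, so flagging an unsorted column silently produces wrong results downstream:

    require "date"
    df = Polars::DataFrame.new({
      "date" => [Date.new(2016, 1, 1), Date.new(2017, 1, 1)],
      "gdp" => [4164, 4411]
    }).set_sorted("date")
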
@@ -4910,8 +4944,8 @@ module Polars
         [lookup[col[0]] || col[0], col[1]]
       end
 
-      if schema_overrides
-        raise Todo
+      if schema_overrides && schema_overrides.any?
+        column_dtypes.merge!(schema_overrides)
       end
 
       column_dtypes.each do |col, dtype|
@@ -4967,7 +5001,7 @@ module Polars
       columns.each do |col, i|
         if dtypes[col] == Categorical # != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(Categorical)._rbexpr
-        elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
+        elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(structs[col])._rbexpr
         elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
           column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -5012,15 +5046,56 @@ module Polars
          rbdf = _post_apply_columns(rbdf, column_names)
        end
        return rbdf
-      elsif data[0].is_a?(Array)
+      elsif data[0].is_a?(::Array)
        if orient.nil? && !columns.nil?
-          orient = columns.length == data.length ? "col" : "row"
+          first_element = data[0]
+          row_types = first_element.filter_map { |value| value.class }.uniq
+          if row_types.include?(Integer) && row_types.include?(Float)
+            row_types.delete(Integer)
+          end
+          orient = row_types.length == 1 ? "col" : "row"
        end
 
        if orient == "row"
-          raise Todo
+          column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides: schema_overrides, n_expected: first_element.length
+          )
+          local_schema_override = (
+            schema_overrides.any? ? (raise Todo) : {}
+          )
+          if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
+            raise ArgumentError, "the row data does not match the number of columns"
+          end
+
+          unpack_nested = false
+          local_schema_override.each do |col, tp|
+            raise Todo
+          end
+
+          if unpack_nested
+            raise Todo
+          else
+            rbdf = RbDataFrame.read_rows(
+              data,
+              infer_schema_length,
+              local_schema_override.any? ? local_schema_override : nil
+            )
+          end
+          if column_names.any? || schema_overrides.any?
+            rbdf = _post_apply_columns(
+              rbdf, column_names, schema_overrides: schema_overrides
+            )
+          end
+          return rbdf
        elsif orient == "col" || orient.nil?
-          raise Todo
+          column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides: schema_overrides, n_expected: data.length
+          )
+          data_series =
+            data.map.with_index do |element, i|
+              Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+            end
+          return RbDataFrame.new(data_series)
        else
          raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
        end
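
Orientation inference for array-of-arrays input no longer compares outer lengths; it inspects the value types inside the first row, folding Integer into Float so mixed numerics count as one type. One distinct type means "col", several mean "row", and an explicit `orient:` always wins:

    Polars::DataFrame.new([[1, "a"], [2, "b"]], schema: ["x", "y"])   # mixed types => rows
    Polars::DataFrame.new([[1, 2], [3.5, 4.5]], schema: ["x", "y"])   # uniform => columns
    Polars::DataFrame.new([[1, 2], [3, 4]], schema: ["x", "y"], orient: "row")
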
@@ -5066,10 +5141,10 @@ module Polars
 
     def _compare_to_other_df(other, op)
       if columns != other.columns
-        raise ArgmentError, "DataFrame columns do not match"
+        raise ArgumentError, "DataFrame columns do not match"
       end
       if shape != other.shape
-        raise ArgmentError, "DataFrame dimensions do not match"
+        raise ArgumentError, "DataFrame dimensions do not match"
       end
 
       suffix = "__POLARS_CMP_OTHER"
@@ -5117,7 +5192,7 @@ module Polars
 
     def _prepare_other_arg(other)
       if !other.is_a?(Series)
-        if other.is_a?(Array)
+        if other.is_a?(::Array)
           raise ArgumentError, "Operation not supported."
         end