polars-df 0.5.0-aarch64-linux → 0.7.0-aarch64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +3854 -4496
- data/README.md +11 -9
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +7 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -20,15 +20,9 @@ module Polars
|
|
20
20
|
# this does not yield conclusive results, column orientation is used.
|
21
21
|
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
22
22
|
schema ||= columns
|
23
|
-
raise Todo if schema_overrides
|
24
23
|
|
25
|
-
# TODO deprecate in favor of read_sql
|
26
24
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
27
|
-
|
28
|
-
data = {}
|
29
|
-
result.columns.each_with_index do |k, i|
|
30
|
-
data[k] = result.rows.map { |r| r[i] }
|
31
|
-
end
|
25
|
+
raise ArgumentError, "Use read_database instead"
|
32
26
|
end
|
33
27
|
|
34
28
|
if data.nil?
|
@@ -36,7 +30,7 @@ module Polars
|
|
36
30
|
elsif data.is_a?(Hash)
|
37
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
38
32
|
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
39
|
-
elsif data.is_a?(Array)
|
33
|
+
elsif data.is_a?(::Array)
|
40
34
|
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
41
35
|
elsif data.is_a?(Series)
|
42
36
|
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
@@ -116,7 +110,7 @@ module Polars
|
|
116
110
|
dtypes.each do|k, v|
|
117
111
|
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
118
112
|
end
|
119
|
-
elsif dtypes.is_a?(Array)
|
113
|
+
elsif dtypes.is_a?(::Array)
|
120
114
|
dtype_slice = dtypes
|
121
115
|
else
|
122
116
|
raise ArgumentError, "dtype arg should be list or dict"
|
@@ -590,7 +584,7 @@ module Polars
|
|
590
584
|
|
591
585
|
# df[2, ..] (select row as df)
|
592
586
|
if row_selection.is_a?(Integer)
|
593
|
-
if col_selection.is_a?(Array)
|
587
|
+
if col_selection.is_a?(::Array)
|
594
588
|
df = self[0.., col_selection]
|
595
589
|
return df.slice(row_selection, 1)
|
596
590
|
end
|
@@ -611,7 +605,7 @@ module Polars
|
|
611
605
|
return series[row_selection]
|
612
606
|
end
|
613
607
|
|
614
|
-
if col_selection.is_a?(Array)
|
608
|
+
if col_selection.is_a?(::Array)
|
615
609
|
# df[.., [1, 2]]
|
616
610
|
if Utils.is_int_sequence(col_selection)
|
617
611
|
series_list = col_selection.map { |i| to_series(i) }
|
@@ -641,7 +635,7 @@ module Polars
|
|
641
635
|
return Slice.new(self).apply(item)
|
642
636
|
end
|
643
637
|
|
644
|
-
if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
|
638
|
+
if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
|
645
639
|
# select multiple columns
|
646
640
|
# df[["foo", "bar"]]
|
647
641
|
return _from_rbdf(_df.select(item.map(&:to_s)))
|
@@ -684,13 +678,13 @@ module Polars
|
|
684
678
|
end
|
685
679
|
|
686
680
|
if Utils.strlike?(key)
|
687
|
-
if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
681
|
+
if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
682
|
value = Series.new(value)
|
689
683
|
elsif !value.is_a?(Series)
|
690
684
|
value = Polars.lit(value)
|
691
685
|
end
|
692
686
|
self._df = with_column(value.alias(key.to_s))._df
|
693
|
-
elsif key.is_a?(Array)
|
687
|
+
elsif key.is_a?(::Array)
|
694
688
|
row_selection, col_selection = key
|
695
689
|
|
696
690
|
if Utils.strlike?(col_selection)
|
@@ -905,6 +899,7 @@ module Polars
|
|
905
899
|
def write_csv(
|
906
900
|
file = nil,
|
907
901
|
has_header: true,
|
902
|
+
include_header: nil,
|
908
903
|
sep: ",",
|
909
904
|
quote: '"',
|
910
905
|
batch_size: 1024,
|
@@ -914,6 +909,8 @@ module Polars
|
|
914
909
|
float_precision: nil,
|
915
910
|
null_value: nil
|
916
911
|
)
|
912
|
+
include_header = has_header if include_header.nil?
|
913
|
+
|
917
914
|
if sep.length > 1
|
918
915
|
raise ArgumentError, "only single byte separator is allowed"
|
919
916
|
elsif quote.length > 1
|
@@ -927,7 +924,7 @@ module Polars
|
|
927
924
|
buffer.set_encoding(Encoding::BINARY)
|
928
925
|
_df.write_csv(
|
929
926
|
buffer,
|
930
|
-
|
927
|
+
include_header,
|
931
928
|
sep.ord,
|
932
929
|
quote.ord,
|
933
930
|
batch_size,
|
@@ -946,7 +943,7 @@ module Polars
|
|
946
943
|
|
947
944
|
_df.write_csv(
|
948
945
|
file,
|
949
|
-
|
946
|
+
include_header,
|
950
947
|
sep.ord,
|
951
948
|
quote.ord,
|
952
949
|
batch_size,
|
@@ -994,14 +991,21 @@ module Polars
|
|
994
991
|
#
|
995
992
|
# @return [nil]
|
996
993
|
def write_ipc(file, compression: "uncompressed")
|
997
|
-
|
998
|
-
|
994
|
+
return_bytes = file.nil?
|
995
|
+
if return_bytes
|
996
|
+
file = StringIO.new
|
997
|
+
file.set_encoding(Encoding::BINARY)
|
999
998
|
end
|
1000
999
|
if Utils.pathlike?(file)
|
1001
1000
|
file = Utils.normalise_filepath(file)
|
1002
1001
|
end
|
1003
1002
|
|
1003
|
+
if compression.nil?
|
1004
|
+
compression = "uncompressed"
|
1005
|
+
end
|
1006
|
+
|
1004
1007
|
_df.write_ipc(file, compression)
|
1008
|
+
return_bytes ? file.string : nil
|
1005
1009
|
end
|
1006
1010
|
|
1007
1011
|
# Write to Apache Parquet file.
|
@@ -1144,22 +1148,8 @@ module Polars
|
|
1144
1148
|
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
1145
1149
|
# # └─────┴─────┴─────┴─────┘
|
1146
1150
|
def transpose(include_header: false, header_name: "column", column_names: nil)
|
1147
|
-
|
1148
|
-
|
1149
|
-
names = []
|
1150
|
-
n = df.width
|
1151
|
-
if include_header
|
1152
|
-
names << header_name
|
1153
|
-
n -= 1
|
1154
|
-
end
|
1155
|
-
|
1156
|
-
column_names = column_names.each
|
1157
|
-
n.times do
|
1158
|
-
names << column_names.next
|
1159
|
-
end
|
1160
|
-
df.columns = names
|
1161
|
-
end
|
1162
|
-
df
|
1151
|
+
keep_names_as = include_header ? header_name : nil
|
1152
|
+
_from_rbdf(_df.transpose(keep_names_as, column_names))
|
1163
1153
|
end
|
1164
1154
|
|
1165
1155
|
# Reverse the DataFrame.
|
@@ -1491,13 +1481,9 @@ module Polars
|
|
1491
1481
|
# # │ 1 ┆ 6.0 ┆ a │
|
1492
1482
|
# # └─────┴─────┴─────┘
|
1493
1483
|
def sort(by, reverse: false, nulls_last: false)
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
.collect(no_optimization: true, string_cache: false)
|
1498
|
-
else
|
1499
|
-
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
1500
|
-
end
|
1484
|
+
lazy
|
1485
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1486
|
+
.collect(no_optimization: true)
|
1501
1487
|
end
|
1502
1488
|
|
1503
1489
|
# Sort the DataFrame by column in-place.
|
@@ -1808,13 +1794,13 @@ module Polars
|
|
1808
1794
|
_from_rbdf(_df.with_row_count(name, offset))
|
1809
1795
|
end
|
1810
1796
|
|
1811
|
-
# Start a
|
1797
|
+
# Start a group by operation.
|
1812
1798
|
#
|
1813
1799
|
# @param by [Object]
|
1814
1800
|
# Column(s) to group by.
|
1815
1801
|
# @param maintain_order [Boolean]
|
1816
1802
|
# Make sure that the order of the groups remain consistent. This is more
|
1817
|
-
# expensive than a default
|
1803
|
+
# expensive than a default group by. Note that this only works in expression
|
1818
1804
|
# aggregations.
|
1819
1805
|
#
|
1820
1806
|
# @return [GroupBy]
|
@@ -1827,7 +1813,7 @@ module Polars
|
|
1827
1813
|
# "c" => [6, 5, 4, 3, 2, 1]
|
1828
1814
|
# }
|
1829
1815
|
# )
|
1830
|
-
# df.
|
1816
|
+
# df.group_by("a").agg(Polars.col("b").sum).sort("a")
|
1831
1817
|
# # =>
|
1832
1818
|
# # shape: (3, 2)
|
1833
1819
|
# # ┌─────┬─────┐
|
@@ -1839,25 +1825,26 @@ module Polars
|
|
1839
1825
|
# # │ b ┆ 11 │
|
1840
1826
|
# # │ c ┆ 6 │
|
1841
1827
|
# # └─────┴─────┘
|
1842
|
-
def
|
1828
|
+
def group_by(by, maintain_order: false)
|
1843
1829
|
if !Utils.bool?(maintain_order)
|
1844
|
-
raise TypeError, "invalid input for
|
1830
|
+
raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
|
1845
1831
|
end
|
1846
1832
|
GroupBy.new(
|
1847
|
-
|
1833
|
+
self,
|
1848
1834
|
by,
|
1849
|
-
self.class,
|
1850
1835
|
maintain_order: maintain_order
|
1851
1836
|
)
|
1852
1837
|
end
|
1838
|
+
alias_method :groupby, :group_by
|
1839
|
+
alias_method :group, :group_by
|
1853
1840
|
|
1854
1841
|
# Create rolling groups based on a time column.
|
1855
1842
|
#
|
1856
1843
|
# Also works for index values of type `:i32` or `:i64`.
|
1857
1844
|
#
|
1858
|
-
# Different from a `
|
1845
|
+
# Different from a `dynamic_group_by` the windows are now determined by the
|
1859
1846
|
# individual values and are not of constant intervals. For constant intervals use
|
1860
|
-
# *
|
1847
|
+
# *group_by_dynamic*
|
1861
1848
|
#
|
1862
1849
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
1863
1850
|
# by using the following string language:
|
@@ -1877,7 +1864,7 @@ module Polars
|
|
1877
1864
|
# Or combine them:
|
1878
1865
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1879
1866
|
#
|
1880
|
-
# In case of a
|
1867
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
1881
1868
|
#
|
1882
1869
|
# - **"1i" # length 1**
|
1883
1870
|
# - **"10i" # length 10**
|
@@ -1888,7 +1875,7 @@ module Polars
|
|
1888
1875
|
# This column must be sorted in ascending order. If not the output will not
|
1889
1876
|
# make sense.
|
1890
1877
|
#
|
1891
|
-
# In case of a rolling
|
1878
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
1892
1879
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1893
1880
|
# performance matters use an `:i64` column.
|
1894
1881
|
# @param period [Object]
|
@@ -1899,6 +1886,12 @@ module Polars
|
|
1899
1886
|
# Define whether the temporal window interval is closed or not.
|
1900
1887
|
# @param by [Object]
|
1901
1888
|
# Also group by this column/these columns.
|
1889
|
+
# @param check_sorted [Boolean]
|
1890
|
+
# When the `by` argument is given, polars can not check sortedness
|
1891
|
+
# by the metadata and has to do a full scan on the index column to
|
1892
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1893
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1894
|
+
# Doing so incorrectly will lead to incorrect output
|
1902
1895
|
#
|
1903
1896
|
# @return [RollingGroupBy]
|
1904
1897
|
#
|
@@ -1912,9 +1905,9 @@ module Polars
|
|
1912
1905
|
# "2020-01-08 23:16:43"
|
1913
1906
|
# ]
|
1914
1907
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1915
|
-
# Polars.col("dt").str.strptime(Polars::Datetime)
|
1908
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1916
1909
|
# )
|
1917
|
-
# df.
|
1910
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
1918
1911
|
# [
|
1919
1912
|
# Polars.sum("a").alias("sum_a"),
|
1920
1913
|
# Polars.min("a").alias("min_a"),
|
@@ -1935,20 +1928,22 @@ module Polars
|
|
1935
1928
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1936
1929
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1937
1930
|
# # └─────────────────────┴───────┴───────┴───────┘
|
1938
|
-
def
|
1931
|
+
def group_by_rolling(
|
1939
1932
|
index_column:,
|
1940
1933
|
period:,
|
1941
1934
|
offset: nil,
|
1942
1935
|
closed: "right",
|
1943
|
-
by: nil
|
1936
|
+
by: nil,
|
1937
|
+
check_sorted: true
|
1944
1938
|
)
|
1945
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1939
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
|
1946
1940
|
end
|
1941
|
+
alias_method :groupby_rolling, :group_by_rolling
|
1947
1942
|
|
1948
1943
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1949
1944
|
#
|
1950
1945
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
1951
|
-
# normal
|
1946
|
+
# normal group by is that a row can be member of multiple groups. The time/index
|
1952
1947
|
# window could be seen as a rolling window, with a window size determined by
|
1953
1948
|
# dates/times/values instead of slots in the DataFrame.
|
1954
1949
|
#
|
@@ -1976,7 +1971,7 @@ module Polars
|
|
1976
1971
|
# Or combine them:
|
1977
1972
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1978
1973
|
#
|
1979
|
-
# In case of a
|
1974
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1980
1975
|
#
|
1981
1976
|
# - "1i" # length 1
|
1982
1977
|
# - "10i" # length 10
|
@@ -1987,7 +1982,7 @@ module Polars
|
|
1987
1982
|
# This column must be sorted in ascending order. If not the output will not
|
1988
1983
|
# make sense.
|
1989
1984
|
#
|
1990
|
-
# In case of a dynamic
|
1985
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1991
1986
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1992
1987
|
# performance matters use an `:i64` column.
|
1993
1988
|
# @param every
|
@@ -2038,7 +2033,7 @@ module Polars
|
|
2038
2033
|
# # └─────────────────────┴─────┘
|
2039
2034
|
#
|
2040
2035
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
2041
|
-
# df.
|
2036
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
2042
2037
|
# [
|
2043
2038
|
# Polars.col("time").min.alias("time_min"),
|
2044
2039
|
# Polars.col("time").max.alias("time_max")
|
@@ -2058,7 +2053,7 @@ module Polars
|
|
2058
2053
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
2059
2054
|
#
|
2060
2055
|
# @example The window boundaries can also be added to the aggregation result.
|
2061
|
-
# df.
|
2056
|
+
# df.group_by_dynamic(
|
2062
2057
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
2063
2058
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
2064
2059
|
# # =>
|
@@ -2075,27 +2070,27 @@ module Polars
|
|
2075
2070
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2076
2071
|
#
|
2077
2072
|
# @example When closed="left", should not include right end of interval.
|
2078
|
-
# df.
|
2073
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
2079
2074
|
# [
|
2080
2075
|
# Polars.col("time").count.alias("time_count"),
|
2081
|
-
# Polars.col("time").
|
2076
|
+
# Polars.col("time").alias("time_agg_list")
|
2082
2077
|
# ]
|
2083
2078
|
# )
|
2084
2079
|
# # =>
|
2085
2080
|
# # shape: (4, 3)
|
2086
|
-
# #
|
2087
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2088
|
-
# # │ --- ┆ --- ┆ ---
|
2089
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2090
|
-
# #
|
2091
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
2092
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
2093
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
2094
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2095
|
-
# #
|
2081
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
2082
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
2083
|
+
# # │ --- ┆ --- ┆ --- │
|
2084
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
2085
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
2086
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
2087
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
2088
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
2089
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
2090
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
2096
2091
|
#
|
2097
2092
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2098
|
-
# df.
|
2093
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
2099
2094
|
# [Polars.col("time").count.alias("time_count")]
|
2100
2095
|
# )
|
2101
2096
|
# # =>
|
@@ -2112,7 +2107,7 @@ module Polars
|
|
2112
2107
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
2113
2108
|
# # └─────────────────────┴────────────┘
|
2114
2109
|
#
|
2115
|
-
# @example Dynamic
|
2110
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
2116
2111
|
# df = Polars::DataFrame.new(
|
2117
2112
|
# {
|
2118
2113
|
# "time" => Polars.date_range(
|
@@ -2123,7 +2118,7 @@ module Polars
|
|
2123
2118
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2124
2119
|
# }
|
2125
2120
|
# )
|
2126
|
-
# df.
|
2121
|
+
# df.group_by_dynamic(
|
2127
2122
|
# "time",
|
2128
2123
|
# every: "1h",
|
2129
2124
|
# closed: "both",
|
@@ -2146,20 +2141,20 @@ module Polars
|
|
2146
2141
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
2147
2142
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2148
2143
|
#
|
2149
|
-
# @example Dynamic
|
2144
|
+
# @example Dynamic group by on an index column.
|
2150
2145
|
# df = Polars::DataFrame.new(
|
2151
2146
|
# {
|
2152
2147
|
# "idx" => Polars.arange(0, 6, eager: true),
|
2153
2148
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
2154
2149
|
# }
|
2155
2150
|
# )
|
2156
|
-
# df.
|
2151
|
+
# df.group_by_dynamic(
|
2157
2152
|
# "idx",
|
2158
2153
|
# every: "2i",
|
2159
2154
|
# period: "3i",
|
2160
2155
|
# include_boundaries: true,
|
2161
2156
|
# closed: "right"
|
2162
|
-
# ).agg(Polars.col("A").
|
2157
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2163
2158
|
# # =>
|
2164
2159
|
# # shape: (3, 4)
|
2165
2160
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -2171,7 +2166,7 @@ module Polars
|
|
2171
2166
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2172
2167
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
2173
2168
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
2174
|
-
def
|
2169
|
+
def group_by_dynamic(
|
2175
2170
|
index_column,
|
2176
2171
|
every:,
|
2177
2172
|
period: nil,
|
@@ -2195,6 +2190,7 @@ module Polars
|
|
2195
2190
|
start_by
|
2196
2191
|
)
|
2197
2192
|
end
|
2193
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
2198
2194
|
|
2199
2195
|
# Upsample a DataFrame at a regular frequency.
|
2200
2196
|
#
|
@@ -2242,7 +2238,7 @@ module Polars
|
|
2242
2238
|
# "groups" => ["A", "B", "A", "B"],
|
2243
2239
|
# "values" => [0, 1, 2, 3]
|
2244
2240
|
# }
|
2245
|
-
# )
|
2241
|
+
# ).set_sorted("time")
|
2246
2242
|
# df.upsample(
|
2247
2243
|
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2248
2244
|
# ).select(Polars.all.forward_fill)
|
@@ -2360,7 +2356,7 @@ module Polars
|
|
2360
2356
|
# ], # note record date: Jan 1st (sorted!)
|
2361
2357
|
# "gdp" => [4164, 4411, 4566, 4696]
|
2362
2358
|
# }
|
2363
|
-
# )
|
2359
|
+
# ).set_sorted("date")
|
2364
2360
|
# population = Polars::DataFrame.new(
|
2365
2361
|
# {
|
2366
2362
|
# "date" => [
|
@@ -2371,7 +2367,7 @@ module Polars
|
|
2371
2367
|
# ], # note record date: May 12th (sorted!)
|
2372
2368
|
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2373
2369
|
# }
|
2374
|
-
# )
|
2370
|
+
# ).set_sorted("date")
|
2375
2371
|
# population.join_asof(
|
2376
2372
|
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2377
2373
|
# )
|
@@ -2674,7 +2670,7 @@ module Polars
|
|
2674
2670
|
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
2675
2671
|
# # └─────┴─────┴─────┴───────┘
|
2676
2672
|
def hstack(columns, in_place: false)
|
2677
|
-
if !columns.is_a?(Array)
|
2673
|
+
if !columns.is_a?(::Array)
|
2678
2674
|
columns = columns.get_columns
|
2679
2675
|
end
|
2680
2676
|
if in_place
|
@@ -2804,7 +2800,7 @@ module Polars
|
|
2804
2800
|
# # │ 3 ┆ 8.0 │
|
2805
2801
|
# # └─────┴─────┘
|
2806
2802
|
def drop(columns)
|
2807
|
-
if columns.is_a?(Array)
|
2803
|
+
if columns.is_a?(::Array)
|
2808
2804
|
df = clone
|
2809
2805
|
columns.each do |n|
|
2810
2806
|
df._df.drop_in_place(n)
|
@@ -3317,7 +3313,7 @@ module Polars
|
|
3317
3313
|
n_fill = n_cols * n_rows - height
|
3318
3314
|
|
3319
3315
|
if n_fill > 0
|
3320
|
-
if !fill_values.is_a?(Array)
|
3316
|
+
if !fill_values.is_a?(::Array)
|
3321
3317
|
fill_values = [fill_values] * df.width
|
3322
3318
|
end
|
3323
3319
|
|
@@ -3426,36 +3422,38 @@ module Polars
|
|
3426
3422
|
# # ╞═════╪═════╪═════╡
|
3427
3423
|
# # │ C ┆ 2 ┆ l │
|
3428
3424
|
# # └─────┴─────┴─────┘}
|
3429
|
-
def partition_by(groups, maintain_order: true, as_dict: false)
|
3425
|
+
def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
|
3430
3426
|
if groups.is_a?(String)
|
3431
3427
|
groups = [groups]
|
3432
|
-
elsif !groups.is_a?(Array)
|
3428
|
+
elsif !groups.is_a?(::Array)
|
3433
3429
|
groups = Array(groups)
|
3434
3430
|
end
|
3435
3431
|
|
3436
3432
|
if as_dict
|
3437
3433
|
out = {}
|
3438
3434
|
if groups.length == 1
|
3439
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3435
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3440
3436
|
df = _from_rbdf(df)
|
3441
3437
|
out[df[groups][0, 0]] = df
|
3442
3438
|
end
|
3443
3439
|
else
|
3444
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3440
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3445
3441
|
df = _from_rbdf(df)
|
3446
3442
|
out[df[groups].row(0)] = df
|
3447
3443
|
end
|
3448
3444
|
end
|
3449
3445
|
out
|
3450
3446
|
else
|
3451
|
-
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3447
|
+
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
3452
3448
|
end
|
3453
3449
|
end
|
3454
3450
|
|
3455
3451
|
# Shift values by the given period.
|
3456
3452
|
#
|
3457
|
-
# @param
|
3453
|
+
# @param n [Integer]
|
3458
3454
|
# Number of places to shift (may be negative).
|
3455
|
+
# @param fill_value [Object]
|
3456
|
+
# Fill the resulting null values with this value.
|
3459
3457
|
#
|
3460
3458
|
# @return [DataFrame]
|
3461
3459
|
#
|
@@ -3493,8 +3491,8 @@ module Polars
|
|
3493
3491
|
# # │ 3 ┆ 8 ┆ c │
|
3494
3492
|
# # │ null ┆ null ┆ null │
|
3495
3493
|
# # └──────┴──────┴──────┘
|
3496
|
-
def shift(
|
3497
|
-
|
3494
|
+
def shift(n, fill_value: nil)
|
3495
|
+
lazy.shift(n, fill_value: fill_value).collect(_eager: true)
|
3498
3496
|
end
|
3499
3497
|
|
3500
3498
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -3527,9 +3525,7 @@ module Polars
|
|
3527
3525
|
# # │ 2 ┆ 7 ┆ b │
|
3528
3526
|
# # └─────┴─────┴─────┘
|
3529
3527
|
def shift_and_fill(periods, fill_value)
|
3530
|
-
|
3531
|
-
.shift_and_fill(periods, fill_value)
|
3532
|
-
.collect(no_optimization: true, string_cache: false)
|
3528
|
+
shift(periods, fill_value: fill_value)
|
3533
3529
|
end
|
3534
3530
|
|
3535
3531
|
# Get a mask of all duplicated rows in this DataFrame.
|
@@ -3716,7 +3712,7 @@ module Polars
|
|
3716
3712
|
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3717
3713
|
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3718
3714
|
def with_columns(exprs)
|
3719
|
-
if !exprs.nil? && !exprs.is_a?(Array)
|
3715
|
+
if !exprs.nil? && !exprs.is_a?(::Array)
|
3720
3716
|
exprs = [exprs]
|
3721
3717
|
end
|
3722
3718
|
lazy
|
@@ -3780,7 +3776,7 @@ module Polars
|
|
3780
3776
|
if axis == 0
|
3781
3777
|
_from_rbdf(_df.max)
|
3782
3778
|
elsif axis == 1
|
3783
|
-
Utils.wrap_s(_df.
|
3779
|
+
Utils.wrap_s(_df.max_horizontal)
|
3784
3780
|
else
|
3785
3781
|
raise ArgumentError, "Axis should be 0 or 1."
|
3786
3782
|
end
|
@@ -3812,7 +3808,7 @@ module Polars
|
|
3812
3808
|
if axis == 0
|
3813
3809
|
_from_rbdf(_df.min)
|
3814
3810
|
elsif axis == 1
|
3815
|
-
Utils.wrap_s(_df.
|
3811
|
+
Utils.wrap_s(_df.min_horizontal)
|
3816
3812
|
else
|
3817
3813
|
raise ArgumentError, "Axis should be 0 or 1."
|
3818
3814
|
end
|
@@ -3861,7 +3857,7 @@ module Polars
|
|
3861
3857
|
when 0
|
3862
3858
|
_from_rbdf(_df.sum)
|
3863
3859
|
when 1
|
3864
|
-
Utils.wrap_s(_df.
|
3860
|
+
Utils.wrap_s(_df.sum_horizontal(null_strategy))
|
3865
3861
|
else
|
3866
3862
|
raise ArgumentError, "Axis should be 0 or 1."
|
3867
3863
|
end
|
@@ -3899,7 +3895,7 @@ module Polars
|
|
3899
3895
|
when 0
|
3900
3896
|
_from_rbdf(_df.mean)
|
3901
3897
|
when 1
|
3902
|
-
Utils.wrap_s(_df.
|
3898
|
+
Utils.wrap_s(_df.mean_horizontal(null_strategy))
|
3903
3899
|
else
|
3904
3900
|
raise ArgumentError, "Axis should be 0 or 1."
|
3905
3901
|
end
|
@@ -4097,11 +4093,11 @@ module Polars
|
|
4097
4093
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4098
4094
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4099
4095
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4100
|
-
def to_dummies(columns: nil, separator: "_")
|
4096
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
4101
4097
|
if columns.is_a?(String)
|
4102
4098
|
columns = [columns]
|
4103
4099
|
end
|
4104
|
-
_from_rbdf(_df.to_dummies(columns, separator))
|
4100
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
4105
4101
|
end
|
4106
4102
|
|
4107
4103
|
# Drop duplicate rows from this DataFrame.
|
@@ -4189,7 +4185,7 @@ module Polars
|
|
4189
4185
|
subset = [subset]
|
4190
4186
|
end
|
4191
4187
|
|
4192
|
-
if subset.is_a?(Array) && subset.length == 1
|
4188
|
+
if subset.is_a?(::Array) && subset.length == 1
|
4193
4189
|
expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
|
4194
4190
|
else
|
4195
4191
|
struct_fields = subset.nil? ? Polars.all : subset
|
@@ -4284,15 +4280,20 @@ module Polars
|
|
4284
4280
|
end
|
4285
4281
|
|
4286
4282
|
if n.nil? && !frac.nil?
|
4283
|
+
frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
|
4284
|
+
|
4287
4285
|
_from_rbdf(
|
4288
|
-
_df.sample_frac(frac, with_replacement, shuffle, seed)
|
4286
|
+
_df.sample_frac(frac._s, with_replacement, shuffle, seed)
|
4289
4287
|
)
|
4290
4288
|
end
|
4291
4289
|
|
4292
4290
|
if n.nil?
|
4293
4291
|
n = 1
|
4294
4292
|
end
|
4295
|
-
|
4293
|
+
|
4294
|
+
n = Series.new("", [n]) unless n.is_a?(Series)
|
4295
|
+
|
4296
|
+
_from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
|
4296
4297
|
end
|
4297
4298
|
|
4298
4299
|
# Apply a horizontal reduction on a DataFrame.
|
@@ -4591,7 +4592,7 @@ module Polars
|
|
4591
4592
|
#
|
4592
4593
|
# @example
|
4593
4594
|
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
|
4594
|
-
# s.
|
4595
|
+
# s.gather_every(2)
|
4595
4596
|
# # =>
|
4596
4597
|
# # shape: (2, 2)
|
4597
4598
|
# # ┌─────┬─────┐
|
@@ -4602,9 +4603,10 @@ module Polars
|
|
4602
4603
|
# # │ 1 ┆ 5 │
|
4603
4604
|
# # │ 3 ┆ 7 │
|
4604
4605
|
# # └─────┴─────┘
|
4605
|
-
def
|
4606
|
-
select(Utils.col("*").
|
4606
|
+
def gather_every(n)
|
4607
|
+
select(Utils.col("*").gather_every(n))
|
4607
4608
|
end
|
4609
|
+
alias_method :take_every, :gather_every
|
4608
4610
|
|
4609
4611
|
# Hash and combine the rows in this DataFrame.
|
4610
4612
|
#
|
@@ -4661,16 +4663,16 @@ module Polars
|
|
4661
4663
|
# df.interpolate
|
4662
4664
|
# # =>
|
4663
4665
|
# # shape: (4, 3)
|
4664
|
-
# #
|
4665
|
-
# # │ foo
|
4666
|
-
# # │ ---
|
4667
|
-
# # │
|
4668
|
-
# #
|
4669
|
-
# # │ 1
|
4670
|
-
# # │ 5
|
4671
|
-
# # │ 9
|
4672
|
-
# # │ 10
|
4673
|
-
# #
|
4666
|
+
# # ┌──────┬──────┬──────────┐
|
4667
|
+
# # │ foo ┆ bar ┆ baz │
|
4668
|
+
# # │ --- ┆ --- ┆ --- │
|
4669
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
4670
|
+
# # ╞══════╪══════╪══════════╡
|
4671
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
4672
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
4673
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
4674
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
4675
|
+
# # └──────┴──────┴──────────┘
|
4674
4676
|
def interpolate
|
4675
4677
|
select(Utils.col("*").interpolate)
|
4676
4678
|
end
|
@@ -4758,6 +4760,38 @@ module Polars
|
|
4758
4760
|
_from_rbdf(_df.unnest(names))
|
4759
4761
|
end
|
4760
4762
|
|
4763
|
+
# TODO
|
4764
|
+
# def corr
|
4765
|
+
# end
|
4766
|
+
|
4767
|
+
# TODO
|
4768
|
+
# def merge_sorted
|
4769
|
+
# end
|
4770
|
+
|
4771
|
+
# Indicate that one or multiple columns are sorted.
|
4772
|
+
#
|
4773
|
+
# @param column [Object]
|
4774
|
+
# Columns that are sorted
|
4775
|
+
# @param more_columns [Object]
|
4776
|
+
# Additional columns that are sorted, specified as positional arguments.
|
4777
|
+
# @param descending [Boolean]
|
4778
|
+
# Whether the columns are sorted in descending order.
|
4779
|
+
#
|
4780
|
+
# @return [DataFrame]
|
4781
|
+
def set_sorted(
|
4782
|
+
column,
|
4783
|
+
*more_columns,
|
4784
|
+
descending: false
|
4785
|
+
)
|
4786
|
+
lazy
|
4787
|
+
.set_sorted(column, *more_columns, descending: descending)
|
4788
|
+
.collect(no_optimization: true)
|
4789
|
+
end
|
4790
|
+
|
4791
|
+
# TODO
|
4792
|
+
# def update
|
4793
|
+
# end
|
4794
|
+
|
4761
4795
|
private
|
4762
4796
|
|
4763
4797
|
def initialize_copy(other)
|
@@ -4910,8 +4944,8 @@ module Polars
|
|
4910
4944
|
[lookup[col[0]] || col[0], col[1]]
|
4911
4945
|
end
|
4912
4946
|
|
4913
|
-
if schema_overrides
|
4914
|
-
|
4947
|
+
if schema_overrides && schema_overrides.any?
|
4948
|
+
column_dtypes.merge!(schema_overrides)
|
4915
4949
|
end
|
4916
4950
|
|
4917
4951
|
column_dtypes.each do |col, dtype|
|
@@ -4967,7 +5001,7 @@ module Polars
|
|
4967
5001
|
columns.each do |col, i|
|
4968
5002
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4969
5003
|
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4970
|
-
elsif structs
|
5004
|
+
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4971
5005
|
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4972
5006
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4973
5007
|
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
@@ -5012,15 +5046,56 @@ module Polars
|
|
5012
5046
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5013
5047
|
end
|
5014
5048
|
return rbdf
|
5015
|
-
elsif data[0].is_a?(Array)
|
5049
|
+
elsif data[0].is_a?(::Array)
|
5016
5050
|
if orient.nil? && !columns.nil?
|
5017
|
-
|
5051
|
+
first_element = data[0]
|
5052
|
+
row_types = first_element.filter_map { |value| value.class }.uniq
|
5053
|
+
if row_types.include?(Integer) && row_types.include?(Float)
|
5054
|
+
row_types.delete(Integer)
|
5055
|
+
end
|
5056
|
+
orient = row_types.length == 1 ? "col" : "row"
|
5018
5057
|
end
|
5019
5058
|
|
5020
5059
|
if orient == "row"
|
5021
|
-
|
5060
|
+
column_names, schema_overrides = _unpack_schema(
|
5061
|
+
schema, schema_overrides: schema_overrides, n_expected: first_element.length
|
5062
|
+
)
|
5063
|
+
local_schema_override = (
|
5064
|
+
schema_overrides.any? ? (raise Todo) : {}
|
5065
|
+
)
|
5066
|
+
if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
|
5067
|
+
raise ArgumentError, "the row data does not match the number of columns"
|
5068
|
+
end
|
5069
|
+
|
5070
|
+
unpack_nested = false
|
5071
|
+
local_schema_override.each do |col, tp|
|
5072
|
+
raise Todo
|
5073
|
+
end
|
5074
|
+
|
5075
|
+
if unpack_nested
|
5076
|
+
raise Todo
|
5077
|
+
else
|
5078
|
+
rbdf = RbDataFrame.read_rows(
|
5079
|
+
data,
|
5080
|
+
infer_schema_length,
|
5081
|
+
local_schema_override.any? ? local_schema_override : nil
|
5082
|
+
)
|
5083
|
+
end
|
5084
|
+
if column_names.any? || schema_overrides.any?
|
5085
|
+
rbdf = _post_apply_columns(
|
5086
|
+
rbdf, column_names, schema_overrides: schema_overrides
|
5087
|
+
)
|
5088
|
+
end
|
5089
|
+
return rbdf
|
5022
5090
|
elsif orient == "col" || orient.nil?
|
5023
|
-
|
5091
|
+
column_names, schema_overrides = _unpack_schema(
|
5092
|
+
schema, schema_overrides: schema_overrides, n_expected: data.length
|
5093
|
+
)
|
5094
|
+
data_series =
|
5095
|
+
data.map.with_index do |element, i|
|
5096
|
+
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
|
5097
|
+
end
|
5098
|
+
return RbDataFrame.new(data_series)
|
5024
5099
|
else
|
5025
5100
|
raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
|
5026
5101
|
end
|
@@ -5066,10 +5141,10 @@ module Polars
|
|
5066
5141
|
|
5067
5142
|
def _compare_to_other_df(other, op)
|
5068
5143
|
if columns != other.columns
|
5069
|
-
raise
|
5144
|
+
raise ArgumentError, "DataFrame columns do not match"
|
5070
5145
|
end
|
5071
5146
|
if shape != other.shape
|
5072
|
-
raise
|
5147
|
+
raise ArgumentError, "DataFrame dimensions do not match"
|
5073
5148
|
end
|
5074
5149
|
|
5075
5150
|
suffix = "__POLARS_CMP_OTHER"
|
@@ -5117,7 +5192,7 @@ module Polars
|
|
5117
5192
|
|
5118
5193
|
def _prepare_other_arg(other)
|
5119
5194
|
if !other.is_a?(Series)
|
5120
|
-
if other.is_a?(Array)
|
5195
|
+
if other.is_a?(::Array)
|
5121
5196
|
raise ArgumentError, "Operation not supported."
|
5122
5197
|
end
|
5123
5198
|
|