polars-df 0.5.0-aarch64-linux → 0.7.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +3854 -4496
- data/README.md +11 -9
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +7 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -20,15 +20,9 @@ module Polars
|
|
20
20
|
# this does not yield conclusive results, column orientation is used.
|
21
21
|
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
22
22
|
schema ||= columns
|
23
|
-
raise Todo if schema_overrides
|
24
23
|
|
25
|
-
# TODO deprecate in favor of read_sql
|
26
24
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
27
|
-
|
28
|
-
data = {}
|
29
|
-
result.columns.each_with_index do |k, i|
|
30
|
-
data[k] = result.rows.map { |r| r[i] }
|
31
|
-
end
|
25
|
+
raise ArgumentError, "Use read_database instead"
|
32
26
|
end
|
33
27
|
|
34
28
|
if data.nil?
|
@@ -36,7 +30,7 @@ module Polars
|
|
36
30
|
elsif data.is_a?(Hash)
|
37
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
38
32
|
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
39
|
-
elsif data.is_a?(Array)
|
33
|
+
elsif data.is_a?(::Array)
|
40
34
|
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
41
35
|
elsif data.is_a?(Series)
|
42
36
|
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
@@ -116,7 +110,7 @@ module Polars
|
|
116
110
|
dtypes.each do|k, v|
|
117
111
|
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
118
112
|
end
|
119
|
-
elsif dtypes.is_a?(Array)
|
113
|
+
elsif dtypes.is_a?(::Array)
|
120
114
|
dtype_slice = dtypes
|
121
115
|
else
|
122
116
|
raise ArgumentError, "dtype arg should be list or dict"
|
@@ -590,7 +584,7 @@ module Polars
|
|
590
584
|
|
591
585
|
# df[2, ..] (select row as df)
|
592
586
|
if row_selection.is_a?(Integer)
|
593
|
-
if col_selection.is_a?(Array)
|
587
|
+
if col_selection.is_a?(::Array)
|
594
588
|
df = self[0.., col_selection]
|
595
589
|
return df.slice(row_selection, 1)
|
596
590
|
end
|
@@ -611,7 +605,7 @@ module Polars
|
|
611
605
|
return series[row_selection]
|
612
606
|
end
|
613
607
|
|
614
|
-
if col_selection.is_a?(Array)
|
608
|
+
if col_selection.is_a?(::Array)
|
615
609
|
# df[.., [1, 2]]
|
616
610
|
if Utils.is_int_sequence(col_selection)
|
617
611
|
series_list = col_selection.map { |i| to_series(i) }
|
@@ -641,7 +635,7 @@ module Polars
|
|
641
635
|
return Slice.new(self).apply(item)
|
642
636
|
end
|
643
637
|
|
644
|
-
if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
|
638
|
+
if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
|
645
639
|
# select multiple columns
|
646
640
|
# df[["foo", "bar"]]
|
647
641
|
return _from_rbdf(_df.select(item.map(&:to_s)))
|
@@ -684,13 +678,13 @@ module Polars
|
|
684
678
|
end
|
685
679
|
|
686
680
|
if Utils.strlike?(key)
|
687
|
-
if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
681
|
+
if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
682
|
value = Series.new(value)
|
689
683
|
elsif !value.is_a?(Series)
|
690
684
|
value = Polars.lit(value)
|
691
685
|
end
|
692
686
|
self._df = with_column(value.alias(key.to_s))._df
|
693
|
-
elsif key.is_a?(Array)
|
687
|
+
elsif key.is_a?(::Array)
|
694
688
|
row_selection, col_selection = key
|
695
689
|
|
696
690
|
if Utils.strlike?(col_selection)
|
@@ -905,6 +899,7 @@ module Polars
|
|
905
899
|
def write_csv(
|
906
900
|
file = nil,
|
907
901
|
has_header: true,
|
902
|
+
include_header: nil,
|
908
903
|
sep: ",",
|
909
904
|
quote: '"',
|
910
905
|
batch_size: 1024,
|
@@ -914,6 +909,8 @@ module Polars
|
|
914
909
|
float_precision: nil,
|
915
910
|
null_value: nil
|
916
911
|
)
|
912
|
+
include_header = has_header if include_header.nil?
|
913
|
+
|
917
914
|
if sep.length > 1
|
918
915
|
raise ArgumentError, "only single byte separator is allowed"
|
919
916
|
elsif quote.length > 1
|
@@ -927,7 +924,7 @@ module Polars
|
|
927
924
|
buffer.set_encoding(Encoding::BINARY)
|
928
925
|
_df.write_csv(
|
929
926
|
buffer,
|
930
|
-
|
927
|
+
include_header,
|
931
928
|
sep.ord,
|
932
929
|
quote.ord,
|
933
930
|
batch_size,
|
@@ -946,7 +943,7 @@ module Polars
|
|
946
943
|
|
947
944
|
_df.write_csv(
|
948
945
|
file,
|
949
|
-
|
946
|
+
include_header,
|
950
947
|
sep.ord,
|
951
948
|
quote.ord,
|
952
949
|
batch_size,
|
@@ -994,14 +991,21 @@ module Polars
|
|
994
991
|
#
|
995
992
|
# @return [nil]
|
996
993
|
def write_ipc(file, compression: "uncompressed")
|
997
|
-
|
998
|
-
|
994
|
+
return_bytes = file.nil?
|
995
|
+
if return_bytes
|
996
|
+
file = StringIO.new
|
997
|
+
file.set_encoding(Encoding::BINARY)
|
999
998
|
end
|
1000
999
|
if Utils.pathlike?(file)
|
1001
1000
|
file = Utils.normalise_filepath(file)
|
1002
1001
|
end
|
1003
1002
|
|
1003
|
+
if compression.nil?
|
1004
|
+
compression = "uncompressed"
|
1005
|
+
end
|
1006
|
+
|
1004
1007
|
_df.write_ipc(file, compression)
|
1008
|
+
return_bytes ? file.string : nil
|
1005
1009
|
end
|
1006
1010
|
|
1007
1011
|
# Write to Apache Parquet file.
|
@@ -1144,22 +1148,8 @@ module Polars
|
|
1144
1148
|
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
1145
1149
|
# # └─────┴─────┴─────┴─────┘
|
1146
1150
|
def transpose(include_header: false, header_name: "column", column_names: nil)
|
1147
|
-
|
1148
|
-
|
1149
|
-
names = []
|
1150
|
-
n = df.width
|
1151
|
-
if include_header
|
1152
|
-
names << header_name
|
1153
|
-
n -= 1
|
1154
|
-
end
|
1155
|
-
|
1156
|
-
column_names = column_names.each
|
1157
|
-
n.times do
|
1158
|
-
names << column_names.next
|
1159
|
-
end
|
1160
|
-
df.columns = names
|
1161
|
-
end
|
1162
|
-
df
|
1151
|
+
keep_names_as = include_header ? header_name : nil
|
1152
|
+
_from_rbdf(_df.transpose(keep_names_as, column_names))
|
1163
1153
|
end
|
1164
1154
|
|
1165
1155
|
# Reverse the DataFrame.
|
@@ -1491,13 +1481,9 @@ module Polars
|
|
1491
1481
|
# # │ 1 ┆ 6.0 ┆ a │
|
1492
1482
|
# # └─────┴─────┴─────┘
|
1493
1483
|
def sort(by, reverse: false, nulls_last: false)
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
.collect(no_optimization: true, string_cache: false)
|
1498
|
-
else
|
1499
|
-
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
1500
|
-
end
|
1484
|
+
lazy
|
1485
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1486
|
+
.collect(no_optimization: true)
|
1501
1487
|
end
|
1502
1488
|
|
1503
1489
|
# Sort the DataFrame by column in-place.
|
@@ -1808,13 +1794,13 @@ module Polars
|
|
1808
1794
|
_from_rbdf(_df.with_row_count(name, offset))
|
1809
1795
|
end
|
1810
1796
|
|
1811
|
-
# Start a
|
1797
|
+
# Start a group by operation.
|
1812
1798
|
#
|
1813
1799
|
# @param by [Object]
|
1814
1800
|
# Column(s) to group by.
|
1815
1801
|
# @param maintain_order [Boolean]
|
1816
1802
|
# Make sure that the order of the groups remain consistent. This is more
|
1817
|
-
# expensive than a default
|
1803
|
+
# expensive than a default group by. Note that this only works in expression
|
1818
1804
|
# aggregations.
|
1819
1805
|
#
|
1820
1806
|
# @return [GroupBy]
|
@@ -1827,7 +1813,7 @@ module Polars
|
|
1827
1813
|
# "c" => [6, 5, 4, 3, 2, 1]
|
1828
1814
|
# }
|
1829
1815
|
# )
|
1830
|
-
# df.
|
1816
|
+
# df.group_by("a").agg(Polars.col("b").sum).sort("a")
|
1831
1817
|
# # =>
|
1832
1818
|
# # shape: (3, 2)
|
1833
1819
|
# # ┌─────┬─────┐
|
@@ -1839,25 +1825,26 @@ module Polars
|
|
1839
1825
|
# # │ b ┆ 11 │
|
1840
1826
|
# # │ c ┆ 6 │
|
1841
1827
|
# # └─────┴─────┘
|
1842
|
-
def
|
1828
|
+
def group_by(by, maintain_order: false)
|
1843
1829
|
if !Utils.bool?(maintain_order)
|
1844
|
-
raise TypeError, "invalid input for
|
1830
|
+
raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
|
1845
1831
|
end
|
1846
1832
|
GroupBy.new(
|
1847
|
-
|
1833
|
+
self,
|
1848
1834
|
by,
|
1849
|
-
self.class,
|
1850
1835
|
maintain_order: maintain_order
|
1851
1836
|
)
|
1852
1837
|
end
|
1838
|
+
alias_method :groupby, :group_by
|
1839
|
+
alias_method :group, :group_by
|
1853
1840
|
|
1854
1841
|
# Create rolling groups based on a time column.
|
1855
1842
|
#
|
1856
1843
|
# Also works for index values of type `:i32` or `:i64`.
|
1857
1844
|
#
|
1858
|
-
# Different from a `
|
1845
|
+
# Different from a `dynamic_group_by` the windows are now determined by the
|
1859
1846
|
# individual values and are not of constant intervals. For constant intervals use
|
1860
|
-
# *
|
1847
|
+
# *group_by_dynamic*
|
1861
1848
|
#
|
1862
1849
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
1863
1850
|
# by using the following string language:
|
@@ -1877,7 +1864,7 @@ module Polars
|
|
1877
1864
|
# Or combine them:
|
1878
1865
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1879
1866
|
#
|
1880
|
-
# In case of a
|
1867
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
1881
1868
|
#
|
1882
1869
|
# - **"1i" # length 1**
|
1883
1870
|
# - **"10i" # length 10**
|
@@ -1888,7 +1875,7 @@ module Polars
|
|
1888
1875
|
# This column must be sorted in ascending order. If not the output will not
|
1889
1876
|
# make sense.
|
1890
1877
|
#
|
1891
|
-
# In case of a rolling
|
1878
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
1892
1879
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1893
1880
|
# performance matters use an `:i64` column.
|
1894
1881
|
# @param period [Object]
|
@@ -1899,6 +1886,12 @@ module Polars
|
|
1899
1886
|
# Define whether the temporal window interval is closed or not.
|
1900
1887
|
# @param by [Object]
|
1901
1888
|
# Also group by this column/these columns.
|
1889
|
+
# @param check_sorted [Boolean]
|
1890
|
+
# When the `by` argument is given, polars can not check sortedness
|
1891
|
+
# by the metadata and has to do a full scan on the index column to
|
1892
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1893
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1894
|
+
# Doing so incorrectly will lead to incorrect output
|
1902
1895
|
#
|
1903
1896
|
# @return [RollingGroupBy]
|
1904
1897
|
#
|
@@ -1912,9 +1905,9 @@ module Polars
|
|
1912
1905
|
# "2020-01-08 23:16:43"
|
1913
1906
|
# ]
|
1914
1907
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1915
|
-
# Polars.col("dt").str.strptime(Polars::Datetime)
|
1908
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1916
1909
|
# )
|
1917
|
-
# df.
|
1910
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
1918
1911
|
# [
|
1919
1912
|
# Polars.sum("a").alias("sum_a"),
|
1920
1913
|
# Polars.min("a").alias("min_a"),
|
@@ -1935,20 +1928,22 @@ module Polars
|
|
1935
1928
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1936
1929
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1937
1930
|
# # └─────────────────────┴───────┴───────┴───────┘
|
1938
|
-
def
|
1931
|
+
def group_by_rolling(
|
1939
1932
|
index_column:,
|
1940
1933
|
period:,
|
1941
1934
|
offset: nil,
|
1942
1935
|
closed: "right",
|
1943
|
-
by: nil
|
1936
|
+
by: nil,
|
1937
|
+
check_sorted: true
|
1944
1938
|
)
|
1945
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1939
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
|
1946
1940
|
end
|
1941
|
+
alias_method :groupby_rolling, :group_by_rolling
|
1947
1942
|
|
1948
1943
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1949
1944
|
#
|
1950
1945
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
1951
|
-
# normal
|
1946
|
+
# normal group by is that a row can be member of multiple groups. The time/index
|
1952
1947
|
# window could be seen as a rolling window, with a window size determined by
|
1953
1948
|
# dates/times/values instead of slots in the DataFrame.
|
1954
1949
|
#
|
@@ -1976,7 +1971,7 @@ module Polars
|
|
1976
1971
|
# Or combine them:
|
1977
1972
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1978
1973
|
#
|
1979
|
-
# In case of a
|
1974
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1980
1975
|
#
|
1981
1976
|
# - "1i" # length 1
|
1982
1977
|
# - "10i" # length 10
|
@@ -1987,7 +1982,7 @@ module Polars
|
|
1987
1982
|
# This column must be sorted in ascending order. If not the output will not
|
1988
1983
|
# make sense.
|
1989
1984
|
#
|
1990
|
-
# In case of a dynamic
|
1985
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1991
1986
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1992
1987
|
# performance matters use an `:i64` column.
|
1993
1988
|
# @param every
|
@@ -2038,7 +2033,7 @@ module Polars
|
|
2038
2033
|
# # └─────────────────────┴─────┘
|
2039
2034
|
#
|
2040
2035
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
2041
|
-
# df.
|
2036
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
2042
2037
|
# [
|
2043
2038
|
# Polars.col("time").min.alias("time_min"),
|
2044
2039
|
# Polars.col("time").max.alias("time_max")
|
@@ -2058,7 +2053,7 @@ module Polars
|
|
2058
2053
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
2059
2054
|
#
|
2060
2055
|
# @example The window boundaries can also be added to the aggregation result.
|
2061
|
-
# df.
|
2056
|
+
# df.group_by_dynamic(
|
2062
2057
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
2063
2058
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
2064
2059
|
# # =>
|
@@ -2075,27 +2070,27 @@ module Polars
|
|
2075
2070
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2076
2071
|
#
|
2077
2072
|
# @example When closed="left", should not include right end of interval.
|
2078
|
-
# df.
|
2073
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
2079
2074
|
# [
|
2080
2075
|
# Polars.col("time").count.alias("time_count"),
|
2081
|
-
# Polars.col("time").
|
2076
|
+
# Polars.col("time").alias("time_agg_list")
|
2082
2077
|
# ]
|
2083
2078
|
# )
|
2084
2079
|
# # =>
|
2085
2080
|
# # shape: (4, 3)
|
2086
|
-
# #
|
2087
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2088
|
-
# # │ --- ┆ --- ┆ ---
|
2089
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2090
|
-
# #
|
2091
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
2092
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
2093
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
2094
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2095
|
-
# #
|
2081
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
2082
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
2083
|
+
# # │ --- ┆ --- ┆ --- │
|
2084
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
2085
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
2086
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
2087
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
2088
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
2089
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
2090
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
2096
2091
|
#
|
2097
2092
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2098
|
-
# df.
|
2093
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
2099
2094
|
# [Polars.col("time").count.alias("time_count")]
|
2100
2095
|
# )
|
2101
2096
|
# # =>
|
@@ -2112,7 +2107,7 @@ module Polars
|
|
2112
2107
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
2113
2108
|
# # └─────────────────────┴────────────┘
|
2114
2109
|
#
|
2115
|
-
# @example Dynamic
|
2110
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
2116
2111
|
# df = Polars::DataFrame.new(
|
2117
2112
|
# {
|
2118
2113
|
# "time" => Polars.date_range(
|
@@ -2123,7 +2118,7 @@ module Polars
|
|
2123
2118
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2124
2119
|
# }
|
2125
2120
|
# )
|
2126
|
-
# df.
|
2121
|
+
# df.group_by_dynamic(
|
2127
2122
|
# "time",
|
2128
2123
|
# every: "1h",
|
2129
2124
|
# closed: "both",
|
@@ -2146,20 +2141,20 @@ module Polars
|
|
2146
2141
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
2147
2142
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2148
2143
|
#
|
2149
|
-
# @example Dynamic
|
2144
|
+
# @example Dynamic group by on an index column.
|
2150
2145
|
# df = Polars::DataFrame.new(
|
2151
2146
|
# {
|
2152
2147
|
# "idx" => Polars.arange(0, 6, eager: true),
|
2153
2148
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
2154
2149
|
# }
|
2155
2150
|
# )
|
2156
|
-
# df.
|
2151
|
+
# df.group_by_dynamic(
|
2157
2152
|
# "idx",
|
2158
2153
|
# every: "2i",
|
2159
2154
|
# period: "3i",
|
2160
2155
|
# include_boundaries: true,
|
2161
2156
|
# closed: "right"
|
2162
|
-
# ).agg(Polars.col("A").
|
2157
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2163
2158
|
# # =>
|
2164
2159
|
# # shape: (3, 4)
|
2165
2160
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -2171,7 +2166,7 @@ module Polars
|
|
2171
2166
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2172
2167
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
2173
2168
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
2174
|
-
def
|
2169
|
+
def group_by_dynamic(
|
2175
2170
|
index_column,
|
2176
2171
|
every:,
|
2177
2172
|
period: nil,
|
@@ -2195,6 +2190,7 @@ module Polars
|
|
2195
2190
|
start_by
|
2196
2191
|
)
|
2197
2192
|
end
|
2193
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
2198
2194
|
|
2199
2195
|
# Upsample a DataFrame at a regular frequency.
|
2200
2196
|
#
|
@@ -2242,7 +2238,7 @@ module Polars
|
|
2242
2238
|
# "groups" => ["A", "B", "A", "B"],
|
2243
2239
|
# "values" => [0, 1, 2, 3]
|
2244
2240
|
# }
|
2245
|
-
# )
|
2241
|
+
# ).set_sorted("time")
|
2246
2242
|
# df.upsample(
|
2247
2243
|
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2248
2244
|
# ).select(Polars.all.forward_fill)
|
@@ -2360,7 +2356,7 @@ module Polars
|
|
2360
2356
|
# ], # note record date: Jan 1st (sorted!)
|
2361
2357
|
# "gdp" => [4164, 4411, 4566, 4696]
|
2362
2358
|
# }
|
2363
|
-
# )
|
2359
|
+
# ).set_sorted("date")
|
2364
2360
|
# population = Polars::DataFrame.new(
|
2365
2361
|
# {
|
2366
2362
|
# "date" => [
|
@@ -2371,7 +2367,7 @@ module Polars
|
|
2371
2367
|
# ], # note record date: May 12th (sorted!)
|
2372
2368
|
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2373
2369
|
# }
|
2374
|
-
# )
|
2370
|
+
# ).set_sorted("date")
|
2375
2371
|
# population.join_asof(
|
2376
2372
|
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2377
2373
|
# )
|
@@ -2674,7 +2670,7 @@ module Polars
|
|
2674
2670
|
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
2675
2671
|
# # └─────┴─────┴─────┴───────┘
|
2676
2672
|
def hstack(columns, in_place: false)
|
2677
|
-
if !columns.is_a?(Array)
|
2673
|
+
if !columns.is_a?(::Array)
|
2678
2674
|
columns = columns.get_columns
|
2679
2675
|
end
|
2680
2676
|
if in_place
|
@@ -2804,7 +2800,7 @@ module Polars
|
|
2804
2800
|
# # │ 3 ┆ 8.0 │
|
2805
2801
|
# # └─────┴─────┘
|
2806
2802
|
def drop(columns)
|
2807
|
-
if columns.is_a?(Array)
|
2803
|
+
if columns.is_a?(::Array)
|
2808
2804
|
df = clone
|
2809
2805
|
columns.each do |n|
|
2810
2806
|
df._df.drop_in_place(n)
|
@@ -3317,7 +3313,7 @@ module Polars
|
|
3317
3313
|
n_fill = n_cols * n_rows - height
|
3318
3314
|
|
3319
3315
|
if n_fill > 0
|
3320
|
-
if !fill_values.is_a?(Array)
|
3316
|
+
if !fill_values.is_a?(::Array)
|
3321
3317
|
fill_values = [fill_values] * df.width
|
3322
3318
|
end
|
3323
3319
|
|
@@ -3426,36 +3422,38 @@ module Polars
|
|
3426
3422
|
# # ╞═════╪═════╪═════╡
|
3427
3423
|
# # │ C ┆ 2 ┆ l │
|
3428
3424
|
# # └─────┴─────┴─────┘}
|
3429
|
-
def partition_by(groups, maintain_order: true, as_dict: false)
|
3425
|
+
def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
|
3430
3426
|
if groups.is_a?(String)
|
3431
3427
|
groups = [groups]
|
3432
|
-
elsif !groups.is_a?(Array)
|
3428
|
+
elsif !groups.is_a?(::Array)
|
3433
3429
|
groups = Array(groups)
|
3434
3430
|
end
|
3435
3431
|
|
3436
3432
|
if as_dict
|
3437
3433
|
out = {}
|
3438
3434
|
if groups.length == 1
|
3439
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3435
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3440
3436
|
df = _from_rbdf(df)
|
3441
3437
|
out[df[groups][0, 0]] = df
|
3442
3438
|
end
|
3443
3439
|
else
|
3444
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3440
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3445
3441
|
df = _from_rbdf(df)
|
3446
3442
|
out[df[groups].row(0)] = df
|
3447
3443
|
end
|
3448
3444
|
end
|
3449
3445
|
out
|
3450
3446
|
else
|
3451
|
-
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3447
|
+
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
3452
3448
|
end
|
3453
3449
|
end
|
3454
3450
|
|
3455
3451
|
# Shift values by the given period.
|
3456
3452
|
#
|
3457
|
-
# @param
|
3453
|
+
# @param n [Integer]
|
3458
3454
|
# Number of places to shift (may be negative).
|
3455
|
+
# @param fill_value [Object]
|
3456
|
+
# Fill the resulting null values with this value.
|
3459
3457
|
#
|
3460
3458
|
# @return [DataFrame]
|
3461
3459
|
#
|
@@ -3493,8 +3491,8 @@ module Polars
|
|
3493
3491
|
# # │ 3 ┆ 8 ┆ c │
|
3494
3492
|
# # │ null ┆ null ┆ null │
|
3495
3493
|
# # └──────┴──────┴──────┘
|
3496
|
-
def shift(
|
3497
|
-
|
3494
|
+
def shift(n, fill_value: nil)
|
3495
|
+
lazy.shift(n, fill_value: fill_value).collect(_eager: true)
|
3498
3496
|
end
|
3499
3497
|
|
3500
3498
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -3527,9 +3525,7 @@ module Polars
|
|
3527
3525
|
# # │ 2 ┆ 7 ┆ b │
|
3528
3526
|
# # └─────┴─────┴─────┘
|
3529
3527
|
def shift_and_fill(periods, fill_value)
|
3530
|
-
|
3531
|
-
.shift_and_fill(periods, fill_value)
|
3532
|
-
.collect(no_optimization: true, string_cache: false)
|
3528
|
+
shift(periods, fill_value: fill_value)
|
3533
3529
|
end
|
3534
3530
|
|
3535
3531
|
# Get a mask of all duplicated rows in this DataFrame.
|
@@ -3716,7 +3712,7 @@ module Polars
|
|
3716
3712
|
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3717
3713
|
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3718
3714
|
def with_columns(exprs)
|
3719
|
-
if !exprs.nil? && !exprs.is_a?(Array)
|
3715
|
+
if !exprs.nil? && !exprs.is_a?(::Array)
|
3720
3716
|
exprs = [exprs]
|
3721
3717
|
end
|
3722
3718
|
lazy
|
@@ -3780,7 +3776,7 @@ module Polars
|
|
3780
3776
|
if axis == 0
|
3781
3777
|
_from_rbdf(_df.max)
|
3782
3778
|
elsif axis == 1
|
3783
|
-
Utils.wrap_s(_df.
|
3779
|
+
Utils.wrap_s(_df.max_horizontal)
|
3784
3780
|
else
|
3785
3781
|
raise ArgumentError, "Axis should be 0 or 1."
|
3786
3782
|
end
|
@@ -3812,7 +3808,7 @@ module Polars
|
|
3812
3808
|
if axis == 0
|
3813
3809
|
_from_rbdf(_df.min)
|
3814
3810
|
elsif axis == 1
|
3815
|
-
Utils.wrap_s(_df.
|
3811
|
+
Utils.wrap_s(_df.min_horizontal)
|
3816
3812
|
else
|
3817
3813
|
raise ArgumentError, "Axis should be 0 or 1."
|
3818
3814
|
end
|
@@ -3861,7 +3857,7 @@ module Polars
|
|
3861
3857
|
when 0
|
3862
3858
|
_from_rbdf(_df.sum)
|
3863
3859
|
when 1
|
3864
|
-
Utils.wrap_s(_df.
|
3860
|
+
Utils.wrap_s(_df.sum_horizontal(null_strategy))
|
3865
3861
|
else
|
3866
3862
|
raise ArgumentError, "Axis should be 0 or 1."
|
3867
3863
|
end
|
@@ -3899,7 +3895,7 @@ module Polars
|
|
3899
3895
|
when 0
|
3900
3896
|
_from_rbdf(_df.mean)
|
3901
3897
|
when 1
|
3902
|
-
Utils.wrap_s(_df.
|
3898
|
+
Utils.wrap_s(_df.mean_horizontal(null_strategy))
|
3903
3899
|
else
|
3904
3900
|
raise ArgumentError, "Axis should be 0 or 1."
|
3905
3901
|
end
|
@@ -4097,11 +4093,11 @@ module Polars
|
|
4097
4093
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4098
4094
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4099
4095
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4100
|
-
def to_dummies(columns: nil, separator: "_")
|
4096
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
4101
4097
|
if columns.is_a?(String)
|
4102
4098
|
columns = [columns]
|
4103
4099
|
end
|
4104
|
-
_from_rbdf(_df.to_dummies(columns, separator))
|
4100
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
4105
4101
|
end
|
4106
4102
|
|
4107
4103
|
# Drop duplicate rows from this DataFrame.
|
@@ -4189,7 +4185,7 @@ module Polars
|
|
4189
4185
|
subset = [subset]
|
4190
4186
|
end
|
4191
4187
|
|
4192
|
-
if subset.is_a?(Array) && subset.length == 1
|
4188
|
+
if subset.is_a?(::Array) && subset.length == 1
|
4193
4189
|
expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
|
4194
4190
|
else
|
4195
4191
|
struct_fields = subset.nil? ? Polars.all : subset
|
@@ -4284,15 +4280,20 @@ module Polars
|
|
4284
4280
|
end
|
4285
4281
|
|
4286
4282
|
if n.nil? && !frac.nil?
|
4283
|
+
frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
|
4284
|
+
|
4287
4285
|
_from_rbdf(
|
4288
|
-
_df.sample_frac(frac, with_replacement, shuffle, seed)
|
4286
|
+
_df.sample_frac(frac._s, with_replacement, shuffle, seed)
|
4289
4287
|
)
|
4290
4288
|
end
|
4291
4289
|
|
4292
4290
|
if n.nil?
|
4293
4291
|
n = 1
|
4294
4292
|
end
|
4295
|
-
|
4293
|
+
|
4294
|
+
n = Series.new("", [n]) unless n.is_a?(Series)
|
4295
|
+
|
4296
|
+
_from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
|
4296
4297
|
end
|
4297
4298
|
|
4298
4299
|
# Apply a horizontal reduction on a DataFrame.
|
@@ -4591,7 +4592,7 @@ module Polars
|
|
4591
4592
|
#
|
4592
4593
|
# @example
|
4593
4594
|
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
|
4594
|
-
# s.
|
4595
|
+
# s.gather_every(2)
|
4595
4596
|
# # =>
|
4596
4597
|
# # shape: (2, 2)
|
4597
4598
|
# # ┌─────┬─────┐
|
@@ -4602,9 +4603,10 @@ module Polars
|
|
4602
4603
|
# # │ 1 ┆ 5 │
|
4603
4604
|
# # │ 3 ┆ 7 │
|
4604
4605
|
# # └─────┴─────┘
|
4605
|
-
def
|
4606
|
-
select(Utils.col("*").
|
4606
|
+
def gather_every(n)
|
4607
|
+
select(Utils.col("*").gather_every(n))
|
4607
4608
|
end
|
4609
|
+
alias_method :take_every, :gather_every
|
4608
4610
|
|
4609
4611
|
# Hash and combine the rows in this DataFrame.
|
4610
4612
|
#
|
@@ -4661,16 +4663,16 @@ module Polars
|
|
4661
4663
|
# df.interpolate
|
4662
4664
|
# # =>
|
4663
4665
|
# # shape: (4, 3)
|
4664
|
-
# #
|
4665
|
-
# # │ foo
|
4666
|
-
# # │ ---
|
4667
|
-
# # │
|
4668
|
-
# #
|
4669
|
-
# # │ 1
|
4670
|
-
# # │ 5
|
4671
|
-
# # │ 9
|
4672
|
-
# # │ 10
|
4673
|
-
# #
|
4666
|
+
# # ┌──────┬──────┬──────────┐
|
4667
|
+
# # │ foo ┆ bar ┆ baz │
|
4668
|
+
# # │ --- ┆ --- ┆ --- │
|
4669
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
4670
|
+
# # ╞══════╪══════╪══════════╡
|
4671
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
4672
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
4673
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
4674
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
4675
|
+
# # └──────┴──────┴──────────┘
|
4674
4676
|
def interpolate
|
4675
4677
|
select(Utils.col("*").interpolate)
|
4676
4678
|
end
|
@@ -4758,6 +4760,38 @@ module Polars
|
|
4758
4760
|
_from_rbdf(_df.unnest(names))
|
4759
4761
|
end
|
4760
4762
|
|
4763
|
+
# TODO
|
4764
|
+
# def corr
|
4765
|
+
# end
|
4766
|
+
|
4767
|
+
# TODO
|
4768
|
+
# def merge_sorted
|
4769
|
+
# end
|
4770
|
+
|
4771
|
+
# Indicate that one or multiple columns are sorted.
|
4772
|
+
#
|
4773
|
+
# @param column [Object]
|
4774
|
+
# Columns that are sorted
|
4775
|
+
# @param more_columns [Object]
|
4776
|
+
# Additional columns that are sorted, specified as positional arguments.
|
4777
|
+
# @param descending [Boolean]
|
4778
|
+
# Whether the columns are sorted in descending order.
|
4779
|
+
#
|
4780
|
+
# @return [DataFrame]
|
4781
|
+
def set_sorted(
|
4782
|
+
column,
|
4783
|
+
*more_columns,
|
4784
|
+
descending: false
|
4785
|
+
)
|
4786
|
+
lazy
|
4787
|
+
.set_sorted(column, *more_columns, descending: descending)
|
4788
|
+
.collect(no_optimization: true)
|
4789
|
+
end
|
4790
|
+
|
4791
|
+
# TODO
|
4792
|
+
# def update
|
4793
|
+
# end
|
4794
|
+
|
4761
4795
|
private
|
4762
4796
|
|
4763
4797
|
def initialize_copy(other)
|
@@ -4910,8 +4944,8 @@ module Polars
|
|
4910
4944
|
[lookup[col[0]] || col[0], col[1]]
|
4911
4945
|
end
|
4912
4946
|
|
4913
|
-
if schema_overrides
|
4914
|
-
|
4947
|
+
if schema_overrides && schema_overrides.any?
|
4948
|
+
column_dtypes.merge!(schema_overrides)
|
4915
4949
|
end
|
4916
4950
|
|
4917
4951
|
column_dtypes.each do |col, dtype|
|
@@ -4967,7 +5001,7 @@ module Polars
|
|
4967
5001
|
columns.each do |col, i|
|
4968
5002
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4969
5003
|
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4970
|
-
elsif structs
|
5004
|
+
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4971
5005
|
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4972
5006
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4973
5007
|
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
@@ -5012,15 +5046,56 @@ module Polars
|
|
5012
5046
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5013
5047
|
end
|
5014
5048
|
return rbdf
|
5015
|
-
elsif data[0].is_a?(Array)
|
5049
|
+
elsif data[0].is_a?(::Array)
|
5016
5050
|
if orient.nil? && !columns.nil?
|
5017
|
-
|
5051
|
+
first_element = data[0]
|
5052
|
+
row_types = first_element.filter_map { |value| value.class }.uniq
|
5053
|
+
if row_types.include?(Integer) && row_types.include?(Float)
|
5054
|
+
row_types.delete(Integer)
|
5055
|
+
end
|
5056
|
+
orient = row_types.length == 1 ? "col" : "row"
|
5018
5057
|
end
|
5019
5058
|
|
5020
5059
|
if orient == "row"
|
5021
|
-
|
5060
|
+
column_names, schema_overrides = _unpack_schema(
|
5061
|
+
schema, schema_overrides: schema_overrides, n_expected: first_element.length
|
5062
|
+
)
|
5063
|
+
local_schema_override = (
|
5064
|
+
schema_overrides.any? ? (raise Todo) : {}
|
5065
|
+
)
|
5066
|
+
if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
|
5067
|
+
raise ArgumentError, "the row data does not match the number of columns"
|
5068
|
+
end
|
5069
|
+
|
5070
|
+
unpack_nested = false
|
5071
|
+
local_schema_override.each do |col, tp|
|
5072
|
+
raise Todo
|
5073
|
+
end
|
5074
|
+
|
5075
|
+
if unpack_nested
|
5076
|
+
raise Todo
|
5077
|
+
else
|
5078
|
+
rbdf = RbDataFrame.read_rows(
|
5079
|
+
data,
|
5080
|
+
infer_schema_length,
|
5081
|
+
local_schema_override.any? ? local_schema_override : nil
|
5082
|
+
)
|
5083
|
+
end
|
5084
|
+
if column_names.any? || schema_overrides.any?
|
5085
|
+
rbdf = _post_apply_columns(
|
5086
|
+
rbdf, column_names, schema_overrides: schema_overrides
|
5087
|
+
)
|
5088
|
+
end
|
5089
|
+
return rbdf
|
5022
5090
|
elsif orient == "col" || orient.nil?
|
5023
|
-
|
5091
|
+
column_names, schema_overrides = _unpack_schema(
|
5092
|
+
schema, schema_overrides: schema_overrides, n_expected: data.length
|
5093
|
+
)
|
5094
|
+
data_series =
|
5095
|
+
data.map.with_index do |element, i|
|
5096
|
+
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
|
5097
|
+
end
|
5098
|
+
return RbDataFrame.new(data_series)
|
5024
5099
|
else
|
5025
5100
|
raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
|
5026
5101
|
end
|
@@ -5066,10 +5141,10 @@ module Polars
|
|
5066
5141
|
|
5067
5142
|
def _compare_to_other_df(other, op)
|
5068
5143
|
if columns != other.columns
|
5069
|
-
raise
|
5144
|
+
raise ArgumentError, "DataFrame columns do not match"
|
5070
5145
|
end
|
5071
5146
|
if shape != other.shape
|
5072
|
-
raise
|
5147
|
+
raise ArgumentError, "DataFrame dimensions do not match"
|
5073
5148
|
end
|
5074
5149
|
|
5075
5150
|
suffix = "__POLARS_CMP_OTHER"
|
@@ -5117,7 +5192,7 @@ module Polars
|
|
5117
5192
|
|
5118
5193
|
def _prepare_other_arg(other)
|
5119
5194
|
if !other.is_a?(Series)
|
5120
|
-
if other.is_a?(Array)
|
5195
|
+
if other.is_a?(::Array)
|
5121
5196
|
raise ArgumentError, "Operation not supported."
|
5122
5197
|
end
|
5123
5198
|
|