polars-df 0.5.0-x86_64-linux → 0.6.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/Cargo.lock +337 -381
- data/LICENSE-THIRD-PARTY.txt +1161 -832
- data/README.md +4 -3
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/data_frame.rb +91 -49
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_name_space.rb +17 -3
- data/lib/polars/expr.rb +76 -69
- data/lib/polars/functions.rb +0 -1
- data/lib/polars/group_by.rb +1 -22
- data/lib/polars/lazy_frame.rb +82 -30
- data/lib/polars/lazy_functions.rb +67 -31
- data/lib/polars/list_expr.rb +28 -28
- data/lib/polars/list_name_space.rb +13 -13
- data/lib/polars/rolling_group_by.rb +4 -2
- data/lib/polars/series.rb +70 -16
- data/lib/polars/string_expr.rb +137 -11
- data/lib/polars/string_name_space.rb +137 -22
- data/lib/polars/utils.rb +107 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +4 -2
data/README.md
CHANGED
@@ -25,7 +25,7 @@ Polars.read_csv("iris.csv")
|
|
25
25
|
.collect
|
26
26
|
```
|
27
27
|
|
28
|
-
You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/
|
28
|
+
You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
29
29
|
|
30
30
|
## Reference
|
31
31
|
|
@@ -348,7 +348,7 @@ df.to_numo
|
|
348
348
|
You can specify column types when creating a data frame
|
349
349
|
|
350
350
|
```ruby
|
351
|
-
Polars::DataFrame.new(data,
|
351
|
+
Polars::DataFrame.new(data, schema: {"a" => Polars::Int32, "b" => Polars::Float32})
|
352
352
|
```
|
353
353
|
|
354
354
|
Supported types are:
|
@@ -357,8 +357,9 @@ Supported types are:
|
|
357
357
|
- float - `Float64`, `Float32`
|
358
358
|
- integer - `Int64`, `Int32`, `Int16`, `Int8`
|
359
359
|
- unsigned integer - `UInt64`, `UInt32`, `UInt16`, `UInt8`
|
360
|
-
- string - `Utf8`, `Categorical`
|
360
|
+
- string - `Utf8`, `Binary`, `Categorical`
|
361
361
|
- temporal - `Date`, `Datetime`, `Time`, `Duration`
|
362
|
+
- other - `Object`, `List`, `Struct`, `Array` [unreleased]
|
362
363
|
|
363
364
|
Get column types
|
364
365
|
|
data/lib/polars/3.0/polars.so
CHANGED
Binary file
|
data/lib/polars/3.1/polars.so
CHANGED
Binary file
|
data/lib/polars/3.2/polars.so
CHANGED
Binary file
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Polars
|
2
|
+
# Namespace for array related expressions.
|
3
|
+
class ArrayExpr
|
4
|
+
# @private
|
5
|
+
attr_accessor :_rbexpr
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(expr)
|
9
|
+
self._rbexpr = expr._rbexpr
|
10
|
+
end
|
11
|
+
|
12
|
+
# Compute the min values of the sub-arrays.
|
13
|
+
#
|
14
|
+
# @return [Expr]
|
15
|
+
#
|
16
|
+
# @example
|
17
|
+
# df = Polars::DataFrame.new(
|
18
|
+
# {"a" => [[1, 2], [4, 3]]},
|
19
|
+
# schema: {"a" => Polars::Array.new(2, Polars::Int64)}
|
20
|
+
# )
|
21
|
+
# df.select(Polars.col("a").arr.min)
|
22
|
+
# # =>
|
23
|
+
# # shape: (2, 1)
|
24
|
+
# # ┌─────┐
|
25
|
+
# # │ a │
|
26
|
+
# # │ --- │
|
27
|
+
# # │ i64 │
|
28
|
+
# # ╞═════╡
|
29
|
+
# # │ 1 │
|
30
|
+
# # │ 3 │
|
31
|
+
# # └─────┘
|
32
|
+
def min
|
33
|
+
Utils.wrap_expr(_rbexpr.array_min)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Compute the max values of the sub-arrays.
|
37
|
+
#
|
38
|
+
# @return [Expr]
|
39
|
+
#
|
40
|
+
# @example
|
41
|
+
# df = Polars::DataFrame.new(
|
42
|
+
# {"a" => [[1, 2], [4, 3]]},
|
43
|
+
# schema: {"a" => Polars::Array.new(2, Polars::Int64)}
|
44
|
+
# )
|
45
|
+
# df.select(Polars.col("a").arr.max)
|
46
|
+
# # =>
|
47
|
+
# # shape: (2, 1)
|
48
|
+
# # ┌─────┐
|
49
|
+
# # │ a │
|
50
|
+
# # │ --- │
|
51
|
+
# # │ i64 │
|
52
|
+
# # ╞═════╡
|
53
|
+
# # │ 2 │
|
54
|
+
# # │ 4 │
|
55
|
+
# # └─────┘
|
56
|
+
def max
|
57
|
+
Utils.wrap_expr(_rbexpr.array_max)
|
58
|
+
end
|
59
|
+
|
60
|
+
# Compute the sum of the values in each sub-array.
|
61
|
+
#
|
62
|
+
# @return [Expr]
|
63
|
+
#
|
64
|
+
# @example
|
65
|
+
# df = Polars::DataFrame.new(
|
66
|
+
# {"a" => [[1, 2], [4, 3]]},
|
67
|
+
# schema: {"a" => Polars::Array.new(2, Polars::Int64)}
|
68
|
+
# )
|
69
|
+
# df.select(Polars.col("a").arr.sum)
|
70
|
+
# # =>
|
71
|
+
# # shape: (2, 1)
|
72
|
+
# # ┌─────┐
|
73
|
+
# # │ a │
|
74
|
+
# # │ --- │
|
75
|
+
# # │ i64 │
|
76
|
+
# # ╞═════╡
|
77
|
+
# # │ 3 │
|
78
|
+
# # │ 7 │
|
79
|
+
# # └─────┘
|
80
|
+
def sum
|
81
|
+
Utils.wrap_expr(_rbexpr.array_sum)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Polars
|
2
|
+
# Series.arr namespace.
|
3
|
+
class ArrayNameSpace
|
4
|
+
include ExprDispatch
|
5
|
+
|
6
|
+
self._accessor = "arr"
|
7
|
+
|
8
|
+
# @private
|
9
|
+
def initialize(series)
|
10
|
+
self._s = series._s
|
11
|
+
end
|
12
|
+
|
13
|
+
# Compute the min values of the sub-arrays.
|
14
|
+
#
|
15
|
+
# @return [Series]
|
16
|
+
#
|
17
|
+
# @example
|
18
|
+
# s = Polars::Series.new(
|
19
|
+
# "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
|
20
|
+
# )
|
21
|
+
# s.arr.min
|
22
|
+
# # =>
|
23
|
+
# # shape: (2,)
|
24
|
+
# # Series: 'a' [i64]
|
25
|
+
# # [
|
26
|
+
# # 1
|
27
|
+
# # 3
|
28
|
+
# # ]
|
29
|
+
def min
|
30
|
+
super
|
31
|
+
end
|
32
|
+
|
33
|
+
# Compute the max values of the sub-arrays.
|
34
|
+
#
|
35
|
+
# @return [Series]
|
36
|
+
#
|
37
|
+
# @example
|
38
|
+
# s = Polars::Series.new(
|
39
|
+
# "a", [[1, 2], [4, 3]], dtype: Polars::Array.new(2, Polars::Int64)
|
40
|
+
# )
|
41
|
+
# s.arr.max
|
42
|
+
# # =>
|
43
|
+
# # shape: (2,)
|
44
|
+
# # Series: 'a' [i64]
|
45
|
+
# # [
|
46
|
+
# # 2
|
47
|
+
# # 4
|
48
|
+
# # ]
|
49
|
+
def max
|
50
|
+
super
|
51
|
+
end
|
52
|
+
|
53
|
+
# Compute the sum of the values in each sub-array.
|
54
|
+
#
|
55
|
+
# @return [Series]
|
56
|
+
#
|
57
|
+
# @example
|
58
|
+
# df = Polars::DataFrame.new(
|
59
|
+
# {"a" => [[1, 2], [4, 3]]},
|
60
|
+
# schema: {"a" => Polars::Array.new(2, Polars::Int64)}
|
61
|
+
# )
|
62
|
+
# df.select(Polars.col("a").arr.sum)
|
63
|
+
# # =>
|
64
|
+
# # shape: (2, 1)
|
65
|
+
# # ┌─────┐
|
66
|
+
# # │ a │
|
67
|
+
# # │ --- │
|
68
|
+
# # │ i64 │
|
69
|
+
# # ╞═════╡
|
70
|
+
# # │ 3 │
|
71
|
+
# # │ 7 │
|
72
|
+
# # └─────┘
|
73
|
+
def sum
|
74
|
+
super
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
data/lib/polars/data_frame.rb
CHANGED
@@ -36,7 +36,7 @@ module Polars
|
|
36
36
|
elsif data.is_a?(Hash)
|
37
37
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
38
38
|
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
39
|
-
elsif data.is_a?(Array)
|
39
|
+
elsif data.is_a?(::Array)
|
40
40
|
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
41
41
|
elsif data.is_a?(Series)
|
42
42
|
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
@@ -116,7 +116,7 @@ module Polars
|
|
116
116
|
dtypes.each do|k, v|
|
117
117
|
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
118
118
|
end
|
119
|
-
elsif dtypes.is_a?(Array)
|
119
|
+
elsif dtypes.is_a?(::Array)
|
120
120
|
dtype_slice = dtypes
|
121
121
|
else
|
122
122
|
raise ArgumentError, "dtype arg should be list or dict"
|
@@ -590,7 +590,7 @@ module Polars
|
|
590
590
|
|
591
591
|
# df[2, ..] (select row as df)
|
592
592
|
if row_selection.is_a?(Integer)
|
593
|
-
if col_selection.is_a?(Array)
|
593
|
+
if col_selection.is_a?(::Array)
|
594
594
|
df = self[0.., col_selection]
|
595
595
|
return df.slice(row_selection, 1)
|
596
596
|
end
|
@@ -611,7 +611,7 @@ module Polars
|
|
611
611
|
return series[row_selection]
|
612
612
|
end
|
613
613
|
|
614
|
-
if col_selection.is_a?(Array)
|
614
|
+
if col_selection.is_a?(::Array)
|
615
615
|
# df[.., [1, 2]]
|
616
616
|
if Utils.is_int_sequence(col_selection)
|
617
617
|
series_list = col_selection.map { |i| to_series(i) }
|
@@ -641,7 +641,7 @@ module Polars
|
|
641
641
|
return Slice.new(self).apply(item)
|
642
642
|
end
|
643
643
|
|
644
|
-
if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
|
644
|
+
if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
|
645
645
|
# select multiple columns
|
646
646
|
# df[["foo", "bar"]]
|
647
647
|
return _from_rbdf(_df.select(item.map(&:to_s)))
|
@@ -684,13 +684,13 @@ module Polars
|
|
684
684
|
end
|
685
685
|
|
686
686
|
if Utils.strlike?(key)
|
687
|
-
if value.is_a?(Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
687
|
+
if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
688
688
|
value = Series.new(value)
|
689
689
|
elsif !value.is_a?(Series)
|
690
690
|
value = Polars.lit(value)
|
691
691
|
end
|
692
692
|
self._df = with_column(value.alias(key.to_s))._df
|
693
|
-
elsif key.is_a?(Array)
|
693
|
+
elsif key.is_a?(::Array)
|
694
694
|
row_selection, col_selection = key
|
695
695
|
|
696
696
|
if Utils.strlike?(col_selection)
|
@@ -994,14 +994,21 @@ module Polars
|
|
994
994
|
#
|
995
995
|
# @return [nil]
|
996
996
|
def write_ipc(file, compression: "uncompressed")
|
997
|
-
|
998
|
-
|
997
|
+
return_bytes = file.nil?
|
998
|
+
if return_bytes
|
999
|
+
file = StringIO.new
|
1000
|
+
file.set_encoding(Encoding::BINARY)
|
999
1001
|
end
|
1000
1002
|
if Utils.pathlike?(file)
|
1001
1003
|
file = Utils.normalise_filepath(file)
|
1002
1004
|
end
|
1003
1005
|
|
1006
|
+
if compression.nil?
|
1007
|
+
compression = "uncompressed"
|
1008
|
+
end
|
1009
|
+
|
1004
1010
|
_df.write_ipc(file, compression)
|
1011
|
+
return_bytes ? file.string : nil
|
1005
1012
|
end
|
1006
1013
|
|
1007
1014
|
# Write to Apache Parquet file.
|
@@ -1491,13 +1498,9 @@ module Polars
|
|
1491
1498
|
# # │ 1 ┆ 6.0 ┆ a │
|
1492
1499
|
# # └─────┴─────┴─────┘
|
1493
1500
|
def sort(by, reverse: false, nulls_last: false)
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
.collect(no_optimization: true, string_cache: false)
|
1498
|
-
else
|
1499
|
-
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
1500
|
-
end
|
1501
|
+
lazy
|
1502
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1503
|
+
.collect(no_optimization: true)
|
1501
1504
|
end
|
1502
1505
|
|
1503
1506
|
# Sort the DataFrame by column in-place.
|
@@ -1899,6 +1902,12 @@ module Polars
|
|
1899
1902
|
# Define whether the temporal window interval is closed or not.
|
1900
1903
|
# @param by [Object]
|
1901
1904
|
# Also group by this column/these columns.
|
1905
|
+
# @param check_sorted [Boolean]
|
1906
|
+
# When the `by` argument is given, polars cannot check sortedness
|
1907
|
+
# by the metadata and has to do a full scan on the index column to
|
1908
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1909
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1910
|
+
# Doing so incorrectly will lead to incorrect output.
|
1902
1911
|
#
|
1903
1912
|
# @return [RollingGroupBy]
|
1904
1913
|
#
|
@@ -1912,7 +1921,7 @@ module Polars
|
|
1912
1921
|
# "2020-01-08 23:16:43"
|
1913
1922
|
# ]
|
1914
1923
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1915
|
-
# Polars.col("dt").str.strptime(Polars::Datetime)
|
1924
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1916
1925
|
# )
|
1917
1926
|
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1918
1927
|
# [
|
@@ -1940,9 +1949,10 @@ module Polars
|
|
1940
1949
|
period:,
|
1941
1950
|
offset: nil,
|
1942
1951
|
closed: "right",
|
1943
|
-
by: nil
|
1952
|
+
by: nil,
|
1953
|
+
check_sorted: true
|
1944
1954
|
)
|
1945
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1955
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
|
1946
1956
|
end
|
1947
1957
|
|
1948
1958
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
@@ -2078,21 +2088,21 @@ module Polars
|
|
2078
2088
|
# df.groupby_dynamic("time", every: "1h", closed: "left").agg(
|
2079
2089
|
# [
|
2080
2090
|
# Polars.col("time").count.alias("time_count"),
|
2081
|
-
# Polars.col("time").list.alias("time_agg_list")
|
2091
|
+
# Polars.col("time").alias("time_agg_list")
|
2082
2092
|
# ]
|
2083
2093
|
# )
|
2084
2094
|
# # =>
|
2085
2095
|
# # shape: (4, 3)
|
2086
|
-
# #
|
2087
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2088
|
-
# # │ --- ┆ --- ┆ ---
|
2089
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2090
|
-
# #
|
2091
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
2092
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
2093
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
2094
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2095
|
-
# #
|
2096
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
2097
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
2098
|
+
# # │ --- ┆ --- ┆ --- │
|
2099
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
2100
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
2101
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
2102
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
2103
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
2104
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
2105
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
2096
2106
|
#
|
2097
2107
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2098
2108
|
# df.groupby_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2159,7 +2169,7 @@ module Polars
|
|
2159
2169
|
# period: "3i",
|
2160
2170
|
# include_boundaries: true,
|
2161
2171
|
# closed: "right"
|
2162
|
-
# ).agg(Polars.col("A").list.alias("A_agg_list"))
|
2172
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2163
2173
|
# # =>
|
2164
2174
|
# # shape: (3, 4)
|
2165
2175
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -2242,7 +2252,7 @@ module Polars
|
|
2242
2252
|
# "groups" => ["A", "B", "A", "B"],
|
2243
2253
|
# "values" => [0, 1, 2, 3]
|
2244
2254
|
# }
|
2245
|
-
# )
|
2255
|
+
# ).set_sorted("time")
|
2246
2256
|
# df.upsample(
|
2247
2257
|
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2248
2258
|
# ).select(Polars.all.forward_fill)
|
@@ -2360,7 +2370,7 @@ module Polars
|
|
2360
2370
|
# ], # note record date: Jan 1st (sorted!)
|
2361
2371
|
# "gdp" => [4164, 4411, 4566, 4696]
|
2362
2372
|
# }
|
2363
|
-
# )
|
2373
|
+
# ).set_sorted("date")
|
2364
2374
|
# population = Polars::DataFrame.new(
|
2365
2375
|
# {
|
2366
2376
|
# "date" => [
|
@@ -2371,7 +2381,7 @@ module Polars
|
|
2371
2381
|
# ], # note record date: May 12th (sorted!)
|
2372
2382
|
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2373
2383
|
# }
|
2374
|
-
# )
|
2384
|
+
# ).set_sorted("date")
|
2375
2385
|
# population.join_asof(
|
2376
2386
|
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2377
2387
|
# )
|
@@ -2674,7 +2684,7 @@ module Polars
|
|
2674
2684
|
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
2675
2685
|
# # └─────┴─────┴─────┴───────┘
|
2676
2686
|
def hstack(columns, in_place: false)
|
2677
|
-
if !columns.is_a?(Array)
|
2687
|
+
if !columns.is_a?(::Array)
|
2678
2688
|
columns = columns.get_columns
|
2679
2689
|
end
|
2680
2690
|
if in_place
|
@@ -2804,7 +2814,7 @@ module Polars
|
|
2804
2814
|
# # │ 3 ┆ 8.0 │
|
2805
2815
|
# # └─────┴─────┘
|
2806
2816
|
def drop(columns)
|
2807
|
-
if columns.is_a?(Array)
|
2817
|
+
if columns.is_a?(::Array)
|
2808
2818
|
df = clone
|
2809
2819
|
columns.each do |n|
|
2810
2820
|
df._df.drop_in_place(n)
|
@@ -3317,7 +3327,7 @@ module Polars
|
|
3317
3327
|
n_fill = n_cols * n_rows - height
|
3318
3328
|
|
3319
3329
|
if n_fill > 0
|
3320
|
-
if !fill_values.is_a?(Array)
|
3330
|
+
if !fill_values.is_a?(::Array)
|
3321
3331
|
fill_values = [fill_values] * df.width
|
3322
3332
|
end
|
3323
3333
|
|
@@ -3426,29 +3436,29 @@ module Polars
|
|
3426
3436
|
# # ╞═════╪═════╪═════╡
|
3427
3437
|
# # │ C ┆ 2 ┆ l │
|
3428
3438
|
# # └─────┴─────┴─────┘}
|
3429
|
-
def partition_by(groups, maintain_order: true, as_dict: false)
|
3439
|
+
def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
|
3430
3440
|
if groups.is_a?(String)
|
3431
3441
|
groups = [groups]
|
3432
|
-
elsif !groups.is_a?(Array)
|
3442
|
+
elsif !groups.is_a?(::Array)
|
3433
3443
|
groups = Array(groups)
|
3434
3444
|
end
|
3435
3445
|
|
3436
3446
|
if as_dict
|
3437
3447
|
out = {}
|
3438
3448
|
if groups.length == 1
|
3439
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3449
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3440
3450
|
df = _from_rbdf(df)
|
3441
3451
|
out[df[groups][0, 0]] = df
|
3442
3452
|
end
|
3443
3453
|
else
|
3444
|
-
_df.partition_by(groups, maintain_order).each do |df|
|
3454
|
+
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
3445
3455
|
df = _from_rbdf(df)
|
3446
3456
|
out[df[groups].row(0)] = df
|
3447
3457
|
end
|
3448
3458
|
end
|
3449
3459
|
out
|
3450
3460
|
else
|
3451
|
-
_df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
|
3461
|
+
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
3452
3462
|
end
|
3453
3463
|
end
|
3454
3464
|
|
@@ -3716,7 +3726,7 @@ module Polars
|
|
3716
3726
|
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3717
3727
|
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3718
3728
|
def with_columns(exprs)
|
3719
|
-
if !exprs.nil? && !exprs.is_a?(Array)
|
3729
|
+
if !exprs.nil? && !exprs.is_a?(::Array)
|
3720
3730
|
exprs = [exprs]
|
3721
3731
|
end
|
3722
3732
|
lazy
|
@@ -4097,11 +4107,11 @@ module Polars
|
|
4097
4107
|
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4098
4108
|
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4099
4109
|
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4100
|
-
def to_dummies(columns: nil, separator: "_")
|
4110
|
+
def to_dummies(columns: nil, separator: "_", drop_first: false)
|
4101
4111
|
if columns.is_a?(String)
|
4102
4112
|
columns = [columns]
|
4103
4113
|
end
|
4104
|
-
_from_rbdf(_df.to_dummies(columns, separator))
|
4114
|
+
_from_rbdf(_df.to_dummies(columns, separator, drop_first))
|
4105
4115
|
end
|
4106
4116
|
|
4107
4117
|
# Drop duplicate rows from this DataFrame.
|
@@ -4189,7 +4199,7 @@ module Polars
|
|
4189
4199
|
subset = [subset]
|
4190
4200
|
end
|
4191
4201
|
|
4192
|
-
if subset.is_a?(Array) && subset.length == 1
|
4202
|
+
if subset.is_a?(::Array) && subset.length == 1
|
4193
4203
|
expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
|
4194
4204
|
else
|
4195
4205
|
struct_fields = subset.nil? ? Polars.all : subset
|
@@ -4758,6 +4768,38 @@ module Polars
|
|
4758
4768
|
_from_rbdf(_df.unnest(names))
|
4759
4769
|
end
|
4760
4770
|
|
4771
|
+
# TODO
|
4772
|
+
# def corr
|
4773
|
+
# end
|
4774
|
+
|
4775
|
+
# TODO
|
4776
|
+
# def merge_sorted
|
4777
|
+
# end
|
4778
|
+
|
4779
|
+
# Indicate that one or multiple columns are sorted.
|
4780
|
+
#
|
4781
|
+
# @param column [Object]
|
4782
|
+
# Columns that are sorted.
|
4783
|
+
# @param more_columns [Object]
|
4784
|
+
# Additional columns that are sorted, specified as positional arguments.
|
4785
|
+
# @param descending [Boolean]
|
4786
|
+
# Whether the columns are sorted in descending order.
|
4787
|
+
#
|
4788
|
+
# @return [DataFrame]
|
4789
|
+
def set_sorted(
|
4790
|
+
column,
|
4791
|
+
*more_columns,
|
4792
|
+
descending: false
|
4793
|
+
)
|
4794
|
+
lazy
|
4795
|
+
.set_sorted(column, *more_columns, descending: descending)
|
4796
|
+
.collect(no_optimization: true)
|
4797
|
+
end
|
4798
|
+
|
4799
|
+
# TODO
|
4800
|
+
# def update
|
4801
|
+
# end
|
4802
|
+
|
4761
4803
|
private
|
4762
4804
|
|
4763
4805
|
def initialize_copy(other)
|
@@ -4967,7 +5009,7 @@ module Polars
|
|
4967
5009
|
columns.each do |col, i|
|
4968
5010
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4969
5011
|
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4970
|
-
elsif structs
|
5012
|
+
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4971
5013
|
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4972
5014
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4973
5015
|
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
@@ -5012,7 +5054,7 @@ module Polars
|
|
5012
5054
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5013
5055
|
end
|
5014
5056
|
return rbdf
|
5015
|
-
elsif data[0].is_a?(Array)
|
5057
|
+
elsif data[0].is_a?(::Array)
|
5016
5058
|
if orient.nil? && !columns.nil?
|
5017
5059
|
orient = columns.length == data.length ? "col" : "row"
|
5018
5060
|
end
|
@@ -5117,7 +5159,7 @@ module Polars
|
|
5117
5159
|
|
5118
5160
|
def _prepare_other_arg(other)
|
5119
5161
|
if !other.is_a?(Series)
|
5120
|
-
if other.is_a?(Array)
|
5162
|
+
if other.is_a?(::Array)
|
5121
5163
|
raise ArgumentError, "Operation not supported."
|
5122
5164
|
end
|
5123
5165
|
|