polars-df 0.7.0-x86_64-darwin → 0.9.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/Cargo.lock +353 -237
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +4014 -3495
- data/LICENSE.txt +1 -1
- data/README.md +2 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +449 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +248 -108
- data/lib/polars/data_types.rb +195 -29
- data/lib/polars/date_time_expr.rb +41 -24
- data/lib/polars/date_time_name_space.rb +12 -12
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +1080 -195
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +27 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +3 -3
- data/lib/polars/io.rb +21 -28
- data/lib/polars/lazy_frame.rb +390 -76
- data/lib/polars/list_expr.rb +152 -6
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +557 -59
- data/lib/polars/sql_context.rb +1 -1
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +412 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_expr.rb +1 -1
- data/lib/polars/struct_name_space.rb +1 -1
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +64 -20
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -2
- metadata +36 -7
- data/lib/polars/lazy_functions.rb +0 -1197
@@ -0,0 +1,248 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Create polars `Duration` from distinct time components.
|
4
|
+
#
|
5
|
+
# @return [Expr]
|
6
|
+
#
|
7
|
+
# @example
|
8
|
+
# df = Polars::DataFrame.new(
|
9
|
+
# {
|
10
|
+
# "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
|
11
|
+
# "add" => [1, 2]
|
12
|
+
# }
|
13
|
+
# )
|
14
|
+
# df.select(
|
15
|
+
# [
|
16
|
+
# (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
|
17
|
+
# (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
|
18
|
+
# (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
|
19
|
+
# (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
|
20
|
+
# "add_milliseconds"
|
21
|
+
# ),
|
22
|
+
# (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
|
23
|
+
# ]
|
24
|
+
# )
|
25
|
+
# # =>
|
26
|
+
# # shape: (2, 5)
|
27
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
|
28
|
+
# # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
|
29
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
30
|
+
# # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
|
31
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
|
32
|
+
# # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
|
33
|
+
# # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
|
34
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
|
35
|
+
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  # A component that was not supplied stays nil; every other component is
  # converted with `str_to_lit: false` and unwrapped to its `_rbexpr`.
  to_rbexpr = lambda do |component|
    Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr unless component.nil?
  end

  # Plr.duration receives the components positionally in this exact order,
  # followed by the time unit.
  components = [
    weeks,
    days,
    hours,
    minutes,
    seconds,
    milliseconds,
    microseconds,
    nanoseconds
  ].map { |component| to_rbexpr.call(component) }

  Utils.wrap_expr(Plr.duration(*components, time_unit))
end
|
85
|
+
|
86
|
+
# Concat the arrays in a Series dtype List in linear time.
|
87
|
+
#
|
88
|
+
# @return [Expr]
|
89
|
+
def concat_list(exprs)
  # Convert the selection with Utils.selection_to_rbexpr_list, pass the list
  # to Plr.concat_list, and wrap what comes back.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  concatenated = Plr.concat_list(rbexprs)
  Utils.wrap_expr(concatenated)
end
|
93
|
+
|
94
|
+
# Collect several columns into a Series of dtype Struct.
|
95
|
+
#
|
96
|
+
# @param exprs [Object]
|
97
|
+
# Columns/Expressions to collect into a Struct
|
98
|
+
# @param eager [Boolean]
|
99
|
+
# Evaluate immediately
|
100
|
+
#
|
101
|
+
# @return [Object]
|
102
|
+
#
|
103
|
+
# @example
|
104
|
+
# Polars::DataFrame.new(
|
105
|
+
# {
|
106
|
+
# "int" => [1, 2],
|
107
|
+
# "str" => ["a", "b"],
|
108
|
+
# "bool" => [true, nil],
|
109
|
+
# "list" => [[1, 2], [3]],
|
110
|
+
# }
|
111
|
+
# ).select([Polars.struct(Polars.all).alias("my_struct")])
|
112
|
+
# # =>
|
113
|
+
# # shape: (2, 1)
|
114
|
+
# # ┌─────────────────────┐
|
115
|
+
# # │ my_struct │
|
116
|
+
# # │ --- │
|
117
|
+
# # │ struct[4] │
|
118
|
+
# # ╞═════════════════════╡
|
119
|
+
# # │ {1,"a",true,[1, 2]} │
|
120
|
+
# # │ {2,"b",null,[3]} │
|
121
|
+
# # └─────────────────────┘
|
122
|
+
#
|
123
|
+
# @example Only collect specific columns as a struct:
|
124
|
+
# df = Polars::DataFrame.new(
|
125
|
+
# {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
|
126
|
+
# )
|
127
|
+
# df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
|
128
|
+
# # =>
|
129
|
+
# # shape: (4, 4)
|
130
|
+
# # ┌─────┬───────┬─────┬─────────────┐
|
131
|
+
# # │ a ┆ b ┆ c ┆ a_and_b │
|
132
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
133
|
+
# # │ i64 ┆ str ┆ i64 ┆ struct[2] │
|
134
|
+
# # ╞═════╪═══════╪═════╪═════════════╡
|
135
|
+
# # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
|
136
|
+
# # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
|
137
|
+
# # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
|
138
|
+
# # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
|
139
|
+
# # └─────┴───────┴─────┴─────────────┘
|
140
|
+
def struct(exprs, eager: false)
  # Both modes start from the same struct expression.
  struct_expr = Utils.wrap_expr(Plr.as_struct(Utils.selection_to_rbexpr_list(exprs)))

  # With `eager: true`, evaluate the expression immediately and return the
  # resulting Series. The eager branch used to lack a `return`, so the Series
  # was computed and thrown away and an Expr came back regardless of `eager`.
  return Polars.select(struct_expr).to_series if eager

  struct_expr
end
|
147
|
+
|
148
|
+
# Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
|
149
|
+
#
|
150
|
+
# @param exprs [Object]
|
151
|
+
# Columns to concat into a Utf8 Series.
|
152
|
+
# @param sep [String]
|
153
|
+
# String value that will be used to separate the values.
|
154
|
+
# @param ignore_nulls [Boolean]
|
155
|
+
# Ignore null values (default is `false`).
|
156
|
+
#
|
157
|
+
# @return [Expr]
|
158
|
+
#
|
159
|
+
# @example
|
160
|
+
# df = Polars::DataFrame.new(
|
161
|
+
# {
|
162
|
+
# "a" => [1, 2, 3],
|
163
|
+
# "b" => ["dogs", "cats", nil],
|
164
|
+
# "c" => ["play", "swim", "walk"]
|
165
|
+
# }
|
166
|
+
# )
|
167
|
+
# df.with_columns(
|
168
|
+
# [
|
169
|
+
# Polars.concat_str(
|
170
|
+
# [
|
171
|
+
# Polars.col("a") * 2,
|
172
|
+
# Polars.col("b"),
|
173
|
+
# Polars.col("c")
|
174
|
+
# ],
|
175
|
+
# sep: " "
|
176
|
+
# ).alias("full_sentence")
|
177
|
+
# ]
|
178
|
+
# )
|
179
|
+
# # =>
|
180
|
+
# # shape: (3, 4)
|
181
|
+
# # ┌─────┬──────┬──────┬───────────────┐
|
182
|
+
# # │ a ┆ b ┆ c ┆ full_sentence │
|
183
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
184
|
+
# # │ i64 ┆ str ┆ str ┆ str │
|
185
|
+
# # ╞═════╪══════╪══════╪═══════════════╡
|
186
|
+
# # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
|
187
|
+
# # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
|
188
|
+
# # │ 3 ┆ null ┆ walk ┆ null │
|
189
|
+
# # └─────┴──────┴──────┴───────────────┘
|
190
|
+
def concat_str(exprs, sep: "", ignore_nulls: false)
  # Convert the selection, then forward the separator and null handling flag
  # unchanged to Plr.concat_str and wrap the result.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  joined = Plr.concat_str(rbexprs, sep, ignore_nulls)
  Utils.wrap_expr(joined)
end
|
194
|
+
|
195
|
+
# Format expressions as a string.
|
196
|
+
#
|
197
|
+
# @param fstring [String]
|
198
|
+
# A string with placeholders.
|
199
|
+
# For example: "hello_{}" or "{}_world"
|
200
|
+
# @param args [Object]
|
201
|
+
# Expression(s) that fill the placeholders
|
202
|
+
#
|
203
|
+
# @return [Expr]
|
204
|
+
#
|
205
|
+
# @example
|
206
|
+
# df = Polars::DataFrame.new(
|
207
|
+
# {
|
208
|
+
# "a": ["a", "b", "c"],
|
209
|
+
# "b": [1, 2, 3]
|
210
|
+
# }
|
211
|
+
# )
|
212
|
+
# df.select(
|
213
|
+
# [
|
214
|
+
# Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
|
215
|
+
# ]
|
216
|
+
# )
|
217
|
+
# # =>
|
218
|
+
# # shape: (3, 1)
|
219
|
+
# # ┌─────────────┐
|
220
|
+
# # │ fmt │
|
221
|
+
# # │ --- │
|
222
|
+
# # │ str │
|
223
|
+
# # ╞═════════════╡
|
224
|
+
# # │ foo_a_bar_1 │
|
225
|
+
# # │ foo_b_bar_2 │
|
226
|
+
# # │ foo_c_bar_3 │
|
227
|
+
# # └─────────────┘
|
228
|
+
def format(fstring, *args)
  placeholder_count = fstring.scan("{}").length
  unless placeholder_count == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Splitting on a capturing group keeps the "{}" markers in the result, so
  # literal text and placeholders come back interleaved in their original order.
  pending = args.dup
  pieces =
    fstring.split(/(\{\})/).reject(&:empty?).map do |piece|
      if piece == "{}"
        # Each placeholder consumes the next argument, left to right.
        Utils.expr_to_lit_or_expr(pending.shift, str_to_lit: false)
      else
        lit(piece)
      end
    end

  concat_str(pieces, sep: "")
end
|
247
|
+
end
|
248
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Return an expression representing a column in a DataFrame.
|
4
|
+
#
|
5
|
+
# @return [Expr]
|
6
|
+
def col(name, *more_names)
  # Variadic form: the type of the first argument decides whether the whole
  # argument list is treated as column names or as data types.
  if more_names.any?
    combined = [name, *more_names]
    return Utils.wrap_expr(Plr.cols(combined.map(&:to_s))) if Utils.strlike?(name)
    return Utils.wrap_expr(Plr.dtype_cols(combined)) if Utils.is_polars_dtype(name)

    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  # Single-argument form: a name, a data type, or an array of either.
  return Utils.wrap_expr(Plr.col(name.to_s)) if Utils.strlike?(name)
  return Utils.wrap_expr(Plr.dtype_cols([name])) if Utils.is_polars_dtype(name)

  unless name.is_a?(::Array)
    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  # An empty array is passed to Plr.cols as is.
  return Utils.wrap_expr(Plr.cols(name)) if name.empty?

  # Only the first element is inspected to classify the array.
  head = name[0]
  if Utils.strlike?(head)
    Utils.wrap_expr(Plr.cols(name.map(&:to_s)))
  elsif Utils.is_polars_dtype(head)
    Utils.wrap_expr(Plr.dtype_cols(name))
  else
    raise TypeError, "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{head.class.name}."
  end
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Aggregate multiple Dataframes/Series to a single DataFrame/Series.
|
4
|
+
#
|
5
|
+
# @param items [Object]
|
6
|
+
# DataFrames/Series/LazyFrames to concatenate.
|
7
|
+
# @param rechunk [Boolean]
|
8
|
+
# Make sure that all data is in contiguous memory.
|
9
|
+
# @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
|
10
|
+
# LazyFrames do not support the `horizontal` strategy.
|
11
|
+
#
|
12
|
+
# - Vertical: applies multiple `vstack` operations.
|
13
|
+
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
14
|
+
# - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
|
15
|
+
# @param parallel [Boolean]
|
16
|
+
# Only relevant for LazyFrames. This determines if the concatenated
|
17
|
+
# lazy computations may be executed in parallel.
|
18
|
+
#
|
19
|
+
# @return [Object]
|
20
|
+
#
|
21
|
+
# @example
|
22
|
+
# df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
|
23
|
+
# df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
|
24
|
+
# Polars.concat([df1, df2])
|
25
|
+
# # =>
|
26
|
+
# # shape: (2, 2)
|
27
|
+
# # ┌─────┬─────┐
|
28
|
+
# # │ a ┆ b │
|
29
|
+
# # │ --- ┆ --- │
|
30
|
+
# # │ i64 ┆ i64 │
|
31
|
+
# # ╞═════╪═════╡
|
32
|
+
# # │ 1 ┆ 3 │
|
33
|
+
# # │ 2 ┆ 4 │
|
34
|
+
# # └─────┴─────┘
|
35
|
+
def concat(items, rechunk: true, how: "vertical", parallel: true)
  raise ArgumentError, "cannot concat empty list" if items.empty?

  # The type of the first item selects the concat implementation.
  first = items[0]
  if first.is_a?(DataFrame)
    out =
      if how == "vertical"
        Utils.wrap_df(Plr.concat_df(items))
      elsif how == "diagonal"
        Utils.wrap_df(Plr.concat_df_diagonal(items))
      elsif how == "horizontal"
        Utils.wrap_df(Plr.concat_df_horizontal(items))
      else
        # Single braces: the doubled "{{ }}" was Python f-string escaping and
        # showed up literally in the Ruby message.
        raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
      end
  elsif first.is_a?(LazyFrame)
    # `rechunk` and `parallel` are forwarded to the Plr call, so these
    # branches return directly and skip the rechunk step at the bottom.
    if how == "vertical"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, false))
    elsif how == "vertical_relaxed"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, true))
    elsif how == "diagonal"
      return Utils.wrap_ldf(Plr.concat_lf_diagonal(items, rechunk, parallel, false))
    else
      raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
    end
  elsif first.is_a?(Series)
    # TODO
    out = Utils.wrap_s(Plr.concat_series(items))
  elsif first.is_a?(Expr)
    # Append the remaining expressions to the first one, in order.
    out = items[1..-1].reduce(first) { |acc, expr| acc.append(expr) }
  else
    raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
  end

  rechunk ? out.rechunk : out
end
|
79
|
+
|
80
|
+
# Align a sequence of frames using the unique values from one or more columns as a key.
|
81
|
+
#
|
82
|
+
# Frames that do not contain the given key values have rows injected (with nulls
|
83
|
+
# filling the non-key columns), and each resulting frame is sorted by the key.
|
84
|
+
#
|
85
|
+
# The original column order of input frames is not changed unless ``select`` is
|
86
|
+
# specified (in which case the final column order is determined from that).
|
87
|
+
#
|
88
|
+
# Note that this does not result in a joined frame - you receive the same number
|
89
|
+
# of frames back that you passed in, but each is now aligned by key and has
|
90
|
+
# the same number of rows.
|
91
|
+
#
|
92
|
+
# @param frames [Array]
|
93
|
+
# Sequence of DataFrames or LazyFrames.
|
94
|
+
# @param on [Object]
|
95
|
+
# One or more columns whose unique values will be used to align the frames.
|
96
|
+
# @param select [Object]
|
97
|
+
# Optional post-alignment column select to constrain and/or order
|
98
|
+
# the columns returned from the newly aligned frames.
|
99
|
+
# @param reverse [Object]
|
100
|
+
# Sort the alignment column values in descending order; can be a single
|
101
|
+
# boolean or a list of booleans associated with each column in `on`.
|
102
|
+
#
|
103
|
+
# @return [Object]
|
104
|
+
#
|
105
|
+
# @example
|
106
|
+
# df1 = Polars::DataFrame.new(
|
107
|
+
# {
|
108
|
+
# "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
|
109
|
+
# "x" => [3.5, 4.0, 1.0],
|
110
|
+
# "y" => [10.0, 2.5, 1.5]
|
111
|
+
# }
|
112
|
+
# )
|
113
|
+
# df2 = Polars::DataFrame.new(
|
114
|
+
# {
|
115
|
+
# "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
|
116
|
+
# "x" => [8.0, 1.0, 3.5],
|
117
|
+
# "y" => [1.5, 12.0, 5.0]
|
118
|
+
# }
|
119
|
+
# )
|
120
|
+
# df3 = Polars::DataFrame.new(
|
121
|
+
# {
|
122
|
+
# "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
|
123
|
+
# "x" => [2.0, 5.0],
|
124
|
+
# "y" => [2.5, 2.0]
|
125
|
+
# }
|
126
|
+
# )
|
127
|
+
# af1, af2, af3 = Polars.align_frames(
|
128
|
+
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
129
|
+
# )
|
130
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
|
131
|
+
# # =>
|
132
|
+
# # shape: (3, 1)
|
133
|
+
# # ┌───────┐
|
134
|
+
# # │ dot │
|
135
|
+
# # │ --- │
|
136
|
+
# # │ f64 │
|
137
|
+
# # ╞═══════╡
|
138
|
+
# # │ 0.0 │
|
139
|
+
# # ├╌╌╌╌╌╌╌┤
|
140
|
+
# # │ 167.5 │
|
141
|
+
# # ├╌╌╌╌╌╌╌┤
|
142
|
+
# # │ 47.0 │
|
143
|
+
# # └───────┘
|
144
|
+
def align_frames(
  *frames,
  on:,
  select: nil,
  reverse: false
)
  return [] if frames.empty?

  if frames.map(&:class).uniq.size > 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  # The first frame decides whether results are collected before returning.
  eager = frames[0].is_a?(DataFrame)

  # Gather the "on" columns of every frame, de-duplicate, and sort them.
  key_frames = frames.map { |frame| frame.lazy.select(on) }
  alignment_frame =
    concat(key_frames)
      .unique(maintain_order: false)
      .sort(on, reverse: reverse)

  # Eager input: collect the alignment frame and go back to lazy.
  # Lazy input: call `cache` on it instead.
  alignment_frame = eager ? alignment_frame.collect.lazy : alignment_frame.cache

  # Left-join each frame onto the alignment frame, then restore the frame's
  # own column order.
  aligned_frames =
    frames.map do |frame|
      joined = alignment_frame.join(
        frame.lazy,
        on: alignment_frame.columns,
        how: "left"
      )
      joined.select(frame.columns)
    end

  # Optional column selection applied after alignment.
  aligned_frames = aligned_frames.map { |frame| frame.select(select) } unless select.nil?

  eager ? aligned_frames.map(&:collect) : aligned_frames
end
|
181
|
+
end
|
182
|
+
end
|