polars-df 0.13.0-aarch64-linux-musl
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39059 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
data/lib/polars/functions/as_datatype.rb
@@ -0,0 +1,271 @@
module Polars
  module Functions
    # Create polars `Duration` from distinct time components.
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
    #       "add" => [1, 2]
    #     }
    #   )
    #   df.select(
    #     [
    #       (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
    #       (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
    #       (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
    #       (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
    #         "add_milliseconds"
    #       ),
    #       (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
    #     ]
    #   )
    #   # =>
    #   # shape: (2, 5)
    #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
    #   # │ add_weeks           ┆ add_days            ┆ add_seconds         ┆ add_milliseconds        ┆ add_hours           │
    #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---                     ┆ ---                 │
    #   # │ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]            ┆ datetime[ns]        │
    #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
    #   # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
    #   # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
    #   # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
    def duration(
      weeks: nil,
      days: nil,
      hours: nil,
      minutes: nil,
      seconds: nil,
      milliseconds: nil,
      microseconds: nil,
      nanoseconds: nil,
      time_unit: "us"
    )
      if !weeks.nil?
        weeks = Utils.parse_into_expression(weeks, str_as_lit: false)
      end
      if !days.nil?
        days = Utils.parse_into_expression(days, str_as_lit: false)
      end
      if !hours.nil?
        hours = Utils.parse_into_expression(hours, str_as_lit: false)
      end
      if !minutes.nil?
        minutes = Utils.parse_into_expression(minutes, str_as_lit: false)
      end
      if !seconds.nil?
        seconds = Utils.parse_into_expression(seconds, str_as_lit: false)
      end
      if !milliseconds.nil?
        milliseconds = Utils.parse_into_expression(milliseconds, str_as_lit: false)
      end
      if !microseconds.nil?
        microseconds = Utils.parse_into_expression(microseconds, str_as_lit: false)
      end
      if !nanoseconds.nil?
        nanoseconds = Utils.parse_into_expression(nanoseconds, str_as_lit: false)
      end

      Utils.wrap_expr(
        Plr.duration(
          weeks,
          days,
          hours,
          minutes,
          seconds,
          milliseconds,
          microseconds,
          nanoseconds,
          time_unit
        )
      )
    end

    # Concat the arrays in a Series dtype List in linear time.
    #
    # @return [Expr]
    def concat_list(exprs)
      exprs = Utils.parse_into_list_of_expressions(exprs)
      Utils.wrap_expr(Plr.concat_list(exprs))
    end

    # Collect several columns into a Series of dtype Struct.
    #
    # @param exprs [Array]
    #   Column(s) to collect into a struct column, specified as positional arguments.
    #   Accepts expression input. Strings are parsed as column names,
    #   other non-expression inputs are parsed as literals.
    # @param schema [Hash]
    #   Optional schema that explicitly defines the struct field dtypes. If no columns
    #   or expressions are provided, schema keys are used to define columns.
    # @param eager [Boolean]
    #   Evaluate immediately and return a `Series`. If set to `false` (default),
    #   return an expression instead.
    # @param named_exprs [Hash]
    #   Additional columns to collect into the struct column, specified as keyword
    #   arguments. The columns will be renamed to the keyword used.
    #
    # @return [Object]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "int" => [1, 2],
    #       "str" => ["a", "b"],
    #       "bool" => [true, nil],
    #       "list" => [[1, 2], [3]],
    #     }
    #   )
    #   df.select([Polars.struct(Polars.all).alias("my_struct")])
    #   # =>
    #   # shape: (2, 1)
    #   # ┌─────────────────────┐
    #   # │ my_struct           │
    #   # │ ---                 │
    #   # │ struct[4]           │
    #   # ╞═════════════════════╡
    #   # │ {1,"a",true,[1, 2]} │
    #   # │ {2,"b",null,[3]}    │
    #   # └─────────────────────┘
    #
    # @example Collect selected columns into a struct by either passing a list of columns, or by specifying each column as a positional argument.
    #   df.select(Polars.struct("int", false).alias("my_struct"))
    #   # =>
    #   # shape: (2, 1)
    #   # ┌───────────┐
    #   # │ my_struct │
    #   # │ ---       │
    #   # │ struct[2] │
    #   # ╞═══════════╡
    #   # │ {1,false} │
    #   # │ {2,false} │
    #   # └───────────┘
    #
    # @example Use keyword arguments to easily name each struct field.
    #   df.select(Polars.struct(p: "int", q: "bool").alias("my_struct")).schema
    #   # => {"my_struct"=>Polars::Struct({"p"=>Polars::Int64, "q"=>Polars::Boolean})}
    def struct(*exprs, schema: nil, eager: false, **named_exprs)
      rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs)
      expr = Utils.wrap_expr(Plr.as_struct(rbexprs))

      if !schema.nil? && !schema.empty?
        if !exprs.any?
          # no columns or expressions provided; create one from schema keys
          expr =
            Utils.wrap_expr(
              Plr.as_struct(Utils.parse_into_list_of_expressions(schema.keys))
            )
          expr = expr.cast(Struct.new(schema), strict: false)
        end
      end

      if eager
        Polars.select(expr).to_series
      else
        expr
      end
    end

    # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
    #
    # @param exprs [Object]
    #   Columns to concat into a Utf8 Series.
    # @param sep [String]
    #   String value that will be used to separate the values.
    # @param ignore_nulls [Boolean]
    #   Ignore null values (default: `false`).
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a" => [1, 2, 3],
    #       "b" => ["dogs", "cats", nil],
    #       "c" => ["play", "swim", "walk"]
    #     }
    #   )
    #   df.with_columns(
    #     [
    #       Polars.concat_str(
    #         [
    #           Polars.col("a") * 2,
    #           Polars.col("b"),
    #           Polars.col("c")
    #         ],
    #         sep: " "
    #       ).alias("full_sentence")
    #     ]
    #   )
    #   # =>
    #   # shape: (3, 4)
    #   # ┌─────┬──────┬──────┬───────────────┐
    #   # │ a   ┆ b    ┆ c    ┆ full_sentence │
    #   # │ --- ┆ ---  ┆ ---  ┆ ---           │
    #   # │ i64 ┆ str  ┆ str  ┆ str           │
    #   # ╞═════╪══════╪══════╪═══════════════╡
    #   # │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
    #   # │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
    #   # │ 3   ┆ null ┆ walk ┆ null          │
    #   # └─────┴──────┴──────┴───────────────┘
    def concat_str(exprs, sep: "", ignore_nulls: false)
      exprs = Utils.parse_into_list_of_expressions(exprs)
      Utils.wrap_expr(Plr.concat_str(exprs, sep, ignore_nulls))
    end

    # Format expressions as a string.
    #
    # @param f_string [String]
    #   A string with placeholders.
    #   For example: "hello_{}" or "{}_world"
    # @param args [Object]
    #   Expression(s) that fill the placeholders
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a": ["a", "b", "c"],
    #       "b": [1, 2, 3]
    #     }
    #   )
    #   df.select(
    #     [
    #       Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
    #     ]
    #   )
    #   # =>
    #   # shape: (3, 1)
    #   # ┌─────────────┐
    #   # │ fmt         │
    #   # │ ---         │
    #   # │ str         │
    #   # ╞═════════════╡
    #   # │ foo_a_bar_1 │
    #   # │ foo_b_bar_2 │
    #   # │ foo_c_bar_3 │
    #   # └─────────────┘
    def format(f_string, *args)
      if f_string.scan("{}").length != args.length
        raise ArgumentError, "number of placeholders should equal the number of arguments"
      end

      exprs = []

      arguments = args.each
      f_string.split(/(\{\})/).each do |s|
        if s == "{}"
          e = Utils.wrap_expr(Utils.parse_into_expression(arguments.next))
          exprs << e
        elsif s.length > 0
          exprs << lit(s)
        end
      end

      concat_str(exprs, sep: "")
    end
  end
end
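
`Polars.concat_list` above ships without a usage example; the sketch below is illustrative only (the column names and data are invented, not taken from the gem, and the expected output is described in comments rather than verified):

# Illustrative only: combine two integer columns into one List column per row.
# Column names "a" and "b" are hypothetical.
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
df.select(Polars.concat_list(["a", "b"]).alias("ab"))
# each row of "ab" should hold the values from "a" and "b", e.g. [1, 3] then [2, 4]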

data/lib/polars/functions/col.rb
@@ -0,0 +1,47 @@
module Polars
  module Functions
    # Return an expression representing a column in a DataFrame.
    #
    # @return [Expr]
    def col(name, *more_names)
      if more_names.any?
        if Utils.strlike?(name)
          names_str = [name]
          names_str.concat(more_names)
          return Utils.wrap_expr(Plr.cols(names_str.map(&:to_s)))
        elsif Utils.is_polars_dtype(name)
          dtypes = [name]
          dtypes.concat(more_names)
          return Utils.wrap_expr(Plr.dtype_cols(dtypes))
        else
          msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
          raise TypeError, msg
        end
      end

      if Utils.strlike?(name)
        Utils.wrap_expr(Plr.col(name.to_s))
      elsif Utils.is_polars_dtype(name)
        Utils.wrap_expr(Plr.dtype_cols([name]))
      elsif name.is_a?(::Array)
        names = Array(name)
        if names.empty?
          return Utils.wrap_expr(Plr.cols(names))
        end

        item = names[0]
        if Utils.strlike?(item)
          Utils.wrap_expr(Plr.cols(names.map(&:to_s)))
        elsif Utils.is_polars_dtype(item)
          Utils.wrap_expr(Plr.dtype_cols(names))
        else
          msg = "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{item.class.name}."
          raise TypeError, msg
        end
      else
        msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
        raise TypeError, msg
      end
    end
  end
end
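
`col` above accepts a single name, several names, a dtype, or an array of names/dtypes; a brief sketch of those call shapes (the DataFrame and column names are invented for illustration):

# Illustrative only: the input forms accepted by Polars.col.
require "polars-df"

df = Polars::DataFrame.new({"x" => [1, 2], "y" => [3.0, 4.0], "label" => ["a", "b"]})

df.select(Polars.col("x"))             # single column by name
df.select(Polars.col("x", "y"))        # several columns by name
df.select(Polars.col(["x", "label"]))  # an array of names
df.select(Polars.col(Polars::Float64)) # every column of a given dtype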

data/lib/polars/functions/eager.rb
@@ -0,0 +1,182 @@
module Polars
  module Functions
    # Aggregate multiple DataFrames/Series to a single DataFrame/Series.
    #
    # @param items [Object]
    #   DataFrames/Series/LazyFrames to concatenate.
    # @param rechunk [Boolean]
    #   Make sure that all data is in contiguous memory.
    # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
    #   LazyFrames do not support the `horizontal` strategy.
    #
    #   - Vertical: applies multiple `vstack` operations.
    #   - Diagonal: finds a union between the column schemas and fills missing column values with null.
    #   - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
    # @param parallel [Boolean]
    #   Only relevant for LazyFrames. This determines if the concatenated
    #   lazy computations may be executed in parallel.
    #
    # @return [Object]
    #
    # @example
    #   df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
    #   df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
    #   Polars.concat([df1, df2])
    #   # =>
    #   # shape: (2, 2)
    #   # ┌─────┬─────┐
    #   # │ a   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i64 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 3   │
    #   # │ 2   ┆ 4   │
    #   # └─────┴─────┘
    def concat(items, rechunk: true, how: "vertical", parallel: true)
      if items.empty?
        raise ArgumentError, "cannot concat empty list"
      end

      first = items[0]
      if first.is_a?(DataFrame)
        if how == "vertical"
          out = Utils.wrap_df(Plr.concat_df(items))
        elsif how == "diagonal"
          out = Utils.wrap_df(Plr.concat_df_diagonal(items))
        elsif how == "horizontal"
          out = Utils.wrap_df(Plr.concat_df_horizontal(items))
        else
          raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
        end
      elsif first.is_a?(LazyFrame)
        if how == "vertical"
          return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, false))
        elsif how == "vertical_relaxed"
          return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, true))
        elsif how == "diagonal"
          return Utils.wrap_ldf(Plr.concat_lf_diagonal(items, rechunk, parallel, false))
        else
          raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
        end
      elsif first.is_a?(Series)
        # TODO
        out = Utils.wrap_s(Plr.concat_series(items))
      elsif first.is_a?(Expr)
        out = first
        items[1..-1].each do |e|
          out = out.append(e)
        end
      else
        raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
      end

      if rechunk
        out.rechunk
      else
        out
      end
    end

    # Align a sequence of frames using the unique values from one or more columns as a key.
    #
    # Frames that do not contain the given key values have rows injected (with nulls
    # filling the non-key columns), and each resulting frame is sorted by the key.
    #
    # The original column order of input frames is not changed unless `select` is
    # specified (in which case the final column order is determined from that).
    #
    # Note that this does not result in a joined frame - you receive the same number
    # of frames back that you passed in, but each is now aligned by key and has
    # the same number of rows.
    #
    # @param frames [Array]
    #   Sequence of DataFrames or LazyFrames.
    # @param on [Object]
    #   One or more columns whose unique values will be used to align the frames.
    # @param select [Object]
    #   Optional post-alignment column select to constrain and/or order
    #   the columns returned from the newly aligned frames.
    # @param reverse [Object]
    #   Sort the alignment column values in descending order; can be a single
    #   boolean or a list of booleans associated with each column in `on`.
    #
    # @return [Object]
    #
    # @example
    #   df1 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
    #       "x" => [3.5, 4.0, 1.0],
    #       "y" => [10.0, 2.5, 1.5]
    #     }
    #   )
    #   df2 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
    #       "x" => [8.0, 1.0, 3.5],
    #       "y" => [1.5, 12.0, 5.0]
    #     }
    #   )
    #   df3 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
    #       "x" => [2.0, 5.0],
    #       "y" => [2.5, 2.0]
    #     }
    #   )
    #   af1, af2, af3 = Polars.align_frames(
    #     df1, df2, df3, on: "dt", select: ["x", "y"]
    #   )
    #   (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
    #   # =>
    #   # shape: (3, 1)
    #   # ┌───────┐
    #   # │ dot   │
    #   # │ ---   │
    #   # │ f64   │
    #   # ╞═══════╡
    #   # │ 0.0   │
    #   # ├╌╌╌╌╌╌╌┤
    #   # │ 167.5 │
    #   # ├╌╌╌╌╌╌╌┤
    #   # │ 47.0  │
    #   # └───────┘
    def align_frames(
      *frames,
      on:,
      select: nil,
      reverse: false
    )
      if frames.empty?
        return []
      elsif frames.map(&:class).uniq.length != 1
        raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
      end

      # establish the superset of all "on" column values, sort, and cache
      eager = frames[0].is_a?(DataFrame)
      alignment_frame = (
        concat(frames.map { |df| df.lazy.select(on) })
          .unique(maintain_order: false)
          .sort(on, reverse: reverse)
      )
      alignment_frame = (
        eager ? alignment_frame.collect.lazy : alignment_frame.cache
      )
      # finally, align all frames
      aligned_frames =
        frames.map do |df|
          alignment_frame.join(
            df.lazy,
            on: alignment_frame.columns,
            how: "left"
          ).select(df.columns)
        end
      if !select.nil?
        aligned_frames = aligned_frames.map { |df| df.select(select) }
      end

      eager ? aligned_frames.map(&:collect) : aligned_frames
    end
  end
end
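
`concat` above documents the diagonal and horizontal strategies but only demonstrates the vertical default; a small sketch under invented data (expected results are described in comments, not verified output):

# Illustrative only: the non-default concat strategies.
require "polars-df"

df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
df2 = Polars::DataFrame.new({"a" => [2], "c" => [4]})

# diagonal: union of the schemas; "c" is null for df1's row and "b" for df2's row
Polars.concat([df1, df2], how: "diagonal")

# horizontal: stack columns side by side (column names must not overlap)
df3 = Polars::DataFrame.new({"d" => [5], "e" => [6]})
Polars.concat([df1, df3], how: "horizontal")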