polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
data/lib/polars/functions/as_datatype.rb
@@ -0,0 +1,271 @@
module Polars
  module Functions
    # Create polars `Duration` from distinct time components.
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
    #       "add" => [1, 2]
    #     }
    #   )
    #   df.select(
    #     [
    #       (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
    #       (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
    #       (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
    #       (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
    #         "add_milliseconds"
    #       ),
    #       (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
    #     ]
    #   )
    #   # =>
    #   # shape: (2, 5)
    #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
    #   # │ add_weeks           ┆ add_days            ┆ add_seconds         ┆ add_milliseconds        ┆ add_hours           │
    #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---                     ┆ ---                 │
    #   # │ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]        ┆ datetime[ns]            ┆ datetime[ns]        │
    #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
    #   # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
    #   # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
    #   # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
    def duration(
      weeks: nil,
      days: nil,
      hours: nil,
      minutes: nil,
      seconds: nil,
      milliseconds: nil,
      microseconds: nil,
      nanoseconds: nil,
      time_unit: "us"
    )
      if !weeks.nil?
        weeks = Utils.parse_into_expression(weeks, str_as_lit: false)
      end
      if !days.nil?
        days = Utils.parse_into_expression(days, str_as_lit: false)
      end
      if !hours.nil?
        hours = Utils.parse_into_expression(hours, str_as_lit: false)
      end
      if !minutes.nil?
        minutes = Utils.parse_into_expression(minutes, str_as_lit: false)
      end
      if !seconds.nil?
        seconds = Utils.parse_into_expression(seconds, str_as_lit: false)
      end
      if !milliseconds.nil?
        milliseconds = Utils.parse_into_expression(milliseconds, str_as_lit: false)
      end
      if !microseconds.nil?
        microseconds = Utils.parse_into_expression(microseconds, str_as_lit: false)
      end
      if !nanoseconds.nil?
        nanoseconds = Utils.parse_into_expression(nanoseconds, str_as_lit: false)
      end

      Utils.wrap_expr(
        Plr.duration(
          weeks,
          days,
          hours,
          minutes,
          seconds,
          milliseconds,
          microseconds,
          nanoseconds,
          time_unit
        )
      )
    end

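As a quick usage sketch (not part of the diff; assumes the gem is required as `polars-df`), `duration` composes with datetime arithmetic on any frame:

  require "date"
  require "polars-df"

  df = Polars::DataFrame.new({"start" => [DateTime.new(2024, 1, 1)]})
  # shift each timestamp by a fixed 12-hour duration
  df.with_columns((Polars.col("start") + Polars.duration(hours: 12)).alias("start_plus_12h"))
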
    # Concat the arrays in a Series dtype List in linear time.
    #
    # @return [Expr]
    def concat_list(exprs)
      exprs = Utils.parse_into_list_of_expressions(exprs)
      Utils.wrap_expr(Plr.concat_list(exprs))
    end

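`concat_list` ships without a docstring example; a minimal usage sketch (hypothetical column names) would be:

  df = Polars::DataFrame.new({"a" => [[1, 2], [3]], "b" => [[4], [5, 6]]})
  # concatenate the two list columns row-wise into one list column
  df.select(Polars.concat_list(["a", "b"]).alias("ab"))
  # expected: [[1, 2, 4], [3, 5, 6]]
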
    # Collect several columns into a Series of dtype Struct.
    #
    # @param exprs [Array]
    #   Column(s) to collect into a struct column, specified as positional arguments.
    #   Accepts expression input. Strings are parsed as column names,
    #   other non-expression inputs are parsed as literals.
    # @param schema [Hash]
    #   Optional schema that explicitly defines the struct field dtypes. If no columns
    #   or expressions are provided, schema keys are used to define columns.
    # @param eager [Boolean]
    #   Evaluate immediately and return a `Series`. If set to `false` (default),
    #   return an expression instead.
    # @param named_exprs [Hash]
    #   Additional columns to collect into the struct column, specified as keyword
    #   arguments. The columns will be renamed to the keyword used.
    #
    # @return [Object]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "int" => [1, 2],
    #       "str" => ["a", "b"],
    #       "bool" => [true, nil],
    #       "list" => [[1, 2], [3]]
    #     }
    #   )
    #   df.select([Polars.struct(Polars.all).alias("my_struct")])
    #   # =>
    #   # shape: (2, 1)
    #   # ┌─────────────────────┐
    #   # │ my_struct           │
    #   # │ ---                 │
    #   # │ struct[4]           │
    #   # ╞═════════════════════╡
    #   # │ {1,"a",true,[1, 2]} │
    #   # │ {2,"b",null,[3]}    │
    #   # └─────────────────────┘
    #
    # @example Collect selected columns into a struct by either passing a list of columns, or by specifying each column as a positional argument.
    #   df.select(Polars.struct("int", false).alias("my_struct"))
    #   # =>
    #   # shape: (2, 1)
    #   # ┌───────────┐
    #   # │ my_struct │
    #   # │ ---       │
    #   # │ struct[2] │
    #   # ╞═══════════╡
    #   # │ {1,false} │
    #   # │ {2,false} │
    #   # └───────────┘
    #
    # @example Use keyword arguments to easily name each struct field.
    #   df.select(Polars.struct(p: "int", q: "bool").alias("my_struct")).schema
    #   # => {"my_struct"=>Polars::Struct({"p"=>Polars::Int64, "q"=>Polars::Boolean})}
    def struct(*exprs, schema: nil, eager: false, **named_exprs)
      rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs)
      expr = Utils.wrap_expr(Plr.as_struct(rbexprs))

      if !schema.nil? && !schema.empty?
        if !exprs.any?
          # no columns or expressions provided; create one from schema keys
          expr =
            Utils.wrap_expr(
              Plr.as_struct(Utils.parse_into_list_of_expressions(schema.keys))
            )
          expr = expr.cast(Struct.new(schema), strict: false)
        end
      end

      if eager
        Polars.select(expr).to_series
      else
        expr
      end
    end

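A small sketch of the keyword-argument form described above (the trailing `unnest` call is only there to show the renamed struct fields; data and names are hypothetical):

  df = Polars::DataFrame.new({"int" => [1, 2], "str" => ["a", "b"]})
  # build a struct with fields renamed to p and q, then expand it back into columns
  df.select(Polars.struct(p: "int", q: "str").alias("s")).unnest("s")
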
    # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
    #
    # @param exprs [Object]
    #   Columns to concat into a Utf8 Series.
    # @param sep [String]
    #   String value that will be used to separate the values.
    # @param ignore_nulls [Boolean]
    #   Ignore null values (default is `false`). If set to `false`, null values
    #   are propagated: if the row contains any null values, the output is null.
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a" => [1, 2, 3],
    #       "b" => ["dogs", "cats", nil],
    #       "c" => ["play", "swim", "walk"]
    #     }
    #   )
    #   df.with_columns(
    #     [
    #       Polars.concat_str(
    #         [
    #           Polars.col("a") * 2,
    #           Polars.col("b"),
    #           Polars.col("c")
    #         ],
    #         sep: " "
    #       ).alias("full_sentence")
    #     ]
    #   )
    #   # =>
    #   # shape: (3, 4)
    #   # ┌─────┬──────┬──────┬───────────────┐
    #   # │ a   ┆ b    ┆ c    ┆ full_sentence │
    #   # │ --- ┆ ---  ┆ ---  ┆ ---           │
    #   # │ i64 ┆ str  ┆ str  ┆ str           │
    #   # ╞═════╪══════╪══════╪═══════════════╡
    #   # │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
    #   # │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
    #   # │ 3   ┆ null ┆ walk ┆ null          │
    #   # └─────┴──────┴──────┴───────────────┘
    def concat_str(exprs, sep: "", ignore_nulls: false)
      exprs = Utils.parse_into_list_of_expressions(exprs)
      Utils.wrap_expr(Plr.concat_str(exprs, sep, ignore_nulls))
    end

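To illustrate the `ignore_nulls` flag clarified above, a sketch with hypothetical data:

  df = Polars::DataFrame.new({"a" => ["x", nil], "b" => ["y", "z"]})
  # with ignore_nulls: true the null in the second row is skipped instead of nulling the row
  df.select(Polars.concat_str([Polars.col("a"), Polars.col("b")], sep: "-", ignore_nulls: true))
  # expected: ["x-y", "z"]
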
    # Format expressions as a string.
    #
    # @param f_string [String]
    #   A string with placeholders.
    #   For example: "hello_{}" or "{}_world"
    # @param args [Object]
    #   Expression(s) that fill the placeholders
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {
    #       "a" => ["a", "b", "c"],
    #       "b" => [1, 2, 3]
    #     }
    #   )
    #   df.select(
    #     [
    #       Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
    #     ]
    #   )
    #   # =>
    #   # shape: (3, 1)
    #   # ┌─────────────┐
    #   # │ fmt         │
    #   # │ ---         │
    #   # │ str         │
    #   # ╞═════════════╡
    #   # │ foo_a_bar_1 │
    #   # │ foo_b_bar_2 │
    #   # │ foo_c_bar_3 │
    #   # └─────────────┘
    def format(f_string, *args)
      if f_string.scan("{}").length != args.length
        raise ArgumentError, "number of placeholders should equal the number of arguments"
      end

      exprs = []

      arguments = args.each
      f_string.split(/(\{\})/).each do |s|
        if s == "{}"
          e = Utils.wrap_expr(Utils.parse_into_expression(arguments.next))
          exprs << e
        elsif s.length > 0
          exprs << lit(s)
        end
      end

      concat_str(exprs, sep: "")
    end
  end
end
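Reading the `format` body above: it splits the template on the `{}` placeholders and interleaves the literal fragments with the parsed arguments, so the docstring call is roughly equivalent to this hand-written sketch:

  Polars.concat_str(
    [Polars.lit("foo_"), Polars.col("a"), Polars.lit("_bar_"), Polars.col("b")],
    sep: ""
  )
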
data/lib/polars/functions/col.rb
@@ -0,0 +1,47 @@
module Polars
  module Functions
    # Return an expression representing a column in a DataFrame.
    #
    # @return [Expr]
    def col(name, *more_names)
      if more_names.any?
        if Utils.strlike?(name)
          names_str = [name]
          names_str.concat(more_names)
          return Utils.wrap_expr(Plr.cols(names_str.map(&:to_s)))
        elsif Utils.is_polars_dtype(name)
          dtypes = [name]
          dtypes.concat(more_names)
          return Utils.wrap_expr(Plr.dtype_cols(dtypes))
        else
          msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
          raise TypeError, msg
        end
      end

      if Utils.strlike?(name)
        Utils.wrap_expr(Plr.col(name.to_s))
      elsif Utils.is_polars_dtype(name)
        Utils.wrap_expr(Plr.dtype_cols([name]))
      elsif name.is_a?(::Array)
        names = Array(name)
        if names.empty?
          return Utils.wrap_expr(Plr.cols(names))
        end

        item = names[0]
        if Utils.strlike?(item)
          Utils.wrap_expr(Plr.cols(names.map(&:to_s)))
        elsif Utils.is_polars_dtype(item)
          Utils.wrap_expr(Plr.dtype_cols(names))
        else
          msg = "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{item.class.name}."
          raise TypeError, msg
        end
      else
        msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
        raise TypeError, msg
      end
    end
  end
end
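`col` has no docstring example; a usage sketch of the accepted input forms (column names are hypothetical):

  df = Polars::DataFrame.new({"a" => [1], "b" => [2.5], "c" => ["x"]})
  df.select(Polars.col("a"))              # a single column by name
  df.select(Polars.col("a", "c"))         # several columns by name
  df.select(Polars.col(Polars::Float64))  # every column of a given dtype
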
data/lib/polars/functions/eager.rb
@@ -0,0 +1,182 @@
module Polars
  module Functions
    # Aggregate multiple DataFrames/Series to a single DataFrame/Series.
    #
    # @param items [Object]
    #   DataFrames/Series/LazyFrames to concatenate.
    # @param rechunk [Boolean]
    #   Make sure that all data is in contiguous memory.
    # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
    #   LazyFrames do not support the `horizontal` strategy.
    #
    #   - Vertical: applies multiple `vstack` operations.
    #   - Diagonal: finds a union between the column schemas and fills missing column values with null.
    #   - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
    # @param parallel [Boolean]
    #   Only relevant for LazyFrames. This determines if the concatenated
    #   lazy computations may be executed in parallel.
    #
    # @return [Object]
    #
    # @example
    #   df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
    #   df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
    #   Polars.concat([df1, df2])
    #   # =>
    #   # shape: (2, 2)
    #   # ┌─────┬─────┐
    #   # │ a   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i64 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 3   │
    #   # │ 2   ┆ 4   │
    #   # └─────┴─────┘
    def concat(items, rechunk: true, how: "vertical", parallel: true)
      if items.empty?
        raise ArgumentError, "cannot concat empty list"
      end

      first = items[0]
      if first.is_a?(DataFrame)
        if how == "vertical"
          out = Utils.wrap_df(Plr.concat_df(items))
        elsif how == "diagonal"
          out = Utils.wrap_df(Plr.concat_df_diagonal(items))
        elsif how == "horizontal"
          out = Utils.wrap_df(Plr.concat_df_horizontal(items))
        else
          raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
        end
      elsif first.is_a?(LazyFrame)
        if how == "vertical"
          return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, false))
        elsif how == "vertical_relaxed"
          return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, true))
        elsif how == "diagonal"
          return Utils.wrap_ldf(Plr.concat_lf_diagonal(items, rechunk, parallel, false))
        else
          raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
        end
      elsif first.is_a?(Series)
        # TODO
        out = Utils.wrap_s(Plr.concat_series(items))
      elsif first.is_a?(Expr)
        out = first
        items[1..-1].each do |e|
          out = out.append(e)
        end
      else
        raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
      end

      if rechunk
        out.rechunk
      else
        out
      end
    end

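A sketch of the `diagonal` strategy described in the parameter list (hypothetical frames): mismatched columns are unioned and missing values filled with null:

  df1 = Polars::DataFrame.new({"a" => [1], "b" => [2]})
  df2 = Polars::DataFrame.new({"a" => [3], "c" => [4]})
  # yields columns a, b, c with nulls where a frame lacked the column
  Polars.concat([df1, df2], how: "diagonal")
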
    # Align a sequence of frames using the unique values from one or more columns as a key.
    #
    # Frames that do not contain the given key values have rows injected (with nulls
    # filling the non-key columns), and each resulting frame is sorted by the key.
    #
    # The original column order of input frames is not changed unless `select` is
    # specified (in which case the final column order is determined from that).
    #
    # Note that this does not result in a joined frame - you receive the same number
    # of frames back that you passed in, but each is now aligned by key and has
    # the same number of rows.
    #
    # @param frames [Array]
    #   Sequence of DataFrames or LazyFrames.
    # @param on [Object]
    #   One or more columns whose unique values will be used to align the frames.
    # @param select [Object]
    #   Optional post-alignment column select to constrain and/or order
    #   the columns returned from the newly aligned frames.
    # @param reverse [Object]
    #   Sort the alignment column values in descending order; can be a single
    #   boolean or a list of booleans associated with each column in `on`.
    #
    # @return [Object]
    #
    # @example
    #   df1 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
    #       "x" => [3.5, 4.0, 1.0],
    #       "y" => [10.0, 2.5, 1.5]
    #     }
    #   )
    #   df2 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
    #       "x" => [8.0, 1.0, 3.5],
    #       "y" => [1.5, 12.0, 5.0]
    #     }
    #   )
    #   df3 = Polars::DataFrame.new(
    #     {
    #       "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
    #       "x" => [2.0, 5.0],
    #       "y" => [2.5, 2.0]
    #     }
    #   )
    #   af1, af2, af3 = Polars.align_frames(
    #     df1, df2, df3, on: "dt", select: ["x", "y"]
    #   )
    #   (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
    #   # =>
    #   # shape: (3, 1)
    #   # ┌───────┐
    #   # │ dot   │
    #   # │ ---   │
    #   # │ f64   │
    #   # ╞═══════╡
    #   # │ 0.0   │
    #   # ├╌╌╌╌╌╌╌┤
    #   # │ 167.5 │
    #   # ├╌╌╌╌╌╌╌┤
    #   # │ 47.0  │
    #   # └───────┘
    def align_frames(
      *frames,
      on:,
      select: nil,
      reverse: false
    )
      if frames.empty?
        return []
      elsif frames.map(&:class).uniq.length != 1
        raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
      end

      # establish the superset of all "on" column values, sort, and cache
      eager = frames[0].is_a?(DataFrame)
      alignment_frame = (
        concat(frames.map { |df| df.lazy.select(on) })
          .unique(maintain_order: false)
          .sort(on, reverse: reverse)
      )
      alignment_frame = (
        eager ? alignment_frame.collect.lazy : alignment_frame.cache
      )
      # finally, align all frames
      aligned_frames =
        frames.map do |df|
          alignment_frame.join(
            df.lazy,
            on: alignment_frame.columns,
            how: "left"
          ).select(df.columns)
        end
      if !select.nil?
        aligned_frames = aligned_frames.map { |df| df.select(select) }
      end

      eager ? aligned_frames.map(&:collect) : aligned_frames
    end
  end
end
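Per the docstring, `align_frames` also accepts LazyFrames, in which case it returns aligned LazyFrames; a minimal sketch with hypothetical data:

  lf1 = Polars::DataFrame.new({"k" => [1, 2], "x" => [10, 20]}).lazy
  lf2 = Polars::DataFrame.new({"k" => [2, 3], "y" => [200, 300]}).lazy
  a1, a2 = Polars.align_frames(lf1, lf2, on: "k")
  # both frames now cover keys 1..3 in the same order; collect to materialize
  a1.collect
  a2.collect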