polars-df 0.8.0-aarch64-linux → 0.10.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -1
- data/Cargo.lock +159 -66
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +3112 -1613
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +453 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/batched_csv_reader.rb +4 -2
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +306 -96
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +41 -18
- data/lib/polars/date_time_name_space.rb +9 -3
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +898 -215
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +36 -31
- data/lib/polars/lazy_frame.rb +405 -88
- data/lib/polars/list_expr.rb +158 -8
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +282 -41
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +413 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +106 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +16 -4
- metadata +34 -6
- data/lib/polars/lazy_functions.rb +0 -1181
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
@@ -0,0 +1,1280 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Alias for an element being evaluated in an `eval` expression.
|
4
|
+
#
|
5
|
+
# @return [Expr]
|
6
|
+
#
|
7
|
+
# @example A horizontal rank computation by taking the elements of a list
|
8
|
+
# df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
|
9
|
+
# df.with_column(
|
10
|
+
# Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
|
11
|
+
# )
|
12
|
+
# # =>
|
13
|
+
# # shape: (3, 3)
|
14
|
+
# # ┌─────┬─────┬────────────┐
|
15
|
+
# # │ a ┆ b ┆ rank │
|
16
|
+
# # │ --- ┆ --- ┆ --- │
|
17
|
+
# # │ i64 ┆ i64 ┆ list[f64] │
|
18
|
+
# # ╞═════╪═════╪════════════╡
|
19
|
+
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
|
20
|
+
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
|
21
|
+
# # │ 3 ┆ 2 ┆ [2.0, 1.0] │
|
22
|
+
# # └─────┴─────┴────────────┘
|
23
|
+
def element
  # Delegates to `col` with an empty column name.
  col("")
end
|
26
|
+
|
27
|
+
# Return the number of non-null values in the column.
|
28
|
+
#
|
29
|
+
# This function is syntactic sugar for `col(columns).count`.
|
30
|
+
#
|
31
|
+
# Calling this function without any arguments returns the number of rows in the
|
32
|
+
# context. **This way of using the function is deprecated.** Please use `len`
|
33
|
+
# instead.
|
34
|
+
#
|
35
|
+
# @param columns [Array]
|
36
|
+
# One or more column names.
|
37
|
+
#
|
38
|
+
# @return [Expr]
|
39
|
+
#
|
40
|
+
# @example
|
41
|
+
# df = Polars::DataFrame.new(
|
42
|
+
# {
|
43
|
+
# "a" => [1, 2, nil],
|
44
|
+
# "b" => [3, nil, nil],
|
45
|
+
# "c" => ["foo", "bar", "foo"]
|
46
|
+
# }
|
47
|
+
# )
|
48
|
+
# df.select(Polars.count("a"))
|
49
|
+
# # =>
|
50
|
+
# # shape: (1, 1)
|
51
|
+
# # ┌─────┐
|
52
|
+
# # │ a │
|
53
|
+
# # │ --- │
|
54
|
+
# # │ u32 │
|
55
|
+
# # ╞═════╡
|
56
|
+
# # │ 2 │
|
57
|
+
# # └─────┘
|
58
|
+
#
|
59
|
+
# @example Return the number of non-null values in multiple columns.
|
60
|
+
# df.select(Polars.count("b", "c"))
|
61
|
+
# # =>
|
62
|
+
# # shape: (1, 2)
|
63
|
+
# # ┌─────┬─────┐
|
64
|
+
# # │ b ┆ c │
|
65
|
+
# # │ --- ┆ --- │
|
66
|
+
# # │ u32 ┆ u32 │
|
67
|
+
# # ╞═════╪═════╡
|
68
|
+
# # │ 1 ┆ 3 │
|
69
|
+
# # └─────┴─────┘
|
70
|
+
def count(*columns)
  # With column names this is plain sugar for `col(*columns).count`.
  return col(*columns).count unless columns.empty?

  # Deprecated zero-argument form: emit the deprecation warning, then
  # return the wrapped `Plr.len` expression aliased to "count".
  warn "`Polars.count` is deprecated. Use `Polars.length` instead."
  Utils.wrap_expr(Plr.len._alias("count"))
end
|
78
|
+
|
79
|
+
# Return the cumulative count of the non-null values in the column.
|
80
|
+
#
|
81
|
+
# This function is syntactic sugar for `col(columns).cum_count`.
|
82
|
+
#
|
83
|
+
# If no arguments are passed, returns the cumulative count of a context.
|
84
|
+
# Rows containing null values count towards the result.
|
85
|
+
#
|
86
|
+
# @param columns [Array]
|
87
|
+
# Name(s) of the columns to use.
|
88
|
+
# @param reverse [Boolean]
|
89
|
+
# Reverse the operation.
|
90
|
+
#
|
91
|
+
# @return [Expr]
|
92
|
+
#
|
93
|
+
# @example
|
94
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, nil], "b" => [3, nil, nil]})
|
95
|
+
# df.select(Polars.cum_count("a"))
|
96
|
+
# # =>
|
97
|
+
# # shape: (3, 1)
|
98
|
+
# # ┌─────┐
|
99
|
+
# # │ a │
|
100
|
+
# # │ --- │
|
101
|
+
# # │ u32 │
|
102
|
+
# # ╞═════╡
|
103
|
+
# # │ 1 │
|
104
|
+
# # │ 2 │
|
105
|
+
# # │ 2 │
|
106
|
+
# # └─────┘
|
107
|
+
def cum_count(*columns, reverse: false)
  # Column names go to `col`; `reverse` is forwarded unchanged.
  col(*columns).cum_count(reverse: reverse)
end
|
110
|
+
|
111
|
+
# Aggregate all column values into a list.
|
112
|
+
#
|
113
|
+
# This function is syntactic sugar for `col(columns).implode`.
|
114
|
+
#
|
115
|
+
# @param columns [Array]
|
116
|
+
# One or more column names.
|
117
|
+
#
|
118
|
+
# @return [Expr]
|
119
|
+
#
|
120
|
+
# @example
|
121
|
+
# df = Polars::DataFrame.new(
|
122
|
+
# {
|
123
|
+
# "a" => [1, 2, 3],
|
124
|
+
# "b" => [9, 8, 7],
|
125
|
+
# "c" => ["foo", "bar", "foo"]
|
126
|
+
# }
|
127
|
+
# )
|
128
|
+
# df.select(Polars.implode("a"))
|
129
|
+
# # =>
|
130
|
+
# # shape: (1, 1)
|
131
|
+
# # ┌───────────┐
|
132
|
+
# # │ a │
|
133
|
+
# # │ --- │
|
134
|
+
# # │ list[i64] │
|
135
|
+
# # ╞═══════════╡
|
136
|
+
# # │ [1, 2, 3] │
|
137
|
+
# # └───────────┘
|
138
|
+
#
|
139
|
+
# @example
|
140
|
+
# df.select(Polars.implode("b", "c"))
|
141
|
+
# # =>
|
142
|
+
# # shape: (1, 2)
|
143
|
+
# # ┌───────────┬───────────────────────┐
|
144
|
+
# # │ b ┆ c │
|
145
|
+
# # │ --- ┆ --- │
|
146
|
+
# # │ list[i64] ┆ list[str] │
|
147
|
+
# # ╞═══════════╪═══════════════════════╡
|
148
|
+
# # │ [9, 8, 7] ┆ ["foo", "bar", "foo"] │
|
149
|
+
# # └───────────┴───────────────────────┘
|
150
|
+
def implode(*columns)
  # Sugar: build the column expression, then call `implode` on it.
  col(*columns).implode
end
|
153
|
+
|
154
|
+
# Get the standard deviation.
|
155
|
+
#
|
156
|
+
# This function is syntactic sugar for `col(column).std(ddof: ddof)`.
|
157
|
+
#
|
158
|
+
# @param column [Object]
|
159
|
+
# Column name.
|
160
|
+
# @param ddof [Integer]
|
161
|
+
# “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
|
162
|
+
# where N represents the number of elements.
|
163
|
+
# By default ddof is 1.
|
164
|
+
#
|
165
|
+
# @return [Expr]
|
166
|
+
#
|
167
|
+
# @example
|
168
|
+
# df = Polars::DataFrame.new(
|
169
|
+
# {
|
170
|
+
# "a" => [1, 8, 3],
|
171
|
+
# "b" => [4, 5, 2],
|
172
|
+
# "c" => ["foo", "bar", "foo"]
|
173
|
+
# }
|
174
|
+
# )
|
175
|
+
# df.select(Polars.std("a"))
|
176
|
+
# # =>
|
177
|
+
# # shape: (1, 1)
|
178
|
+
# # ┌──────────┐
|
179
|
+
# # │ a │
|
180
|
+
# # │ --- │
|
181
|
+
# # │ f64 │
|
182
|
+
# # ╞══════════╡
|
183
|
+
# # │ 3.605551 │
|
184
|
+
# # └──────────┘
|
185
|
+
#
|
186
|
+
# @example
|
187
|
+
# df["a"].std
|
188
|
+
# # => 3.605551275463989
|
189
|
+
def std(column, ddof: 1)
  # Sugar for `col(column).std(ddof: ddof)`.
  col(column).std(ddof: ddof)
end
|
192
|
+
|
193
|
+
# Get the variance.
|
194
|
+
#
|
195
|
+
# This function is syntactic sugar for `col(column).var(ddof: ddof)`.
|
196
|
+
#
|
197
|
+
# @param column [Object]
|
198
|
+
# Column name.
|
199
|
+
# @param ddof [Integer]
|
200
|
+
# “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
|
201
|
+
# where N represents the number of elements.
|
202
|
+
# By default ddof is 1.
|
203
|
+
#
|
204
|
+
# @return [Expr]
|
205
|
+
#
|
206
|
+
# @example
|
207
|
+
# df = Polars::DataFrame.new(
|
208
|
+
# {
|
209
|
+
# "a" => [1, 8, 3],
|
210
|
+
# "b" => [4, 5, 2],
|
211
|
+
# "c" => ["foo", "bar", "foo"]
|
212
|
+
# }
|
213
|
+
# )
|
214
|
+
# df.select(Polars.var("a"))
|
215
|
+
# # =>
|
216
|
+
# # shape: (1, 1)
|
217
|
+
# # ┌──────┐
|
218
|
+
# # │ a │
|
219
|
+
# # │ --- │
|
220
|
+
# # │ f64 │
|
221
|
+
# # ╞══════╡
|
222
|
+
# # │ 13.0 │
|
223
|
+
# # └──────┘
|
224
|
+
#
|
225
|
+
# @example
|
226
|
+
# df["a"].var
|
227
|
+
# # => 13.0
|
228
|
+
def var(column, ddof: 1)
  # Sugar for `col(column).var(ddof: ddof)`.
  col(column).var(ddof: ddof)
end
|
231
|
+
|
232
|
+
|
233
|
+
# Get the mean value.
|
234
|
+
#
|
235
|
+
# This function is syntactic sugar for `col(columns).mean`.
|
236
|
+
#
|
237
|
+
# @param columns [Array]
|
238
|
+
# One or more column names.
|
239
|
+
#
|
240
|
+
# @return [Expr]
|
241
|
+
#
|
242
|
+
# @example
|
243
|
+
# df = Polars::DataFrame.new(
|
244
|
+
# {
|
245
|
+
# "a" => [1, 8, 3],
|
246
|
+
# "b" => [4, 5, 2],
|
247
|
+
# "c" => ["foo", "bar", "foo"]
|
248
|
+
# }
|
249
|
+
# )
|
250
|
+
# df.select(Polars.mean("a"))
|
251
|
+
# # =>
|
252
|
+
# # shape: (1, 1)
|
253
|
+
# # ┌─────┐
|
254
|
+
# # │ a │
|
255
|
+
# # │ --- │
|
256
|
+
# # │ f64 │
|
257
|
+
# # ╞═════╡
|
258
|
+
# # │ 4.0 │
|
259
|
+
# # └─────┘
|
260
|
+
#
|
261
|
+
# @example
|
262
|
+
# df.select(Polars.mean("a", "b"))
|
263
|
+
# # =>
|
264
|
+
# # shape: (1, 2)
|
265
|
+
# # ┌─────┬──────────┐
|
266
|
+
# # │ a ┆ b │
|
267
|
+
# # │ --- ┆ --- │
|
268
|
+
# # │ f64 ┆ f64 │
|
269
|
+
# # ╞═════╪══════════╡
|
270
|
+
# # │ 4.0 ┆ 3.666667 │
|
271
|
+
# # └─────┴──────────┘
|
272
|
+
def mean(*columns)
  # Sugar for `col(*columns).mean`.
  col(*columns).mean
end
|
275
|
+
alias_method :avg, :mean
|
276
|
+
|
277
|
+
# Get the median value.
|
278
|
+
#
|
279
|
+
# This function is syntactic sugar for `col(columns).median`.
|
280
|
+
#
|
281
|
+
# @param columns [Array]
|
282
|
+
# One or more column names.
|
283
|
+
#
|
284
|
+
# @return [Expr]
|
285
|
+
#
|
286
|
+
# @example
|
287
|
+
# df = Polars::DataFrame.new(
|
288
|
+
# {
|
289
|
+
# "a" => [1, 8, 3],
|
290
|
+
# "b" => [4, 5, 2],
|
291
|
+
# "c" => ["foo", "bar", "foo"]
|
292
|
+
# }
|
293
|
+
# )
|
294
|
+
# df.select(Polars.median("a"))
|
295
|
+
# # =>
|
296
|
+
# # shape: (1, 1)
|
297
|
+
# # ┌─────┐
|
298
|
+
# # │ a │
|
299
|
+
# # │ --- │
|
300
|
+
# # │ f64 │
|
301
|
+
# # ╞═════╡
|
302
|
+
# # │ 3.0 │
|
303
|
+
# # └─────┘
|
304
|
+
#
|
305
|
+
# @example
|
306
|
+
# df.select(Polars.median("a", "b"))
|
307
|
+
# # =>
|
308
|
+
# # shape: (1, 2)
|
309
|
+
# # ┌─────┬─────┐
|
310
|
+
# # │ a ┆ b │
|
311
|
+
# # │ --- ┆ --- │
|
312
|
+
# # │ f64 ┆ f64 │
|
313
|
+
# # ╞═════╪═════╡
|
314
|
+
# # │ 3.0 ┆ 4.0 │
|
315
|
+
# # └─────┴─────┘
|
316
|
+
def median(*columns)
  # Sugar for `col(*columns).median`.
  col(*columns).median
end
|
319
|
+
|
320
|
+
# Count unique values.
|
321
|
+
#
|
322
|
+
# This function is syntactic sugar for `col(columns).n_unique`.
|
323
|
+
#
|
324
|
+
# @param columns [Array]
|
325
|
+
# One or more column names.
|
326
|
+
#
|
327
|
+
# @return [Expr]
|
328
|
+
#
|
329
|
+
# @example
|
330
|
+
# df = Polars::DataFrame.new(
|
331
|
+
# {
|
332
|
+
# "a" => [1, 8, 1],
|
333
|
+
# "b" => [4, 5, 2],
|
334
|
+
# "c" => ["foo", "bar", "foo"]
|
335
|
+
# }
|
336
|
+
# )
|
337
|
+
# df.select(Polars.n_unique("a"))
|
338
|
+
# # =>
|
339
|
+
# # shape: (1, 1)
|
340
|
+
# # ┌─────┐
|
341
|
+
# # │ a │
|
342
|
+
# # │ --- │
|
343
|
+
# # │ u32 │
|
344
|
+
# # ╞═════╡
|
345
|
+
# # │ 2 │
|
346
|
+
# # └─────┘
|
347
|
+
#
|
348
|
+
# @example
|
349
|
+
# df.select(Polars.n_unique("b", "c"))
|
350
|
+
# # =>
|
351
|
+
# # shape: (1, 2)
|
352
|
+
# # ┌─────┬─────┐
|
353
|
+
# # │ b ┆ c │
|
354
|
+
# # │ --- ┆ --- │
|
355
|
+
# # │ u32 ┆ u32 │
|
356
|
+
# # ╞═════╪═════╡
|
357
|
+
# # │ 3 ┆ 2 │
|
358
|
+
# # └─────┴─────┘
|
359
|
+
def n_unique(*columns)
  # Sugar for `col(*columns).n_unique`.
  col(*columns).n_unique
end
|
362
|
+
|
363
|
+
# Approximate count of unique values.
|
364
|
+
#
|
365
|
+
# This function is syntactic sugar for `col(columns).approx_n_unique`, and
|
366
|
+
# uses the HyperLogLog++ algorithm for cardinality estimation.
|
367
|
+
#
|
368
|
+
# @param columns [Array]
|
369
|
+
# One or more column names.
|
370
|
+
#
|
371
|
+
# @return [Expr]
|
372
|
+
#
|
373
|
+
# @example
|
374
|
+
# df = Polars::DataFrame.new(
|
375
|
+
# {
|
376
|
+
# "a" => [1, 8, 1],
|
377
|
+
# "b" => [4, 5, 2],
|
378
|
+
# "c" => ["foo", "bar", "foo"]
|
379
|
+
# }
|
380
|
+
# )
|
381
|
+
# df.select(Polars.approx_n_unique("a"))
|
382
|
+
# # =>
|
383
|
+
# # shape: (1, 1)
|
384
|
+
# # ┌─────┐
|
385
|
+
# # │ a │
|
386
|
+
# # │ --- │
|
387
|
+
# # │ u32 │
|
388
|
+
# # ╞═════╡
|
389
|
+
# # │ 2 │
|
390
|
+
# # └─────┘
|
391
|
+
#
|
392
|
+
# @example
|
393
|
+
# df.select(Polars.approx_n_unique("b", "c"))
|
394
|
+
# # =>
|
395
|
+
# # shape: (1, 2)
|
396
|
+
# # ┌─────┬─────┐
|
397
|
+
# # │ b ┆ c │
|
398
|
+
# # │ --- ┆ --- │
|
399
|
+
# # │ u32 ┆ u32 │
|
400
|
+
# # ╞═════╪═════╡
|
401
|
+
# # │ 3 ┆ 2 │
|
402
|
+
# # └─────┴─────┘
|
403
|
+
def approx_n_unique(*columns)
  # Sugar for `col(*columns).approx_n_unique`.
  col(*columns).approx_n_unique
end
|
406
|
+
|
407
|
+
# Get the first value.
|
408
|
+
#
|
409
|
+
# @param columns [Array]
|
410
|
+
# One or more column names. If not provided (default), returns an expression
|
411
|
+
# to take the first column of the context instead.
|
412
|
+
#
|
413
|
+
# @return [Expr]
|
414
|
+
#
|
415
|
+
# @example
|
416
|
+
# df = Polars::DataFrame.new(
|
417
|
+
# {
|
418
|
+
# "a" => [1, 8, 3],
|
419
|
+
# "b" => [4, 5, 2],
|
420
|
+
# "c" => ["foo", "bar", "baz"]
|
421
|
+
# }
|
422
|
+
# )
|
423
|
+
# df.select(Polars.first)
|
424
|
+
# # =>
|
425
|
+
# # shape: (3, 1)
|
426
|
+
# # ┌─────┐
|
427
|
+
# # │ a │
|
428
|
+
# # │ --- │
|
429
|
+
# # │ i64 │
|
430
|
+
# # ╞═════╡
|
431
|
+
# # │ 1 │
|
432
|
+
# # │ 8 │
|
433
|
+
# # │ 3 │
|
434
|
+
# # └─────┘
|
435
|
+
#
|
436
|
+
# @example
|
437
|
+
# df.select(Polars.first("b"))
|
438
|
+
# # =>
|
439
|
+
# # shape: (1, 1)
|
440
|
+
# # ┌─────┐
|
441
|
+
# # │ b │
|
442
|
+
# # │ --- │
|
443
|
+
# # │ i64 │
|
444
|
+
# # ╞═════╡
|
445
|
+
# # │ 4 │
|
446
|
+
# # └─────┘
|
447
|
+
#
|
448
|
+
# @example
|
449
|
+
# df.select(Polars.first("a", "c"))
|
450
|
+
# # =>
|
451
|
+
# # shape: (1, 2)
|
452
|
+
# # ┌─────┬─────┐
|
453
|
+
# # │ a ┆ c │
|
454
|
+
# # │ --- ┆ --- │
|
455
|
+
# # │ i64 ┆ str │
|
456
|
+
# # ╞═════╪═════╡
|
457
|
+
# # │ 1 ┆ foo │
|
458
|
+
# # └─────┴─────┘
|
459
|
+
def first(*columns)
  # Without column names, return the wrapped `Plr.first` expression.
  return Utils.wrap_expr(Plr.first) if columns.empty?

  # Otherwise this is sugar for `col(*columns).first`.
  col(*columns).first
end
|
466
|
+
|
467
|
+
# Get the last value.
|
468
|
+
#
|
469
|
+
# @param columns [Array]
|
470
|
+
# One or more column names. If not provided (default), returns an expression
|
471
|
+
# to take the last column of the context instead.
|
472
|
+
#
|
473
|
+
# @return [Expr]
|
474
|
+
#
|
475
|
+
# @example
|
476
|
+
# df = Polars::DataFrame.new(
|
477
|
+
# {
|
478
|
+
# "a" => [1, 8, 3],
|
479
|
+
# "b" => [4, 5, 2],
|
480
|
+
# "c" => ["foo", "bar", "baz"]
|
481
|
+
# }
|
482
|
+
# )
|
483
|
+
# df.select(Polars.last)
|
484
|
+
# # =>
|
485
|
+
# # shape: (3, 1)
|
486
|
+
# # ┌─────┐
|
487
|
+
# # │ c │
|
488
|
+
# # │ --- │
|
489
|
+
# # │ str │
|
490
|
+
# # ╞═════╡
|
491
|
+
# # │ foo │
|
492
|
+
# # │ bar │
|
493
|
+
# # │ baz │
|
494
|
+
# # └─────┘
|
495
|
+
#
|
496
|
+
# @example
|
497
|
+
# df.select(Polars.last("a"))
|
498
|
+
# # =>
|
499
|
+
# # shape: (1, 1)
|
500
|
+
# # ┌─────┐
|
501
|
+
# # │ a │
|
502
|
+
# # │ --- │
|
503
|
+
# # │ i64 │
|
504
|
+
# # ╞═════╡
|
505
|
+
# # │ 3 │
|
506
|
+
# # └─────┘
|
507
|
+
#
|
508
|
+
# @example
|
509
|
+
# df.select(Polars.last("b", "c"))
|
510
|
+
# # =>
|
511
|
+
# # shape: (1, 2)
|
512
|
+
# # ┌─────┬─────┐
|
513
|
+
# # │ b ┆ c │
|
514
|
+
# # │ --- ┆ --- │
|
515
|
+
# # │ i64 ┆ str │
|
516
|
+
# # ╞═════╪═════╡
|
517
|
+
# # │ 2 ┆ baz │
|
518
|
+
# # └─────┴─────┘
|
519
|
+
def last(*columns)
  # Without column names, return the wrapped `Plr.last` expression.
  return Utils.wrap_expr(Plr.last) if columns.empty?

  # Otherwise this is sugar for `col(*columns).last`.
  col(*columns).last
end
|
526
|
+
|
527
|
+
# Get the first `n` rows.
|
528
|
+
#
|
529
|
+
# This function is syntactic sugar for `col(column).head(n)`.
|
530
|
+
#
|
531
|
+
# @param column [Object]
|
532
|
+
# Column name.
|
533
|
+
# @param n [Integer]
|
534
|
+
# Number of rows to return.
|
535
|
+
#
|
536
|
+
# @return [Expr]
|
537
|
+
#
|
538
|
+
# @example
|
539
|
+
# df = Polars::DataFrame.new(
|
540
|
+
# {
|
541
|
+
# "a" => [1, 8, 3],
|
542
|
+
# "b" => [4, 5, 2],
|
543
|
+
# "c" => ["foo", "bar", "foo"]
|
544
|
+
# }
|
545
|
+
# )
|
546
|
+
# df.select(Polars.head("a"))
|
547
|
+
# # =>
|
548
|
+
# # shape: (3, 1)
|
549
|
+
# # ┌─────┐
|
550
|
+
# # │ a │
|
551
|
+
# # │ --- │
|
552
|
+
# # │ i64 │
|
553
|
+
# # ╞═════╡
|
554
|
+
# # │ 1 │
|
555
|
+
# # │ 8 │
|
556
|
+
# # │ 3 │
|
557
|
+
# # └─────┘
|
558
|
+
#
|
559
|
+
# @example
|
560
|
+
# df.select(Polars.head("a", 2))
|
561
|
+
# # =>
|
562
|
+
# # shape: (2, 1)
|
563
|
+
# # ┌─────┐
|
564
|
+
# # │ a │
|
565
|
+
# # │ --- │
|
566
|
+
# # │ i64 │
|
567
|
+
# # ╞═════╡
|
568
|
+
# # │ 1 │
|
569
|
+
# # │ 8 │
|
570
|
+
# # └─────┘
|
571
|
+
def head(column, n = 10)
  # Sugar for `col(column).head(n)`; `n` defaults to 10.
  col(column).head(n)
end
|
574
|
+
|
575
|
+
# Get the last `n` rows.
|
576
|
+
#
|
577
|
+
# This function is syntactic sugar for `col(column).tail(n)`.
|
578
|
+
#
|
579
|
+
# @param column [Object]
|
580
|
+
# Column name.
|
581
|
+
# @param n [Integer]
|
582
|
+
# Number of rows to return.
|
583
|
+
#
|
584
|
+
# @return [Expr]
|
585
|
+
#
|
586
|
+
# @example
|
587
|
+
# df = Polars::DataFrame.new(
|
588
|
+
# {
|
589
|
+
# "a" => [1, 8, 3],
|
590
|
+
# "b" => [4, 5, 2],
|
591
|
+
# "c" => ["foo", "bar", "foo"]
|
592
|
+
# }
|
593
|
+
# )
|
594
|
+
# df.select(Polars.tail("a"))
|
595
|
+
# # =>
|
596
|
+
# # shape: (3, 1)
|
597
|
+
# # ┌─────┐
|
598
|
+
# # │ a │
|
599
|
+
# # │ --- │
|
600
|
+
# # │ i64 │
|
601
|
+
# # ╞═════╡
|
602
|
+
# # │ 1 │
|
603
|
+
# # │ 8 │
|
604
|
+
# # │ 3 │
|
605
|
+
# # └─────┘
|
606
|
+
#
|
607
|
+
# @example
|
608
|
+
# df.select(Polars.tail("a", 2))
|
609
|
+
# # =>
|
610
|
+
# # shape: (2, 1)
|
611
|
+
# # ┌─────┐
|
612
|
+
# # │ a │
|
613
|
+
# # │ --- │
|
614
|
+
# # │ i64 │
|
615
|
+
# # ╞═════╡
|
616
|
+
# # │ 8 │
|
617
|
+
# # │ 3 │
|
618
|
+
# # └─────┘
|
619
|
+
def tail(column, n = 10)
  # Sugar for `col(column).tail(n)`; `n` defaults to 10.
  col(column).tail(n)
end
|
622
|
+
|
623
|
+
# Compute the Pearson's or Spearman rank correlation between two columns.
|
624
|
+
#
|
625
|
+
# @param a [Object]
|
626
|
+
# Column name or Expression.
|
627
|
+
# @param b [Object]
|
628
|
+
# Column name or Expression.
|
629
|
+
# @param ddof [Integer]
|
630
|
+
# "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
|
631
|
+
# where N represents the number of elements.
|
632
|
+
# By default ddof is 1.
|
633
|
+
# @param method ["pearson", "spearman"]
|
634
|
+
# Correlation method.
|
635
|
+
# @param propagate_nans [Boolean]
|
636
|
+
# If `true` any `NaN` encountered will lead to `NaN` in the output.
|
637
|
+
# Defaults to `false`, where `NaN` values are regarded as larger than any finite number
|
638
|
+
# and thus lead to the highest rank.
|
639
|
+
#
|
640
|
+
# @return [Expr]
|
641
|
+
#
|
642
|
+
# @example Pearson's correlation:
|
643
|
+
# df = Polars::DataFrame.new(
|
644
|
+
# {
|
645
|
+
# "a" => [1, 8, 3],
|
646
|
+
# "b" => [4, 5, 2],
|
647
|
+
# "c" => ["foo", "bar", "foo"]
|
648
|
+
# }
|
649
|
+
# )
|
650
|
+
# df.select(Polars.corr("a", "b"))
|
651
|
+
# # =>
|
652
|
+
# # shape: (1, 1)
|
653
|
+
# # ┌──────────┐
|
654
|
+
# # │ a │
|
655
|
+
# # │ --- │
|
656
|
+
# # │ f64 │
|
657
|
+
# # ╞══════════╡
|
658
|
+
# # │ 0.544705 │
|
659
|
+
# # └──────────┘
|
660
|
+
#
|
661
|
+
# @example Spearman rank correlation:
|
662
|
+
# df = Polars::DataFrame.new(
|
663
|
+
# {
|
664
|
+
# "a" => [1, 8, 3],
|
665
|
+
# "b" => [4, 5, 2],
|
666
|
+
# "c" => ["foo", "bar", "foo"]
|
667
|
+
# }
|
668
|
+
# )
|
669
|
+
# df.select(Polars.corr("a", "b", method: "spearman"))
|
670
|
+
# # =>
|
671
|
+
# # shape: (1, 1)
|
672
|
+
# # ┌─────┐
|
673
|
+
# # │ a │
|
674
|
+
# # │ --- │
|
675
|
+
# # │ f64 │
|
676
|
+
# # ╞═════╡
|
677
|
+
# # │ 0.5 │
|
678
|
+
# # └─────┘
|
679
|
+
def corr(
  a,
  b,
  method: "pearson",
  ddof: 1,
  propagate_nans: false
)
  # Both inputs are normalised through `Utils.parse_as_expression` before
  # being handed to the native correlation functions.
  a = Utils.parse_as_expression(a)
  b = Utils.parse_as_expression(b)

  if method == "pearson"
    # `propagate_nans` is only passed on in the Spearman branch.
    Utils.wrap_expr(Plr.pearson_corr(a, b, ddof))
  elsif method == "spearman"
    Utils.wrap_expr(Plr.spearman_rank_corr(a, b, ddof, propagate_nans))
  else
    # Single braces: Ruby string interpolation has no `{{`/`}}` escape, so
    # doubled braces would appear verbatim in the message.
    raise ArgumentError, "method must be one of {'pearson', 'spearman'}, got #{method}"
  end
end
|
698
|
+
|
699
|
+
# Compute the covariance between two columns/expressions.
|
700
|
+
#
|
701
|
+
# @param a [Object]
|
702
|
+
# Column name or Expression.
|
703
|
+
# @param b [Object]
|
704
|
+
# Column name or Expression.
|
705
|
+
# @param ddof [Integer]
|
706
|
+
# "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
|
707
|
+
# where N represents the number of elements.
|
708
|
+
# By default ddof is 1.
|
709
|
+
#
|
710
|
+
# @return [Expr]
|
711
|
+
#
|
712
|
+
# @example
|
713
|
+
# df = Polars::DataFrame.new(
|
714
|
+
# {
|
715
|
+
# "a" => [1, 8, 3],
|
716
|
+
# "b" => [4, 5, 2],
|
717
|
+
# "c" => ["foo", "bar", "foo"]
|
718
|
+
# }
|
719
|
+
# )
|
720
|
+
# df.select(Polars.cov("a", "b"))
|
721
|
+
# # =>
|
722
|
+
# # shape: (1, 1)
|
723
|
+
# # ┌─────┐
|
724
|
+
# # │ a │
|
725
|
+
# # │ --- │
|
726
|
+
# # │ f64 │
|
727
|
+
# # ╞═════╡
|
728
|
+
# # │ 3.0 │
|
729
|
+
# # └─────┘
|
730
|
+
def cov(a, b, ddof: 1)
  # Normalise both inputs, then wrap the native covariance expression.
  lhs = Utils.parse_as_expression(a)
  rhs = Utils.parse_as_expression(b)
  Utils.wrap_expr(Plr.cov(lhs, rhs, ddof))
end
|
735
|
+
|
736
|
+
# def map
|
737
|
+
# end
|
738
|
+
|
739
|
+
# def apply
|
740
|
+
# end
|
741
|
+
|
742
|
+
# Accumulate over multiple columns horizontally/row wise with a left fold.
|
743
|
+
#
|
744
|
+
# @return [Expr]
|
745
|
+
def fold(acc, f, exprs)
  # The accumulator is converted with `str_to_lit: true`, i.e. a string
  # accumulator is passed to the helper as a literal rather than a column.
  acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A lone expression is treated as a one-element list.
  exprs = [exprs] if exprs.is_a?(Expr)

  exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(Plr.fold(acc._rbexpr, f, exprs))
end
|
754
|
+
|
755
|
+
# def reduce
|
756
|
+
# end
|
757
|
+
|
758
|
+
# Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
|
759
|
+
#
|
760
|
+
# Every cumulative result is added as a separate field in a Struct column.
|
761
|
+
#
|
762
|
+
# @param acc [Object]
|
763
|
+
# Accumulator Expression. This is the value that will be initialized when the fold
|
764
|
+
# starts. For a sum this could for instance be lit(0).
|
765
|
+
# @param f [Object]
|
766
|
+
# Function to apply over the accumulator and the value.
|
767
|
+
# Fn(acc, value) -> new_value
|
768
|
+
# @param exprs [Object]
|
769
|
+
# Expressions to aggregate over. May also be a wildcard expression.
|
770
|
+
# @param include_init [Boolean]
|
771
|
+
# Include the initial accumulator state as struct field.
|
772
|
+
#
|
773
|
+
# @return [Object]
|
774
|
+
#
|
775
|
+
# @note
|
776
|
+
# If you simply want the first encountered expression as accumulator,
|
777
|
+
# consider using `cumreduce`.
|
778
|
+
def cum_fold(acc, f, exprs, include_init: false)
  # The accumulator is converted with `str_to_lit: true`, i.e. a string
  # accumulator is passed to the helper as a literal rather than a column.
  acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A lone expression is treated as a one-element list.
  exprs = [exprs] if exprs.is_a?(Expr)

  exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(Plr.cum_fold(acc._rbexpr, f, exprs, include_init))
end
|
787
|
+
alias_method :cumfold, :cum_fold
|
788
|
+
|
789
|
+
# def cum_reduce
|
790
|
+
# end
|
791
|
+
|
792
|
+
# Compute two argument arctan in radians.
|
793
|
+
#
|
794
|
+
# Returns the angle (in radians) in the plane between the
|
795
|
+
# positive x-axis and the ray from the origin to (x,y).
|
796
|
+
#
|
797
|
+
# @param y [Object]
|
798
|
+
# Column name or Expression.
|
799
|
+
# @param x [Object]
|
800
|
+
# Column name or Expression.
|
801
|
+
#
|
802
|
+
# @return [Expr]
|
803
|
+
#
|
804
|
+
# @example
|
805
|
+
# twoRootTwo = Math.sqrt(2) / 2
|
806
|
+
# df = Polars::DataFrame.new(
|
807
|
+
# {
|
808
|
+
# "y" => [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo],
|
809
|
+
# "x" => [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo]
|
810
|
+
# }
|
811
|
+
# )
|
812
|
+
# df.select(
|
813
|
+
# Polars.arctan2d("y", "x").alias("atan2d"), Polars.arctan2("y", "x").alias("atan2")
|
814
|
+
# )
|
815
|
+
# # =>
|
816
|
+
# # shape: (4, 2)
|
817
|
+
# # ┌────────┬───────────┐
|
818
|
+
# # │ atan2d ┆ atan2 │
|
819
|
+
# # │ --- ┆ --- │
|
820
|
+
# # │ f64 ┆ f64 │
|
821
|
+
# # ╞════════╪═══════════╡
|
822
|
+
# # │ 45.0 ┆ 0.785398 │
|
823
|
+
# # │ -45.0 ┆ -0.785398 │
|
824
|
+
# # │ 135.0 ┆ 2.356194 │
|
825
|
+
# # │ -135.0 ┆ -2.356194 │
|
826
|
+
# # └────────┴───────────┘
|
827
|
+
def arctan2(y, x)
  # String-like arguments are turned into column expressions; anything
  # else is expected to already respond to `_rbexpr`.
  y = col(y) if Utils.strlike?(y)
  x = col(x) if Utils.strlike?(x)

  Utils.wrap_expr(Plr.arctan2(y._rbexpr, x._rbexpr))
end
|
836
|
+
|
837
|
+
# Compute two argument arctan in degrees.
|
838
|
+
#
|
839
|
+
# Returns the angle (in degrees) in the plane between the positive x-axis
|
840
|
+
# and the ray from the origin to (x,y).
|
841
|
+
#
|
842
|
+
# @param y [Object]
|
843
|
+
# Column name or Expression.
|
844
|
+
# @param x [Object]
|
845
|
+
# Column name or Expression.
|
846
|
+
#
|
847
|
+
# @return [Expr]
|
848
|
+
#
|
849
|
+
# @example
|
850
|
+
# twoRootTwo = Math.sqrt(2) / 2
|
851
|
+
# df = Polars::DataFrame.new(
|
852
|
+
# {
|
853
|
+
# "y" => [twoRootTwo, -twoRootTwo, twoRootTwo, -twoRootTwo],
|
854
|
+
# "x" => [twoRootTwo, twoRootTwo, -twoRootTwo, -twoRootTwo]
|
855
|
+
# }
|
856
|
+
# )
|
857
|
+
# df.select(
|
858
|
+
# Polars.arctan2d("y", "x").alias("atan2d"), Polars.arctan2("y", "x").alias("atan2")
|
859
|
+
# )
|
860
|
+
# # =>
|
861
|
+
# # shape: (4, 2)
|
862
|
+
# # ┌────────┬───────────┐
|
863
|
+
# # │ atan2d ┆ atan2 │
|
864
|
+
# # │ --- ┆ --- │
|
865
|
+
# # │ f64 ┆ f64 │
|
866
|
+
# # ╞════════╪═══════════╡
|
867
|
+
# # │ 45.0 ┆ 0.785398 │
|
868
|
+
# # │ -45.0 ┆ -0.785398 │
|
869
|
+
# # │ 135.0 ┆ 2.356194 │
|
870
|
+
# # │ -135.0 ┆ -2.356194 │
|
871
|
+
# # └────────┴───────────┘
|
872
|
+
def arctan2d(y, x)
  # String-like arguments are turned into column expressions; anything
  # else is expected to already respond to `_rbexpr`.
  y = col(y) if Utils.strlike?(y)
  x = col(x) if Utils.strlike?(x)

  Utils.wrap_expr(Plr.arctan2d(y._rbexpr, x._rbexpr))
end
|
881
|
+
|
882
|
+
# Exclude certain columns from a wildcard/regex selection.
|
883
|
+
#
|
884
|
+
# @param columns [Object]
|
885
|
+
# Column(s) to exclude from selection
|
886
|
+
# This can be:
|
887
|
+
#
|
888
|
+
# - a column name, or multiple column names
|
889
|
+
# - a regular expression starting with `^` and ending with `$`
|
890
|
+
# - a dtype or multiple dtypes
|
891
|
+
#
|
892
|
+
# @return [Object]
|
893
|
+
#
|
894
|
+
# @example
|
895
|
+
# df = Polars::DataFrame.new(
|
896
|
+
# {
|
897
|
+
# "aa" => [1, 2, 3],
|
898
|
+
# "ba" => ["a", "b", nil],
|
899
|
+
# "cc" => [nil, 2.5, 1.5]
|
900
|
+
# }
|
901
|
+
# )
|
902
|
+
# # =>
|
903
|
+
# # shape: (3, 3)
|
904
|
+
# # ┌─────┬──────┬──────┐
|
905
|
+
# # │ aa ┆ ba ┆ cc │
|
906
|
+
# # │ --- ┆ --- ┆ --- │
|
907
|
+
# # │ i64 ┆ str ┆ f64 │
|
908
|
+
# # ╞═════╪══════╪══════╡
|
909
|
+
# # │ 1 ┆ a ┆ null │
|
910
|
+
# # │ 2 ┆ b ┆ 2.5 │
|
911
|
+
# # │ 3 ┆ null ┆ 1.5 │
|
912
|
+
# # └─────┴──────┴──────┘
|
913
|
+
#
|
914
|
+
# @example Exclude by column name(s):
|
915
|
+
# df.select(Polars.exclude("ba"))
|
916
|
+
# # =>
|
917
|
+
# # shape: (3, 2)
|
918
|
+
# # ┌─────┬──────┐
|
919
|
+
# # │ aa ┆ cc │
|
920
|
+
# # │ --- ┆ --- │
|
921
|
+
# # │ i64 ┆ f64 │
|
922
|
+
# # ╞═════╪══════╡
|
923
|
+
# # │ 1 ┆ null │
|
924
|
+
# # │ 2 ┆ 2.5 │
|
925
|
+
# # │ 3 ┆ 1.5 │
|
926
|
+
# # └─────┴──────┘
|
927
|
+
#
|
928
|
+
# @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
|
929
|
+
# df.select(Polars.exclude("^.*a$"))
|
930
|
+
# # =>
|
931
|
+
# # shape: (3, 1)
|
932
|
+
# # ┌──────┐
|
933
|
+
# # │ cc │
|
934
|
+
# # │ --- │
|
935
|
+
# # │ f64 │
|
936
|
+
# # ╞══════╡
|
937
|
+
# # │ null │
|
938
|
+
# # │ 2.5 │
|
939
|
+
# # │ 1.5 │
|
940
|
+
# # └──────┘
|
941
|
+
def exclude(columns)
  # Start from the wildcard selection and remove `columns` from it.
  col("*").exclude(columns)
end
|
944
|
+
|
945
|
+
# Syntactic sugar for `Polars.col("foo").agg_groups`.
|
946
|
+
#
|
947
|
+
# @return [Object]
|
948
|
+
def groups(column)
  # Sugar for `col(column).agg_groups`.
  col(column).agg_groups
end
|
951
|
+
|
952
|
+
# Syntactic sugar for `Polars.col("foo").quantile(...)`.
|
953
|
+
#
|
954
|
+
# @param column [String]
|
955
|
+
# Column name.
|
956
|
+
# @param quantile [Float]
|
957
|
+
# Quantile between 0.0 and 1.0.
|
958
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
959
|
+
# Interpolation method.
|
960
|
+
#
|
961
|
+
# @return [Expr]
|
962
|
+
def quantile(column, quantile, interpolation: "nearest")
  # Inside the body `quantile` is the parameter, not a recursive call.
  col(column).quantile(quantile, interpolation: interpolation)
end
|
965
|
+
|
966
|
+
# Find the indexes that would sort the columns.
|
967
|
+
#
|
968
|
+
# Argsort by multiple columns. The first column will be used for the ordering.
|
969
|
+
# If there are duplicates in the first column, the second column will be used to
|
970
|
+
# determine the ordering and so on.
|
971
|
+
#
|
972
|
+
# @param exprs [Object]
|
973
|
+
# Columns used to determine the ordering.
|
974
|
+
# @param reverse [Boolean]
|
975
|
+
# Default is ascending.
|
976
|
+
#
|
977
|
+
# @return [Expr]
|
978
|
+
def arg_sort_by(exprs, reverse: false)
  # Accept a single sort key as well as an array of keys.
  exprs = [exprs] unless exprs.is_a?(::Array)

  # A single boolean is repeated once per sort key; any other value
  # (e.g. an array of flags) is passed through as given.
  reverse = [reverse] * exprs.length if reverse == true || reverse == false

  exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse))
end
|
988
|
+
alias_method :argsort_by, :arg_sort_by
|
989
|
+
|
990
|
+
# Collect multiple LazyFrames at the same time.
|
991
|
+
#
|
992
|
+
# This runs all the computation graphs in parallel on Polars threadpool.
|
993
|
+
#
|
994
|
+
# @param lazy_frames [Array]
|
995
|
+
# A list of LazyFrames to collect.
|
996
|
+
# @param type_coercion [Boolean]
|
997
|
+
# Do type coercion optimization.
|
998
|
+
# @param predicate_pushdown [Boolean]
|
999
|
+
# Do predicate pushdown optimization.
|
1000
|
+
# @param projection_pushdown [Boolean]
|
1001
|
+
# Do projection pushdown optimization.
|
1002
|
+
# @param simplify_expression [Boolean]
|
1003
|
+
# Run simplify expressions optimization.
|
1004
|
+
# @param string_cache [Boolean]
|
1005
|
+
# This argument is deprecated and will be ignored
|
1006
|
+
# @param no_optimization [Boolean]
|
1007
|
+
# Turn off optimizations.
|
1008
|
+
# @param slice_pushdown [Boolean]
|
1009
|
+
# Slice pushdown optimization.
|
1010
|
+
# @param common_subplan_elimination [Boolean]
|
1011
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
1012
|
+
# @param allow_streaming [Boolean]
|
1013
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
1014
|
+
#
|
1015
|
+
# @return [Array]
|
1016
|
+
def collect_all(
  lazy_frames,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  allow_streaming: false
)
  # `string_cache` is deprecated: it is still accepted so existing callers
  # keep working, but it is never read below.

  # `no_optimization` overrides the pushdown and subplan-elimination flags.
  # `type_coercion` and `simplify_expression` keep the caller's values.
  if no_optimization
    predicate_pushdown = false
    projection_pushdown = false
    slice_pushdown = false
    common_subplan_elimination = false
  end

  # Apply the optimization flags to each frame's underlying native plan.
  prepared = lazy_frames.map do |lf|
    lf._ldf.optimization_toggle(
      type_coercion,
      predicate_pushdown,
      projection_pushdown,
      simplify_expression,
      slice_pushdown,
      common_subplan_elimination,
      allow_streaming,
      false # NOTE(review): constant trailing flag; meaning is defined by the native binding - confirm
    )
  end

  # Collect everything in one native call, then wrap each native result
  # back into a DataFrame.
  Plr.collect_all(prepared).map { |rbdf| Utils.wrap_df(rbdf) }
end
|
1058
|
+
|
1059
|
+
# Run polars expressions without a context.
|
1060
|
+
#
|
1061
|
+
# This is syntactic sugar for running `df.select` on an empty DataFrame.
|
1062
|
+
#
|
1063
|
+
# @param exprs [Array]
|
1064
|
+
# Column(s) to select, specified as positional arguments.
|
1065
|
+
# Accepts expression input. Strings are parsed as column names,
|
1066
|
+
# other non-expression inputs are parsed as literals.
|
1067
|
+
# @param named_exprs [Hash]
|
1068
|
+
# Additional columns to select, specified as keyword arguments.
|
1069
|
+
# The columns will be renamed to the keyword used.
|
1070
|
+
#
|
1071
|
+
# @return [DataFrame]
|
1072
|
+
#
|
1073
|
+
# @example
|
1074
|
+
# foo = Polars::Series.new("foo", [1, 2, 3])
|
1075
|
+
# bar = Polars::Series.new("bar", [3, 2, 1])
|
1076
|
+
# Polars.select(min: Polars.min_horizontal(foo, bar))
|
1077
|
+
# # =>
|
1078
|
+
# # shape: (3, 1)
|
1079
|
+
# # ┌─────┐
|
1080
|
+
# # │ min │
|
1081
|
+
# # │ --- │
|
1082
|
+
# # │ i64 │
|
1083
|
+
# # ╞═════╡
|
1084
|
+
# # │ 1 │
|
1085
|
+
# # │ 2 │
|
1086
|
+
# # │ 1 │
|
1087
|
+
# # └─────┘
|
1088
|
+
def select(*exprs, **named_exprs)
  # Start from a DataFrame built from an empty Array and forward every
  # positional and keyword expression to its `select`.
  empty_frame = DataFrame.new([])
  empty_frame.select(*exprs, **named_exprs)
end
|
1091
|
+
|
1092
|
+
# Return indices where `condition` evaluates `true`.
|
1093
|
+
#
|
1094
|
+
# @param condition [Expr]
|
1095
|
+
# Boolean expression to evaluate
|
1096
|
+
# @param eager [Boolean]
|
1097
|
+
# Whether to apply this function eagerly (as opposed to lazily).
|
1098
|
+
#
|
1099
|
+
# @return [Expr, Series]
|
1100
|
+
#
|
1101
|
+
# @example
|
1102
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
|
1103
|
+
# df.select(
|
1104
|
+
# [
|
1105
|
+
# Polars.arg_where(Polars.col("a") % 2 == 0)
|
1106
|
+
# ]
|
1107
|
+
# ).to_series
|
1108
|
+
# # =>
|
1109
|
+
# # shape: (2,)
|
1110
|
+
# # Series: 'a' [u32]
|
1111
|
+
# # [
|
1112
|
+
# # 1
|
1113
|
+
# # 3
|
1114
|
+
# # ]
|
1115
|
+
def arg_where(condition, eager: false)
  # Lazy path: coerce the input to an expression and wrap the native call.
  unless eager
    predicate = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(Plr.arg_where(predicate._rbexpr))
  end

  # Eager path only accepts a Series.
  unless condition.is_a?(Series)
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager: true', got #{condition.class.name}"
  end

  # Put the Series in a one-column frame, run the lazy variant against that
  # column by name, and return the result as a Series.
  frame = condition.to_frame
  frame.select(arg_where(Polars.col(condition.name))).to_series
end
|
1126
|
+
|
1127
|
+
# Folds the columns from left to right, keeping the first non-null value.
|
1128
|
+
#
|
1129
|
+
# @param exprs [Array]
|
1130
|
+
# Columns to coalesce. Accepts expression input. Strings are parsed as column
|
1131
|
+
# names, other non-expression inputs are parsed as literals.
|
1132
|
+
# @param more_exprs [Array]
|
1133
|
+
# Additional columns to coalesce, specified as positional arguments.
|
1134
|
+
#
|
1135
|
+
# @return [Expr]
|
1136
|
+
#
|
1137
|
+
# @example
|
1138
|
+
# df = Polars::DataFrame.new(
|
1139
|
+
# {
|
1140
|
+
# "a" => [1, nil, nil, nil],
|
1141
|
+
# "b" => [1, 2, nil, nil],
|
1142
|
+
# "c" => [5, nil, 3, nil]
|
1143
|
+
# }
|
1144
|
+
# )
|
1145
|
+
# df.with_columns(Polars.coalesce(["a", "b", "c", 10]).alias("d"))
|
1146
|
+
# # =>
|
1147
|
+
# # shape: (4, 4)
|
1148
|
+
# # ┌──────┬──────┬──────┬─────┐
|
1149
|
+
# # │ a ┆ b ┆ c ┆ d │
|
1150
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1151
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 │
|
1152
|
+
# # ╞══════╪══════╪══════╪═════╡
|
1153
|
+
# # │ 1 ┆ 1 ┆ 5 ┆ 1 │
|
1154
|
+
# # │ null ┆ 2 ┆ null ┆ 2 │
|
1155
|
+
# # │ null ┆ null ┆ 3 ┆ 3 │
|
1156
|
+
# # │ null ┆ null ┆ null ┆ 10 │
|
1157
|
+
# # └──────┴──────┴──────┴─────┘
|
1158
|
+
#
|
1159
|
+
# @example
|
1160
|
+
# df.with_columns(Polars.coalesce(Polars.col(["a", "b", "c"]), 10.0).alias("d"))
|
1161
|
+
# # =>
|
1162
|
+
# # shape: (4, 4)
|
1163
|
+
# # ┌──────┬──────┬──────┬──────┐
|
1164
|
+
# # │ a ┆ b ┆ c ┆ d │
|
1165
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1166
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ f64 │
|
1167
|
+
# # ╞══════╪══════╪══════╪══════╡
|
1168
|
+
# # │ 1 ┆ 1 ┆ 5 ┆ 1.0 │
|
1169
|
+
# # │ null ┆ 2 ┆ null ┆ 2.0 │
|
1170
|
+
# # │ null ┆ null ┆ 3 ┆ 3.0 │
|
1171
|
+
# # │ null ┆ null ┆ null ┆ 10.0 │
|
1172
|
+
# # └──────┴──────┴──────┴──────┘
|
1173
|
+
def coalesce(exprs, *more_exprs)
  # Merge the first argument and any extra positional arguments into one
  # list of native expressions before delegating.
  rbexprs = Utils.parse_as_list_of_expressions(exprs, *more_exprs)
  native = Plr.coalesce(rbexprs)
  Utils.wrap_expr(native)
end
|
1177
|
+
|
1178
|
+
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
|
1179
|
+
#
|
1180
|
+
# Depending on the `unit` provided, this function will return a different dtype:
|
1181
|
+
# - unit: "d" returns pl.Date
|
1182
|
+
# - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
|
1183
|
+
# - unit: "ms" returns pl.Datetime["ms"]
|
1184
|
+
# - unit: "us" returns pl.Datetime["us"]
|
1185
|
+
# - unit: "ns" returns pl.Datetime["ns"]
|
1186
|
+
#
|
1187
|
+
# @param column [Object]
|
1188
|
+
# Series or expression to parse integers to pl.Datetime.
|
1189
|
+
# @param unit [String]
|
1190
|
+
# The unit of the timesteps since epoch time.
|
1191
|
+
# @param eager [Boolean]
|
1192
|
+
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
1193
|
+
#
|
1194
|
+
# @return [Object]
|
1195
|
+
#
|
1196
|
+
# @example
|
1197
|
+
# df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
|
1198
|
+
# df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
|
1199
|
+
# # =>
|
1200
|
+
# # shape: (2, 1)
|
1201
|
+
# # ┌─────────────────────┐
|
1202
|
+
# # │ timestamp │
|
1203
|
+
# # │ --- │
|
1204
|
+
# # │ datetime[μs] │
|
1205
|
+
# # ╞═════════════════════╡
|
1206
|
+
# # │ 2022-10-25 07:31:17 │
|
1207
|
+
# # │ 2022-10-25 07:31:39 │
|
1208
|
+
# # └─────────────────────┘
|
1209
|
+
def from_epoch(column, unit: "s", eager: false)
  # Normalize the input: string-like values become column references, and
  # anything that is neither a Series nor an Expr is passed to Series.new.
  if Utils.strlike?(column)
    column = col(column)
  elsif !column.is_a?(Series) && !column.is_a?(Expr)
    column = Series.new(column)
  end

  expr =
    if unit == "d"
      column.cast(Date)
    elsif unit == "s"
      # Seconds are scaled to microseconds before casting to Datetime("us").
      (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
    elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
      column.cast(Datetime.new(unit))
    else
      # Single braces: the previous "{{...}}" was Python f-string escaping and
      # was printed literally by Ruby.
      raise ArgumentError, "'unit' must be one of {'ns', 'us', 'ms', 's', 'd'}, got '#{unit}'."
    end

  # Lazy callers get the expression itself.
  return expr unless eager

  # Eager evaluation needs concrete data, so only a Series is accepted here.
  unless column.is_a?(Series)
    raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
  end

  column.to_frame.select(expr).to_series
end
|
1236
|
+
|
1237
|
+
# Parse one or more SQL expressions to polars expression(s).
|
1238
|
+
#
|
1239
|
+
# @param sql [Object]
|
1240
|
+
# One or more SQL expressions.
|
1241
|
+
#
|
1242
|
+
# @return [Expr]
|
1243
|
+
#
|
1244
|
+
# @example Parse a single SQL expression:
|
1245
|
+
# df = Polars::DataFrame.new({"a" => [2, 1]})
|
1246
|
+
# expr = Polars.sql_expr("MAX(a)")
|
1247
|
+
# df.select(expr)
|
1248
|
+
# # =>
|
1249
|
+
# # shape: (1, 1)
|
1250
|
+
# # ┌─────┐
|
1251
|
+
# # │ a │
|
1252
|
+
# # │ --- │
|
1253
|
+
# # │ i64 │
|
1254
|
+
# # ╞═════╡
|
1255
|
+
# # │ 2 │
|
1256
|
+
# # └─────┘
|
1257
|
+
#
|
1258
|
+
# @example Parse multiple SQL expressions:
|
1259
|
+
# df.with_columns(
|
1260
|
+
# *Polars.sql_expr(["POWER(a,a) AS a_a", "CAST(a AS TEXT) AS a_txt"])
|
1261
|
+
# )
|
1262
|
+
# # =>
|
1263
|
+
# # shape: (2, 3)
|
1264
|
+
# # ┌─────┬─────┬───────┐
|
1265
|
+
# # │ a ┆ a_a ┆ a_txt │
|
1266
|
+
# # │ --- ┆ --- ┆ --- │
|
1267
|
+
# # │ i64 ┆ i64 ┆ str │
|
1268
|
+
# # ╞═════╪═════╪═══════╡
|
1269
|
+
# # │ 2 ┆ 4 ┆ 2 │
|
1270
|
+
# # │ 1 ┆ 1 ┆ 1 │
|
1271
|
+
# # └─────┴─────┴───────┘
|
1272
|
+
def sql_expr(sql)
  # Shared conversion from one SQL string to a wrapped expression.
  parse = ->(query) { Utils.wrap_expr(Plr.sql_expr(query)) }

  # A String is one expression; any other input is mapped element by element.
  return parse.call(sql) if sql.is_a?(::String)

  sql.map { |query| parse.call(query) }
end
|
1279
|
+
end
|
1280
|
+
end
|