polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -1
- data/Cargo.lock +159 -66
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +3112 -1613
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +453 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/batched_csv_reader.rb +4 -2
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +306 -96
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +41 -18
- data/lib/polars/date_time_name_space.rb +9 -3
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +898 -215
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +36 -31
- data/lib/polars/lazy_frame.rb +405 -88
- data/lib/polars/list_expr.rb +158 -8
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +282 -41
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +413 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +106 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +16 -4
- metadata +34 -6
- data/lib/polars/lazy_functions.rb +0 -1181
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
@@ -1,1181 +0,0 @@
|
|
1
|
-
module Polars
|
2
|
-
module LazyFunctions
|
3
|
-
# Return an expression representing a column in a DataFrame.
|
4
|
-
#
|
5
|
-
# @return [Expr]
|
6
|
-
def col(name)
  # Normalise the input first: a Series becomes its values, a DataType
  # subclass becomes a one-element list of dtypes.
  name = name.to_a if name.is_a?(Series)
  name = [name] if name.is_a?(Class) && name < DataType

  # A DataType instance selects columns by dtype.
  return Utils.wrap_expr(_dtype_cols([name])) if name.is_a?(DataType)

  # Anything that is not an Array is treated as a single column name.
  unless name.is_a?(::Array)
    single = name.is_a?(Symbol) ? name.to_s : name
    return Utils.wrap_expr(RbExpr.col(single))
  end

  # Arrays must hold either all names or all dtypes; the first element decides.
  if name.length == 0 || Utils.strlike?(name[0])
    names = name.map { |entry| entry.is_a?(Symbol) ? entry.to_s : entry }
    return Utils.wrap_expr(RbExpr.cols(names))
  end

  unless Utils.is_polars_dtype(name[0])
    raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
  end

  Utils.wrap_expr(_dtype_cols(name))
end
|
31
|
-
|
32
|
-
# Alias for an element being evaluated in an `eval` expression.
|
33
|
-
#
|
34
|
-
# @return [Expr]
|
35
|
-
#
|
36
|
-
# @example A horizontal rank computation by taking the elements of a list
|
37
|
-
# df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
|
38
|
-
# df.with_column(
|
39
|
-
# Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
|
40
|
-
# )
|
41
|
-
# # =>
|
42
|
-
# # shape: (3, 3)
|
43
|
-
# # ┌─────┬─────┬────────────┐
|
44
|
-
# # │ a ┆ b ┆ rank │
|
45
|
-
# # │ --- ┆ --- ┆ --- │
|
46
|
-
# # │ i64 ┆ i64 ┆ list[f64] │
|
47
|
-
# # ╞═════╪═════╪════════════╡
|
48
|
-
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
|
49
|
-
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
|
50
|
-
# # │ 3 ┆ 2 ┆ [2.0, 1.0] │
|
51
|
-
# # └─────┴─────┴────────────┘
|
52
|
-
def element
  # The element is addressed as the column with an empty name.
  unnamed = ""
  col(unnamed)
end
|
55
|
-
|
56
|
-
# Count the number of values in this column/context.
|
57
|
-
#
|
58
|
-
# @param column [String, Series, nil]
|
59
|
-
# If dtype is:
|
60
|
-
#
|
61
|
-
# * `Series` : count the values in the series.
|
62
|
-
# * `String` : count the values in this column.
|
63
|
-
# * `nil` : count the number of values in this context.
|
64
|
-
#
|
65
|
-
# @return [Expr, Integer]
|
66
|
-
def count(column = nil)
  # Without an argument, build the context-wide count expression.
  return Utils.wrap_expr(RbExpr.count) if column.nil?

  # A Series reports its length directly; anything else is resolved via `col`.
  column.is_a?(Series) ? column.len : col(column).count
end
|
77
|
-
|
78
|
-
# Aggregate to list.
|
79
|
-
#
|
80
|
-
# @return [Expr]
|
81
|
-
def to_list(name)
  # Resolve the column, then hand back its `list` accessor.
  selected = col(name)
  selected.list
end
|
84
|
-
|
85
|
-
# Get the standard deviation.
|
86
|
-
#
|
87
|
-
# @return [Object]
|
88
|
-
def std(column, ddof: 1)
  # A Series is used as-is; any other input is turned into a column expression.
  target = column.is_a?(Series) ? column : col(column)
  target.std(ddof: ddof)
end
|
95
|
-
|
96
|
-
# Get the variance.
|
97
|
-
#
|
98
|
-
# @return [Object]
|
99
|
-
def var(column, ddof: 1)
  # A Series is used as-is; any other input is turned into a column expression.
  target = column.is_a?(Series) ? column : col(column)
  target.var(ddof: ddof)
end
|
106
|
-
|
107
|
-
# Get the maximum value.
|
108
|
-
#
|
109
|
-
# @param column [Object]
|
110
|
-
# Column(s) to be used in aggregation.
|
111
|
-
#
|
112
|
-
# @return [Expr, Object]
|
113
|
-
def max(column)
  # Series input calls Series#max; otherwise `max` is called on the column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.max
end
|
120
|
-
|
121
|
-
# Get the minimum value.
|
122
|
-
#
|
123
|
-
# @param column [Object]
|
124
|
-
# Column(s) to be used in aggregation.
|
125
|
-
#
|
126
|
-
# @return [Expr, Object]
|
127
|
-
def min(column)
  # Series input calls Series#min; otherwise `min` is called on the column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.min
end
|
134
|
-
|
135
|
-
# Sum values in a column/Series, or horizontally across list of columns/expressions.
|
136
|
-
#
|
137
|
-
# @return [Object]
|
138
|
-
def sum(column)
  # Series: delegate to Series#sum.
  return column.sum if column.is_a?(Series)

  # String-like: sum of the named column.
  return col(column.to_s).sum if Utils.strlike?(column)

  # Array: horizontal sum across the listed columns/expressions.
  if column.is_a?(::Array)
    rbexprs = Utils.selection_to_rbexpr_list(column)
    return Utils.wrap_expr(_sum_horizontal(rbexprs))
  end

  # Anything else: fold with addition, starting from a u32 zero literal.
  adder = ->(a, b) { a + b }
  fold(lit(0).cast(:u32), adder, column).alias("sum")
end
|
150
|
-
|
151
|
-
# Get the mean value.
|
152
|
-
#
|
153
|
-
# @return [Expr, Float]
|
154
|
-
def mean(column)
  # Series input calls Series#mean; otherwise `mean` is called on the column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.mean
end
|
161
|
-
|
162
|
-
# Get the mean value.
|
163
|
-
#
|
164
|
-
# @return [Expr, Float]
|
165
|
-
def avg(column)
  # Alternate name for `mean`; simply forwards the argument.
  averaged = mean(column)
  averaged
end
|
168
|
-
|
169
|
-
# Get the median value.
|
170
|
-
#
|
171
|
-
# @return [Object]
|
172
|
-
def median(column)
  # Series input calls Series#median; otherwise `median` is called on the column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.median
end
|
179
|
-
|
180
|
-
# Count unique values.
|
181
|
-
#
|
182
|
-
# @return [Object]
|
183
|
-
def n_unique(column)
  # Series input calls Series#n_unique; otherwise `n_unique` is called on the column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.n_unique
end
|
190
|
-
|
191
|
-
# Get the first value.
|
192
|
-
#
|
193
|
-
# @return [Object]
|
194
|
-
def first(column = nil)
  # No argument: expression built from RbExpr.first.
  return Utils.wrap_expr(RbExpr.first) if column.nil?

  # Non-Series input is resolved to a column expression.
  return col(column).first unless column.is_a?(Series)

  # Series input: return its first element, refusing empty Series.
  unless column.len > 0
    raise IndexError, "The series is empty, so no first value can be returned."
  end

  column[0]
end
|
209
|
-
|
210
|
-
# Get the last value.
|
211
|
-
#
|
212
|
-
# Depending on the input type this function does different things:
|
213
|
-
#
|
214
|
-
# - nil -> expression to take last column of a context.
|
215
|
-
# - String -> syntactic sugar for `Polars.col(..).last`
|
216
|
-
# - Series -> Take last value in `Series`
|
217
|
-
#
|
218
|
-
# @return [Object]
|
219
|
-
def last(column = nil)
  # No argument: expression built from the `_last` helper.
  return Utils.wrap_expr(_last) if column.nil?

  # Non-Series input is resolved to a column expression.
  return col(column).last unless column.is_a?(Series)

  # Series input: return its final element, refusing empty Series.
  unless column.len > 0
    raise IndexError, "The series is empty, so no last value can be returned"
  end

  column[-1]
end
|
233
|
-
|
234
|
-
# Get the first `n` rows.
|
235
|
-
#
|
236
|
-
# @param column [Object]
|
237
|
-
# Column name or Series.
|
238
|
-
# @param n [Integer]
|
239
|
-
# Number of rows to return.
|
240
|
-
#
|
241
|
-
# @return [Object]
|
242
|
-
def head(column, n = 10)
  # A Series is used directly; other input becomes a column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.head(n)
end
|
249
|
-
|
250
|
-
# Get the last `n` rows.
|
251
|
-
#
|
252
|
-
# @param column [Object]
|
253
|
-
# Column name or Series.
|
254
|
-
# @param n [Integer]
|
255
|
-
# Number of rows to return.
|
256
|
-
#
|
257
|
-
# @return [Object]
|
258
|
-
def tail(column, n = 10)
  # A Series is used directly; other input becomes a column expression.
  source = column.is_a?(Series) ? column : col(column)
  source.tail(n)
end
|
265
|
-
|
266
|
-
# Return an expression representing a literal value.
|
267
|
-
#
|
268
|
-
# @return [Expr]
|
269
|
-
def lit(value, dtype: nil, allow_object: nil)
  # Build a literal expression from a Ruby value.
  #
  # value        - object to embed: Time/DateTime, Date, Polars::Series,
  #                Array, Numo::NArray (when loaded), or any value that
  #                RbExpr.lit accepts.
  # dtype        - optional dtype. For Time/DateTime values its `time_unit`
  #                and `time_zone` are read; for plain values the literal
  #                is cast to it.
  # allow_object - forwarded unchanged to RbExpr.lit.
  #
  # Returns the wrapped expression.
  if value.is_a?(::Time) || value.is_a?(::DateTime)
    time_unit = dtype&.time_unit || "ns"
    # Safe navigation (`&.`) is required here: `dtype.&time_zone` invoked the
    # `&` operator method on dtype instead of reading its `time_zone`.
    time_zone = dtype&.time_zone
    e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
    if time_zone
      return e.dt.replace_time_zone(time_zone.to_s)
    else
      return e
    end
  elsif value.is_a?(::Date)
    # Dates go through a UTC midnight timestamp and are then cast to Date.
    return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
  elsif value.is_a?(Polars::Series)
    name = value.name
    value = value._s
    e = Utils.wrap_expr(RbExpr.lit(value, allow_object))
    # Only alias when the Series actually carries a name.
    if name == ""
      return e
    end
    return e.alias(name)
  elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
    # Array-like input is wrapped in an unnamed Series and handled by the Series branch.
    return lit(Series.new("", value))
  elsif dtype
    return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
  end

  Utils.wrap_expr(RbExpr.lit(value, allow_object))
end
|
297
|
-
|
298
|
-
# Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
|
299
|
-
#
|
300
|
-
# @param column [Object]
|
301
|
-
# Column(s) to be used in aggregation.
|
302
|
-
#
|
303
|
-
# @return [Object]
|
304
|
-
#
|
305
|
-
# @example
|
306
|
-
# df = Polars::DataFrame.new(
|
307
|
-
# {
|
308
|
-
# "a" => [1, 2],
|
309
|
-
# "b" => [3, 4],
|
310
|
-
# "c" => [5, 6]
|
311
|
-
# }
|
312
|
-
# )
|
313
|
-
# # =>
|
314
|
-
# # shape: (2, 3)
|
315
|
-
# # ┌─────┬─────┬─────┐
|
316
|
-
# # │ a ┆ b ┆ c │
|
317
|
-
# # │ --- ┆ --- ┆ --- │
|
318
|
-
# # │ i64 ┆ i64 ┆ i64 │
|
319
|
-
# # ╞═════╪═════╪═════╡
|
320
|
-
# # │ 1 ┆ 3 ┆ 5 │
|
321
|
-
# # │ 2 ┆ 4 ┆ 6 │
|
322
|
-
# # └─────┴─────┴─────┘
|
323
|
-
#
|
324
|
-
# @example Cumulatively sum a column by name:
|
325
|
-
# df.select(Polars.cumsum("a"))
|
326
|
-
# # =>
|
327
|
-
# # shape: (2, 1)
|
328
|
-
# # ┌─────┐
|
329
|
-
# # │ a │
|
330
|
-
# # │ --- │
|
331
|
-
# # │ i64 │
|
332
|
-
# # ╞═════╡
|
333
|
-
# # │ 1 │
|
334
|
-
# # │ 3 │
|
335
|
-
# # └─────┘
|
336
|
-
#
|
337
|
-
# @example Cumulatively sum a list of columns/expressions horizontally:
|
338
|
-
# df.with_column(Polars.cumsum(["a", "c"]))
|
339
|
-
# # =>
|
340
|
-
# # shape: (2, 4)
|
341
|
-
# # ┌─────┬─────┬─────┬───────────┐
|
342
|
-
# # │ a ┆ b ┆ c ┆ cumsum │
|
343
|
-
# # │ --- ┆ --- ┆ --- ┆ --- │
|
344
|
-
# # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
|
345
|
-
# # ╞═════╪═════╪═════╪═══════════╡
|
346
|
-
# # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
|
347
|
-
# # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
|
348
|
-
# # └─────┴─────┴─────┴───────────┘
|
349
|
-
def cumsum(column)
  # Series: delegate to Series#cumsum.
  return column.cumsum if column.is_a?(Series)

  # String-like: cumulative sum of the named column.
  return col(column).cumsum if Utils.strlike?(column)

  # Anything else: cumulative fold with addition, starting from a u32 zero literal.
  running_add = ->(a, b) { a + b }
  cumfold(lit(0).cast(:u32), running_add, column).alias("cumsum")
end
|
358
|
-
|
359
|
-
# Compute the spearman rank correlation between two columns.
|
360
|
-
#
|
361
|
-
# Missing data will be excluded from the computation.
|
362
|
-
#
|
363
|
-
# @param a [Object]
|
364
|
-
# Column name or Expression.
|
365
|
-
# @param b [Object]
|
366
|
-
# Column name or Expression.
|
367
|
-
# @param ddof [Integer]
|
368
|
-
# Delta degrees of freedom
|
369
|
-
# @param propagate_nans [Boolean]
|
370
|
-
# If `true`, any `NaN` encountered will lead to `NaN` in the output.
|
371
|
-
# Defaults to `false`, where `NaN` values are regarded as larger than any finite number
|
372
|
-
# and thus lead to the highest rank.
|
373
|
-
#
|
374
|
-
# @return [Expr]
|
375
|
-
def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
  # String-like operands are column names; resolve them to expressions first.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }
  rbexpr = RbExpr.spearman_rank_corr(lhs._rbexpr, rhs._rbexpr, ddof, propagate_nans)
  Utils.wrap_expr(rbexpr)
end
|
384
|
-
|
385
|
-
# Compute the pearson's correlation between two columns.
|
386
|
-
#
|
387
|
-
# @param a [Object]
|
388
|
-
# Column name or Expression.
|
389
|
-
# @param b [Object]
|
390
|
-
# Column name or Expression.
|
391
|
-
# @param ddof [Integer]
|
392
|
-
# Delta degrees of freedom
|
393
|
-
#
|
394
|
-
# @return [Expr]
|
395
|
-
def pearson_corr(a, b, ddof: 1)
  # String-like operands are column names; resolve them to expressions first.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }
  rbexpr = RbExpr.pearson_corr(lhs._rbexpr, rhs._rbexpr, ddof)
  Utils.wrap_expr(rbexpr)
end
|
404
|
-
|
405
|
-
# Compute the covariance between two columns/ expressions.
|
406
|
-
#
|
407
|
-
# @param a [Object]
|
408
|
-
# Column name or Expression.
|
409
|
-
# @param b [Object]
|
410
|
-
# Column name or Expression.
|
411
|
-
#
|
412
|
-
# @return [Expr]
|
413
|
-
def cov(a, b)
  # String-like operands are column names; resolve them to expressions first.
  lhs, rhs = [a, b].map { |side| Utils.strlike?(side) ? col(side) : side }
  Utils.wrap_expr(RbExpr.cov(lhs._rbexpr, rhs._rbexpr))
end
|
422
|
-
|
423
|
-
# def map
|
424
|
-
# end
|
425
|
-
|
426
|
-
# def apply
|
427
|
-
# end
|
428
|
-
|
429
|
-
# Accumulate over multiple columns horizontally/row wise with a left fold.
|
430
|
-
#
|
431
|
-
# @return [Expr]
|
432
|
-
def fold(acc, f, exprs)
  # The accumulator may be a plain value; strings are treated as literals here.
  initial = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A single expression is wrapped so the selection helper always sees a list.
  inputs = exprs.is_a?(Expr) ? [exprs] : exprs
  rbexprs = Utils.selection_to_rbexpr_list(inputs)

  Utils.wrap_expr(RbExpr.fold(initial._rbexpr, f, rbexprs))
end
|
441
|
-
|
442
|
-
# def reduce
|
443
|
-
# end
|
444
|
-
|
445
|
-
# Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
|
446
|
-
#
|
447
|
-
# Every cumulative result is added as a separate field in a Struct column.
|
448
|
-
#
|
449
|
-
# @param acc [Object]
|
450
|
-
# Accumulator Expression. This is the value that will be initialized when the fold
|
451
|
-
# starts. For a sum this could for instance be lit(0).
|
452
|
-
# @param f [Object]
|
453
|
-
# Function to apply over the accumulator and the value.
|
454
|
-
# Fn(acc, value) -> new_value
|
455
|
-
# @param exprs [Object]
|
456
|
-
# Expressions to aggregate over. May also be a wildcard expression.
|
457
|
-
# @param include_init [Boolean]
|
458
|
-
# Include the initial accumulator state as struct field.
|
459
|
-
#
|
460
|
-
# @return [Object]
|
461
|
-
#
|
462
|
-
# @note
|
463
|
-
# If you simply want the first encountered expression as accumulator,
|
464
|
-
# consider using `cumreduce`.
|
465
|
-
def cumfold(acc, f, exprs, include_init: false)
  # The accumulator may be a plain value; strings are treated as literals here.
  initial = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)

  # A single expression is wrapped so the selection helper always sees a list.
  inputs = exprs.is_a?(Expr) ? [exprs] : exprs
  rbexprs = Utils.selection_to_rbexpr_list(inputs)

  Utils.wrap_expr(RbExpr.cumfold(initial._rbexpr, f, rbexprs, include_init))
end
|
474
|
-
|
475
|
-
# def cumreduce
|
476
|
-
# end
|
477
|
-
|
478
|
-
# Evaluate columnwise or elementwise with a bitwise OR operation.
|
479
|
-
#
|
480
|
-
# @return [Expr]
|
481
|
-
def any(name)
  # String-like: `any` over the named column.
  return col(name).any if Utils.strlike?(name)

  # Otherwise OR the inputs together after casting each side to bool.
  bool_or = ->(a, b) { a.cast(:bool) | b.cast(:bool) }
  fold(lit(false), bool_or, name).alias("any")
end
|
488
|
-
|
489
|
-
# Exclude certain columns from a wildcard/regex selection.
|
490
|
-
#
|
491
|
-
# @param columns [Object]
|
492
|
-
# Column(s) to exclude from selection
|
493
|
-
# This can be:
|
494
|
-
#
|
495
|
-
# - a column name, or multiple column names
|
496
|
-
# - a regular expression starting with `^` and ending with `$`
|
497
|
-
# - a dtype or multiple dtypes
|
498
|
-
#
|
499
|
-
# @return [Object]
|
500
|
-
#
|
501
|
-
# @example
|
502
|
-
# df = Polars::DataFrame.new(
|
503
|
-
# {
|
504
|
-
# "aa" => [1, 2, 3],
|
505
|
-
# "ba" => ["a", "b", nil],
|
506
|
-
# "cc" => [nil, 2.5, 1.5]
|
507
|
-
# }
|
508
|
-
# )
|
509
|
-
# # =>
|
510
|
-
# # shape: (3, 3)
|
511
|
-
# # ┌─────┬──────┬──────┐
|
512
|
-
# # │ aa ┆ ba ┆ cc │
|
513
|
-
# # │ --- ┆ --- ┆ --- │
|
514
|
-
# # │ i64 ┆ str ┆ f64 │
|
515
|
-
# # ╞═════╪══════╪══════╡
|
516
|
-
# # │ 1 ┆ a ┆ null │
|
517
|
-
# # │ 2 ┆ b ┆ 2.5 │
|
518
|
-
# # │ 3 ┆ null ┆ 1.5 │
|
519
|
-
# # └─────┴──────┴──────┘
|
520
|
-
#
|
521
|
-
# @example Exclude by column name(s):
|
522
|
-
# df.select(Polars.exclude("ba"))
|
523
|
-
# # =>
|
524
|
-
# # shape: (3, 2)
|
525
|
-
# # ┌─────┬──────┐
|
526
|
-
# # │ aa ┆ cc │
|
527
|
-
# # │ --- ┆ --- │
|
528
|
-
# # │ i64 ┆ f64 │
|
529
|
-
# # ╞═════╪══════╡
|
530
|
-
# # │ 1 ┆ null │
|
531
|
-
# # │ 2 ┆ 2.5 │
|
532
|
-
# # │ 3 ┆ 1.5 │
|
533
|
-
# # └─────┴──────┘
|
534
|
-
#
|
535
|
-
# @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
|
536
|
-
# df.select(Polars.exclude("^.*a$"))
|
537
|
-
# # =>
|
538
|
-
# # shape: (3, 1)
|
539
|
-
# # ┌──────┐
|
540
|
-
# # │ cc │
|
541
|
-
# # │ --- │
|
542
|
-
# # │ f64 │
|
543
|
-
# # ╞══════╡
|
544
|
-
# # │ null │
|
545
|
-
# # │ 2.5 │
|
546
|
-
# # │ 1.5 │
|
547
|
-
# # └──────┘
|
548
|
-
def exclude(columns)
  # Start from the wildcard selection and remove the requested columns.
  wildcard = col("*")
  wildcard.exclude(columns)
end
|
551
|
-
|
552
|
-
# Do one of two things.
|
553
|
-
#
|
554
|
-
# * function can do a columnwise or elementwise AND operation
|
555
|
-
# * a wildcard column selection
|
556
|
-
#
|
557
|
-
# @param name [Object]
|
558
|
-
# If given this function will apply a bitwise & on the columns.
|
559
|
-
#
|
560
|
-
# @return [Expr]
|
561
|
-
#
|
562
|
-
# @example Sum all columns
|
563
|
-
# df = Polars::DataFrame.new(
|
564
|
-
# {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
|
565
|
-
# )
|
566
|
-
# df.select(Polars.all.sum)
|
567
|
-
# # =>
|
568
|
-
# # shape: (1, 3)
|
569
|
-
# # ┌─────┬──────┬─────┐
|
570
|
-
# # │ a ┆ b ┆ c │
|
571
|
-
# # │ --- ┆ --- ┆ --- │
|
572
|
-
# # │ i64 ┆ str ┆ i64 │
|
573
|
-
# # ╞═════╪══════╪═════╡
|
574
|
-
# # │ 6 ┆ null ┆ 3 │
|
575
|
-
# # └─────┴──────┴─────┘
|
576
|
-
def all(name = nil)
  # No argument: wildcard selection of every column.
  return col("*") if name.nil?

  # Only string-like names are supported; other inputs are not implemented.
  raise Todo unless Utils.strlike?(name)

  col(name).all
end
|
585
|
-
|
586
|
-
# Syntactic sugar for `Polars.col("foo").agg_groups`.
|
587
|
-
#
|
588
|
-
# @return [Object]
|
589
|
-
def groups(column)
  # Shorthand for calling `agg_groups` on the column expression.
  selected = col(column)
  selected.agg_groups
end
|
592
|
-
|
593
|
-
# Syntactic sugar for `Polars.col("foo").quantile(...)`.
|
594
|
-
#
|
595
|
-
# @param column [String]
|
596
|
-
# Column name.
|
597
|
-
# @param quantile [Float]
|
598
|
-
# Quantile between 0.0 and 1.0.
|
599
|
-
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
600
|
-
# Interpolation method.
|
601
|
-
#
|
602
|
-
# @return [Expr]
|
603
|
-
def quantile(column, quantile, interpolation: "nearest")
  # Shorthand for calling `quantile` on the column expression.
  selected = col(column)
  selected.quantile(quantile, interpolation: interpolation)
end
|
606
|
-
|
607
|
-
# Create a range expression (or Series).
|
608
|
-
#
|
609
|
-
# This can be used in a `select`, `with_column`, etc. Be sure that the resulting
|
610
|
-
# range size is equal to the length of the DataFrame you are collecting.
|
611
|
-
#
|
612
|
-
# @param start [Integer, Expr, Series]
|
613
|
-
# Lower bound of range.
|
614
|
-
# @param stop [Integer, Expr, Series]
|
615
|
-
# Upper bound of range.
|
616
|
-
# @param step [Integer]
|
617
|
-
# Step size of the range.
|
618
|
-
# @param eager [Boolean]
|
619
|
-
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
620
|
-
# @param dtype [Symbol]
|
621
|
-
# Apply an explicit integer dtype to the resulting expression (default is `Int64`).
|
622
|
-
#
|
623
|
-
# @return [Expr, Series]
|
624
|
-
#
|
625
|
-
# @example
|
626
|
-
# Polars.arange(0, 3, eager: true)
|
627
|
-
# # =>
|
628
|
-
# # shape: (3,)
|
629
|
-
# # Series: 'arange' [i64]
|
630
|
-
# # [
|
631
|
-
# # 0
|
632
|
-
# # 1
|
633
|
-
# # 2
|
634
|
-
# # ]
|
635
|
-
def int_range(start, stop, step: 1, eager: false, dtype: nil)
  lower = Utils.parse_as_expression(start)
  upper = Utils.parse_as_expression(stop)

  # Fall back to Int64 when no dtype is given; symbols are passed on as strings.
  int_dtype = dtype || Int64
  int_dtype = int_dtype.to_s if int_dtype.is_a?(Symbol)

  range_expr = Utils.wrap_expr(RbExpr.int_range(lower, upper, step, int_dtype)).alias("arange")

  # Eager mode evaluates the expression through `select` and returns the Series.
  eager ? select(range_expr).to_series : range_expr
end
|
648
|
-
alias_method :arange, :int_range
|
649
|
-
|
650
|
-
# Find the indexes that would sort the columns.
|
651
|
-
#
|
652
|
-
# Argsort by multiple columns. The first column will be used for the ordering.
|
653
|
-
# If there are duplicates in the first column, the second column will be used to
|
654
|
-
# determine the ordering and so on.
|
655
|
-
#
|
656
|
-
# @param exprs [Object]
|
657
|
-
# Columns used to determine the ordering.
|
658
|
-
# @param reverse [Boolean]
|
659
|
-
# Default is ascending.
|
660
|
-
#
|
661
|
-
# @return [Expr]
|
662
|
-
def arg_sort_by(exprs, reverse: false)
  # Accept a single key as well as a list of keys.
  keys = exprs.is_a?(::Array) ? exprs : [exprs]

  # A single boolean is expanded to one flag per key; other values pass through untouched.
  single_flag = reverse == true || reverse == false
  flags = single_flag ? Array.new(keys.length, reverse) : reverse

  rbexprs = Utils.selection_to_rbexpr_list(keys)
  Utils.wrap_expr(RbExpr.arg_sort_by(rbexprs, flags))
end
|
672
|
-
alias_method :argsort_by, :arg_sort_by
|
673
|
-
|
674
|
-
# Create polars `Duration` from distinct time components.
|
675
|
-
#
|
676
|
-
# @return [Expr]
|
677
|
-
#
|
678
|
-
# @example
|
679
|
-
# df = Polars::DataFrame.new(
|
680
|
-
# {
|
681
|
-
# "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
|
682
|
-
# "add" => [1, 2]
|
683
|
-
# }
|
684
|
-
# )
|
685
|
-
# df.select(
|
686
|
-
# [
|
687
|
-
# (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
|
688
|
-
# (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
|
689
|
-
# (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
|
690
|
-
# (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
|
691
|
-
# "add_milliseconds"
|
692
|
-
# ),
|
693
|
-
# (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
|
694
|
-
# ]
|
695
|
-
# )
|
696
|
-
# # =>
|
697
|
-
# # shape: (2, 5)
|
698
|
-
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
|
699
|
-
# # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
|
700
|
-
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
701
|
-
# # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
|
702
|
-
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
|
703
|
-
# # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
|
704
|
-
# # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
|
705
|
-
# # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
|
706
|
-
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  # Each supplied component is converted to an rbexpr (strings are NOT treated
  # as literals here); omitted components stay nil.
  to_rbexpr = lambda do |component|
    component.nil? ? nil : Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr
  end

  Utils.wrap_expr(
    _rb_duration(
      to_rbexpr.call(weeks),
      to_rbexpr.call(days),
      to_rbexpr.call(hours),
      to_rbexpr.call(minutes),
      to_rbexpr.call(seconds),
      to_rbexpr.call(milliseconds),
      to_rbexpr.call(microseconds),
      to_rbexpr.call(nanoseconds),
      time_unit
    )
  )
end
|
756
|
-
|
757
|
-
# Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
|
758
|
-
#
|
759
|
-
# @param exprs [Object]
|
760
|
-
# Columns to concat into a Utf8 Series.
|
761
|
-
# @param sep [String]
|
762
|
-
# String value that will be used to separate the values.
|
763
|
-
#
|
764
|
-
# @return [Expr]
|
765
|
-
#
|
766
|
-
# @example
|
767
|
-
# df = Polars::DataFrame.new(
|
768
|
-
# {
|
769
|
-
# "a" => [1, 2, 3],
|
770
|
-
# "b" => ["dogs", "cats", nil],
|
771
|
-
# "c" => ["play", "swim", "walk"]
|
772
|
-
# }
|
773
|
-
# )
|
774
|
-
# df.with_columns(
|
775
|
-
# [
|
776
|
-
# Polars.concat_str(
|
777
|
-
# [
|
778
|
-
# Polars.col("a") * 2,
|
779
|
-
# Polars.col("b"),
|
780
|
-
# Polars.col("c")
|
781
|
-
# ],
|
782
|
-
# sep: " "
|
783
|
-
# ).alias("full_sentence")
|
784
|
-
# ]
|
785
|
-
# )
|
786
|
-
# # =>
|
787
|
-
# # shape: (3, 4)
|
788
|
-
# # ┌─────┬──────┬──────┬───────────────┐
|
789
|
-
# # │ a ┆ b ┆ c ┆ full_sentence │
|
790
|
-
# # │ --- ┆ --- ┆ --- ┆ --- │
|
791
|
-
# # │ i64 ┆ str ┆ str ┆ str │
|
792
|
-
# # ╞═════╪══════╪══════╪═══════════════╡
|
793
|
-
# # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
|
794
|
-
# # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
|
795
|
-
# # │ 3 ┆ null ┆ walk ┆ null │
|
796
|
-
# # └─────┴──────┴──────┴───────────────┘
|
797
|
-
def concat_str(exprs, sep: "")
  # Convert the selection to rbexprs, then join them with the separator.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_str(rbexprs, sep))
end
|
801
|
-
|
802
|
-
# Format expressions as a string.
|
803
|
-
#
|
804
|
-
# @param fstring [String]
|
805
|
-
# A string with placeholders.
|
806
|
-
# For example: "hello_{}" or "{}_world"
|
807
|
-
# @param args [Object]
|
808
|
-
# Expression(s) that fill the placeholders
|
809
|
-
#
|
810
|
-
# @return [Expr]
|
811
|
-
#
|
812
|
-
# @example
|
813
|
-
# df = Polars::DataFrame.new(
|
814
|
-
# {
|
815
|
-
# "a": ["a", "b", "c"],
|
816
|
-
# "b": [1, 2, 3]
|
817
|
-
# }
|
818
|
-
# )
|
819
|
-
# df.select(
|
820
|
-
# [
|
821
|
-
# Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
|
822
|
-
# ]
|
823
|
-
# )
|
824
|
-
# # =>
|
825
|
-
# # shape: (3, 1)
|
826
|
-
# # ┌─────────────┐
|
827
|
-
# # │ fmt │
|
828
|
-
# # │ --- │
|
829
|
-
# # │ str │
|
830
|
-
# # ╞═════════════╡
|
831
|
-
# # │ foo_a_bar_1 │
|
832
|
-
# # │ foo_b_bar_2 │
|
833
|
-
# # │ foo_c_bar_3 │
|
834
|
-
# # └─────────────┘
|
835
|
-
def format(fstring, *args)
  # Every "{}" placeholder must be matched by exactly one argument.
  unless fstring.scan("{}").length == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Split on the placeholders while keeping them, then map each non-empty
  # piece either to the next argument or to a string literal.
  pending = args.dup
  pieces = fstring.split(/(\{\})/).reject(&:empty?).map do |chunk|
    if chunk == "{}"
      Utils.expr_to_lit_or_expr(pending.shift, str_to_lit: false)
    else
      lit(chunk)
    end
  end

  concat_str(pieces, sep: "")
end
|
854
|
-
|
855
|
-
# Concat the arrays in a Series dtype List in linear time.
|
856
|
-
#
|
857
|
-
# @return [Expr]
|
858
|
-
def concat_list(exprs)
  # Convert the selection to rbexprs and concatenate them via RbExpr.concat_lst.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(RbExpr.concat_lst(rbexprs))
end
|
862
|
-
|
863
|
-
# Collect multiple LazyFrames at the same time.
|
864
|
-
#
|
865
|
-
# This runs all the computation graphs in parallel on Polars threadpool.
|
866
|
-
#
|
867
|
-
# @param lazy_frames [Array]
|
868
|
-
# A list of LazyFrames to collect.
|
869
|
-
# @param type_coercion [Boolean]
|
870
|
-
# Do type coercion optimization.
|
871
|
-
# @param predicate_pushdown [Boolean]
|
872
|
-
# Do predicate pushdown optimization.
|
873
|
-
# @param projection_pushdown [Boolean]
|
874
|
-
# Do projection pushdown optimization.
|
875
|
-
# @param simplify_expression [Boolean]
|
876
|
-
# Run simplify expressions optimization.
|
877
|
-
# @param string_cache [Boolean]
|
878
|
-
# This argument is deprecated and will be ignored
|
879
|
-
# @param no_optimization [Boolean]
|
880
|
-
# Turn off optimizations.
|
881
|
-
# @param slice_pushdown [Boolean]
|
882
|
-
# Slice pushdown optimization.
|
883
|
-
# @param common_subplan_elimination [Boolean]
|
884
|
-
# Will try to cache branching subplans that occur on self-joins or unions.
|
885
|
-
# @param allow_streaming [Boolean]
|
886
|
-
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
887
|
-
#
|
888
|
-
# @return [Array]
|
889
|
-
def collect_all(
  lazy_frames,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  allow_streaming: false
)
  # `no_optimization` switches off the pushdown and subplan optimizations;
  # type coercion and expression simplification keep their passed values.
  if no_optimization
    predicate_pushdown = projection_pushdown = slice_pushdown = common_subplan_elimination = false
  end

  # Apply the optimization flags to every frame's underlying `_ldf`.
  prepared = lazy_frames.map do |lf|
    lf._ldf.optimization_toggle(
      type_coercion,
      predicate_pushdown,
      projection_pushdown,
      simplify_expression,
      slice_pushdown,
      common_subplan_elimination,
      allow_streaming,
      false
    )
  end

  # Collect everything in one call and wrap each raw result as a DataFrame.
  _collect_all(prepared).map { |rbdf| Utils.wrap_df(rbdf) }
end
|
931
|
-
|
932
|
-
# Run polars expressions without a context.
|
933
|
-
#
|
934
|
-
# @return [DataFrame]
|
935
|
-
def select(exprs)
  # Evaluate the expressions against a frame built from an empty array,
  # since there is no surrounding context to select from.
  empty_frame = DataFrame.new([])
  empty_frame.select(exprs)
end
|
938
|
-
|
939
|
-
# Collect several columns into a Series of dtype Struct.
|
940
|
-
#
|
941
|
-
# @param exprs [Object]
|
942
|
-
# Columns/Expressions to collect into a Struct
|
943
|
-
# @param eager [Boolean]
|
944
|
-
# Evaluate immediately
|
945
|
-
#
|
946
|
-
# @return [Object]
|
947
|
-
#
|
948
|
-
# @example
|
949
|
-
# Polars::DataFrame.new(
|
950
|
-
# {
|
951
|
-
# "int" => [1, 2],
|
952
|
-
# "str" => ["a", "b"],
|
953
|
-
# "bool" => [true, nil],
|
954
|
-
# "list" => [[1, 2], [3]],
|
955
|
-
# }
|
956
|
-
# ).select([Polars.struct(Polars.all).alias("my_struct")])
|
957
|
-
# # =>
|
958
|
-
# # shape: (2, 1)
|
959
|
-
# # ┌─────────────────────┐
|
960
|
-
# # │ my_struct │
|
961
|
-
# # │ --- │
|
962
|
-
# # │ struct[4] │
|
963
|
-
# # ╞═════════════════════╡
|
964
|
-
# # │ {1,"a",true,[1, 2]} │
|
965
|
-
# # │ {2,"b",null,[3]} │
|
966
|
-
# # └─────────────────────┘
|
967
|
-
#
|
968
|
-
# @example Only collect specific columns as a struct:
|
969
|
-
# df = Polars::DataFrame.new(
|
970
|
-
# {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
|
971
|
-
# )
|
972
|
-
# df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
|
973
|
-
# # =>
|
974
|
-
# # shape: (4, 4)
|
975
|
-
# # ┌─────┬───────┬─────┬─────────────┐
|
976
|
-
# # │ a ┆ b ┆ c ┆ a_and_b │
|
977
|
-
# # │ --- ┆ --- ┆ --- ┆ --- │
|
978
|
-
# # │ i64 ┆ str ┆ i64 ┆ struct[2] │
|
979
|
-
# # ╞═════╪═══════╪═════╪═════════════╡
|
980
|
-
# # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
|
981
|
-
# # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
|
982
|
-
# # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
|
983
|
-
# # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
|
984
|
-
# # └─────┴───────┴─────┴─────────────┘
|
985
|
-
def struct(exprs, eager: false)
  # Build the lazy struct expression first; eager mode only differs in that
  # the expression is evaluated immediately.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  expr = Utils.wrap_expr(_as_struct(rbexprs))
  return expr unless eager

  # Previously the eager result was computed and then discarded (the method
  # fell through and returned the expression). Return the Series instead.
  Polars.select(expr).to_series
end
|
992
|
-
|
993
|
-
# Repeat a single value n times.
|
994
|
-
#
|
995
|
-
# @param value [Object]
|
996
|
-
# Value to repeat.
|
997
|
-
# @param n [Integer]
|
998
|
-
# Repeat `n` times.
|
999
|
-
# @param eager [Boolean]
|
1000
|
-
# Run eagerly and collect into a `Series`.
|
1001
|
-
# @param name [String]
|
1002
|
-
# Only used in `eager` mode. As expression, use `alias`.
|
1003
|
-
#
|
1004
|
-
# @return [Expr]
|
1005
|
-
def repeat(value, n, dtype: nil, eager: false, name: nil)
  named = !name.nil?
  warn "the `name` argument is deprecated. Use the `alias` method instead." if named

  # An Integer count is turned into a literal expression; anything else is
  # used as-is and must respond to `_rbexpr`.
  count = n.is_a?(Integer) ? lit(n) : n

  parsed_value = Utils.parse_as_expression(value, str_as_lit: true)
  repeated = Utils.wrap_expr(RbExpr.repeat(parsed_value, count._rbexpr, dtype))
  repeated = repeated.alias(name) if named

  # Eager mode evaluates the expression right away and returns the Series.
  eager ? select(repeated).to_series : repeated
end
|
1024
|
-
|
1025
|
-
# Return indices where `condition` evaluates `true`.
|
1026
|
-
#
|
1027
|
-
# @param condition [Expr]
|
1028
|
-
# Boolean expression to evaluate
|
1029
|
-
# @param eager [Boolean]
|
1030
|
-
# Whether to apply this function eagerly (as opposed to lazily).
|
1031
|
-
#
|
1032
|
-
# @return [Expr, Series]
|
1033
|
-
#
|
1034
|
-
# @example
|
1035
|
-
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
|
1036
|
-
# df.select(
|
1037
|
-
# [
|
1038
|
-
# Polars.arg_where(Polars.col("a") % 2 == 0)
|
1039
|
-
# ]
|
1040
|
-
# ).to_series
|
1041
|
-
# # =>
|
1042
|
-
# # shape: (2,)
|
1043
|
-
# # Series: 'a' [u32]
|
1044
|
-
# # [
|
1045
|
-
# # 1
|
1046
|
-
# # 3
|
1047
|
-
# # ]
|
1048
|
-
def arg_where(condition, eager: false)
  # Lazy path: parse the condition and wrap the resulting native expression.
  unless eager
    parsed = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(_arg_where(parsed._rbexpr))
  end

  # Eager path only accepts a Series.
  unless condition.is_a?(Series)
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
  end

  # Put the Series in a frame and run the lazy form on its own column.
  frame = condition.to_frame
  lazy_expr = arg_where(Polars.col(condition.name))
  frame.select(lazy_expr).to_series
end
|
1059
|
-
|
1060
|
-
# Folds the expressions from left to right, keeping the first non-null value.
|
1061
|
-
#
|
1062
|
-
# @param exprs [Object]
|
1063
|
-
# Expressions to coalesce.
|
1064
|
-
#
|
1065
|
-
# @return [Expr]
|
1066
|
-
#
|
1067
|
-
# @example
|
1068
|
-
# df = Polars::DataFrame.new(
|
1069
|
-
# [
|
1070
|
-
# [nil, 1.0, 1.0],
|
1071
|
-
# [nil, 2.0, 2.0],
|
1072
|
-
# [nil, nil, 3.0],
|
1073
|
-
# [nil, nil, nil]
|
1074
|
-
# ],
|
1075
|
-
# columns: [["a", :f64], ["b", :f64], ["c", :f64]]
|
1076
|
-
# )
|
1077
|
-
# df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
|
1078
|
-
# # =>
|
1079
|
-
# # shape: (4, 4)
|
1080
|
-
# # ┌──────┬──────┬──────┬──────┐
|
1081
|
-
# # │ a ┆ b ┆ c ┆ d │
|
1082
|
-
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1083
|
-
# # │ f64 ┆ f64 ┆ f64 ┆ f64 │
|
1084
|
-
# # ╞══════╪══════╪══════╪══════╡
|
1085
|
-
# # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
|
1086
|
-
# # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
|
1087
|
-
# # │ null ┆ null ┆ 3.0 ┆ 3.0 │
|
1088
|
-
# # │ null ┆ null ┆ null ┆ 99.9 │
|
1089
|
-
# # └──────┴──────┴──────┴──────┘
|
1090
|
-
def coalesce(exprs, *more_exprs)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)

  # Check for emptiness rather than truthiness: `any?` without a block is
  # false when every extra argument is `nil`/`false`, which silently dropped
  # fallbacks such as `coalesce("flag", false)`.
  rbexprs.concat(Utils.selection_to_rbexpr_list(more_exprs)) unless more_exprs.empty?

  Utils.wrap_expr(_coalesce_exprs(rbexprs))
end
|
1097
|
-
|
1098
|
-
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
|
1099
|
-
#
|
1100
|
-
# Depending on the `unit` provided, this function will return a different dtype:
|
1101
|
-
# - unit: "d" returns pl.Date
|
1102
|
-
# - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
|
1103
|
-
# - unit: "ms" returns pl.Datetime["ms"]
|
1104
|
-
# - unit: "us" returns pl.Datetime["us"]
|
1105
|
-
# - unit: "ns" returns pl.Datetime["ns"]
|
1106
|
-
#
|
1107
|
-
# @param column [Object]
|
1108
|
-
# Series or expression to parse integers to pl.Datetime.
|
1109
|
-
# @param unit [String]
|
1110
|
-
# The unit of the timesteps since epoch time.
|
1111
|
-
# @param eager [Boolean]
|
1112
|
-
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
1113
|
-
#
|
1114
|
-
# @return [Object]
|
1115
|
-
#
|
1116
|
-
# @example
|
1117
|
-
# df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
|
1118
|
-
# df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
|
1119
|
-
# # =>
|
1120
|
-
# # shape: (2, 1)
|
1121
|
-
# # ┌─────────────────────┐
|
1122
|
-
# # │ timestamp │
|
1123
|
-
# # │ --- │
|
1124
|
-
# # │ datetime[μs] │
|
1125
|
-
# # ╞═════════════════════╡
|
1126
|
-
# # │ 2022-10-25 07:31:17 │
|
1127
|
-
# # │ 2022-10-25 07:31:39 │
|
1128
|
-
# # └─────────────────────┘
|
1129
|
-
def from_epoch(column, unit: "s", eager: false)
  # Normalize the input: string-like values become column expressions, and
  # anything that is neither a Series nor an Expr is wrapped in a Series.
  if Utils.strlike?(column)
    column = col(column)
  elsif !column.is_a?(Series) && !column.is_a?(Expr)
    column = Series.new(column)
  end

  expr =
    case unit
    when "d"
      column.cast(Date)
    when "s"
      # Scale seconds to microseconds before casting to Datetime("us").
      (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
    else
      unless Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
        # The doubled braces in the old message were a leftover Python
        # f-string escape and were printed literally.
        raise ArgumentError, "'unit' must be one of {'ns', 'us', 'ms', 's', 'd'}, got '#{unit}'."
      end
      column.cast(Datetime.new(unit))
    end

  return expr unless eager

  # Eager evaluation needs concrete data, so only a Series is accepted here.
  unless column.is_a?(Series)
    raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
  end

  column.to_frame.select(expr).to_series
end
|
1156
|
-
|
1157
|
-
# Start a "when, then, otherwise" expression.
|
1158
|
-
#
|
1159
|
-
# @return [When]
|
1160
|
-
#
|
1161
|
-
# @example
|
1162
|
-
# df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
|
1163
|
-
# df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
|
1164
|
-
# # =>
|
1165
|
-
# # shape: (3, 3)
|
1166
|
-
# # ┌─────┬─────┬─────────┐
|
1167
|
-
# # │ foo ┆ bar ┆ literal │
|
1168
|
-
# # │ --- ┆ --- ┆ --- │
|
1169
|
-
# # │ i64 ┆ i64 ┆ i32 │
|
1170
|
-
# # ╞═════╪═════╪═════════╡
|
1171
|
-
# # │ 1 ┆ 3 ┆ -1 │
|
1172
|
-
# # │ 3 ┆ 4 ┆ 1 │
|
1173
|
-
# # │ 4 ┆ 0 ┆ 1 │
|
1174
|
-
# # └─────┴─────┴─────────┘
|
1175
|
-
def when(expr)
  # Parse the predicate, then hand its native expression to RbExpr.when and
  # wrap the result in a When.
  predicate = Utils.expr_to_lit_or_expr(expr)
  When.new(RbExpr.when(predicate._rbexpr))
end
|
1180
|
-
end
|
1181
|
-
end
|