polars-df 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +74 -3
- data/Cargo.toml +3 -0
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +18 -1
- data/ext/polars/src/conversion.rs +115 -2
- data/ext/polars/src/dataframe.rs +228 -11
- data/ext/polars/src/error.rs +4 -0
- data/ext/polars/src/lazy/dataframe.rs +5 -5
- data/ext/polars/src/lazy/dsl.rs +157 -2
- data/ext/polars/src/lib.rs +185 -10
- data/ext/polars/src/list_construction.rs +100 -0
- data/ext/polars/src/series.rs +217 -29
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +1 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/data_frame.rb +2384 -140
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +4374 -53
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +518 -0
- data/lib/polars/io.rb +421 -2
- data/lib/polars/lazy_frame.rb +1267 -69
- data/lib/polars/lazy_functions.rb +412 -24
- data/lib/polars/lazy_group_by.rb +80 -0
- data/lib/polars/list_expr.rb +507 -5
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2256 -242
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +847 -10
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +71 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +12 -10
- metadata +15 -2
@@ -1,5 +1,8 @@
|
|
1
1
|
module Polars
|
2
2
|
module LazyFunctions
|
3
|
+
# Return an expression representing a column in a DataFrame.
|
4
|
+
#
|
5
|
+
# @return [Expr]
|
3
6
|
def col(name)
|
4
7
|
if name.is_a?(Series)
|
5
8
|
name = name.to_a
|
@@ -21,10 +24,42 @@ module Polars
|
|
21
24
|
end
|
22
25
|
end
|
23
26
|
|
27
|
+
# Alias for an element being evaluated in an `eval` expression.
|
28
|
+
#
|
29
|
+
# @return [Expr]
|
30
|
+
#
|
31
|
+
# @example A horizontal rank computation by taking the elements of a list
|
32
|
+
# df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
|
33
|
+
# df.with_column(
|
34
|
+
# Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
|
35
|
+
# )
|
36
|
+
# # =>
|
37
|
+
# # shape: (3, 3)
|
38
|
+
# # ┌─────┬─────┬────────────┐
|
39
|
+
# # │ a ┆ b ┆ rank │
|
40
|
+
# # │ --- ┆ --- ┆ --- │
|
41
|
+
# # │ i64 ┆ i64 ┆ list[f32] │
|
42
|
+
# # ╞═════╪═════╪════════════╡
|
43
|
+
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
|
44
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
45
|
+
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
|
46
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
47
|
+
# # │ 3 ┆ 2 ┆ [2.0, 1.0] │
|
48
|
+
# # └─────┴─────┴────────────┘
|
24
49
|
def element
  # The element placeholder is expressed as a column with an empty name.
  placeholder_name = ""
  col(placeholder_name)
end
|
27
52
|
|
53
|
+
# Count the number of values in this column/context.
|
54
|
+
#
|
55
|
+
# @param column [String, Series, nil]
|
56
|
+
# Depending on the type of `column`:
|
57
|
+
#
|
58
|
+
# * `Series` : count the values in the series.
|
59
|
+
# * `String` : count the values in this column.
|
60
|
+
# * `nil` : count the number of values in this context.
|
61
|
+
#
|
62
|
+
# @return [Expr, Integer]
|
28
63
|
def count(column = nil)
|
29
64
|
if column.nil?
|
30
65
|
return Utils.wrap_expr(RbExpr.count)
|
@@ -37,9 +72,16 @@ module Polars
|
|
37
72
|
end
|
38
73
|
end
|
39
74
|
|
40
|
-
#
|
41
|
-
#
|
75
|
+
# Aggregate to list.
|
76
|
+
#
|
77
|
+
# @return [Expr]
|
78
|
+
def to_list(name)
  # Resolve the column expression first, then ask it for its list aggregation.
  column_expr = col(name)
  column_expr.list
end
|
42
81
|
|
82
|
+
# Get the standard deviation.
|
83
|
+
#
|
84
|
+
# @return [Object]
|
43
85
|
def std(column, ddof: 1)
|
44
86
|
if column.is_a?(Series)
|
45
87
|
column.std(ddof: ddof)
|
@@ -48,6 +90,9 @@ module Polars
|
|
48
90
|
end
|
49
91
|
end
|
50
92
|
|
93
|
+
# Get the variance.
|
94
|
+
#
|
95
|
+
# @return [Object]
|
51
96
|
def var(column, ddof: 1)
|
52
97
|
if column.is_a?(Series)
|
53
98
|
column.var(ddof: ddof)
|
@@ -56,6 +101,16 @@ module Polars
|
|
56
101
|
end
|
57
102
|
end
|
58
103
|
|
104
|
+
# Get the maximum value.
|
105
|
+
#
|
106
|
+
# @param column [Object]
|
107
|
+
# Column(s) to be used in aggregation. Will lead to different behavior based on
|
108
|
+
# the input:
|
109
|
+
#
|
110
|
+
# - [String, Series] -> aggregate the maximum value of that column.
|
111
|
+
# - [Array<Expr>] -> aggregate the maximum value horizontally.
|
112
|
+
#
|
113
|
+
# @return [Expr, Object]
|
59
114
|
def max(column)
|
60
115
|
if column.is_a?(Series)
|
61
116
|
column.max
|
@@ -68,6 +123,16 @@ module Polars
|
|
68
123
|
end
|
69
124
|
end
|
70
125
|
|
126
|
+
# Get the minimum value.
|
127
|
+
#
|
128
|
+
# @param column [Object]
|
129
|
+
# Column(s) to be used in aggregation. Will lead to different behavior based on
|
130
|
+
# the input:
|
131
|
+
#
|
132
|
+
# - [String, Series] -> aggregate the minimum value of that column.
|
133
|
+
# - [Array<Expr>] -> aggregate the minimum value horizontally.
|
134
|
+
#
|
135
|
+
# @return [Expr, Object]
|
71
136
|
def min(column)
|
72
137
|
if column.is_a?(Series)
|
73
138
|
column.min
|
@@ -80,6 +145,9 @@ module Polars
|
|
80
145
|
end
|
81
146
|
end
|
82
147
|
|
148
|
+
# Sum values in a column/Series, or horizontally across list of columns/expressions.
|
149
|
+
#
|
150
|
+
# @return [Object]
|
83
151
|
def sum(column)
|
84
152
|
if column.is_a?(Series)
|
85
153
|
column.sum
|
@@ -94,6 +162,9 @@ module Polars
|
|
94
162
|
end
|
95
163
|
end
|
96
164
|
|
165
|
+
# Get the mean value.
|
166
|
+
#
|
167
|
+
# @return [Expr, Float]
|
97
168
|
def mean(column)
|
98
169
|
if column.is_a?(Series)
|
99
170
|
column.mean
|
@@ -102,10 +173,16 @@ module Polars
|
|
102
173
|
end
|
103
174
|
end
|
104
175
|
|
176
|
+
# Get the mean value.
|
177
|
+
#
|
178
|
+
# @return [Expr, Float]
|
105
179
|
def avg(column)
  # Thin alias: the argument is forwarded untouched to `mean`, so the result
  # type is whatever `mean` returns for that input.
  mean(column)
end
|
108
182
|
|
183
|
+
# Get the median value.
|
184
|
+
#
|
185
|
+
# @return [Object]
|
109
186
|
def median(column)
|
110
187
|
if column.is_a?(Series)
|
111
188
|
column.median
|
@@ -114,9 +191,20 @@ module Polars
|
|
114
191
|
end
|
115
192
|
end
|
116
193
|
|
117
|
-
#
|
118
|
-
#
|
194
|
+
# Count unique values.
|
195
|
+
#
|
196
|
+
# @return [Object]
|
197
|
+
def n_unique(column)
  # A Series is asked directly; any other input is treated as a column
  # reference and goes through `col`.
  return column.n_unique if column.is_a?(Series)

  col(column).n_unique
end
|
119
204
|
|
205
|
+
# Get the first value.
|
206
|
+
#
|
207
|
+
# @return [Object]
|
120
208
|
def first(column = nil)
|
121
209
|
if column.nil?
|
122
210
|
return Utils.wrap_expr(RbExpr.first)
|
@@ -133,30 +221,145 @@ module Polars
|
|
133
221
|
end
|
134
222
|
end
|
135
223
|
|
136
|
-
#
|
137
|
-
#
|
224
|
+
# Get the last value.
|
225
|
+
#
|
226
|
+
# Depending on the input type this function does different things:
|
227
|
+
#
|
228
|
+
# - nil -> expression to take last column of a context.
|
229
|
+
# - String -> syntactic sugar for `Polars.col(..).last`
|
230
|
+
# - Series -> Take last value in `Series`
|
231
|
+
#
|
232
|
+
# @return [Object]
|
233
|
+
def last(column = nil)
  # NOTE(review): `_last` is not defined in the visible part of this file —
  # presumably provided by the native extension; confirm it resolves here.
  return Utils.wrap_expr(_last) if column.nil?

  # Anything that is not a Series is treated as a column reference.
  return col(column).last unless column.is_a?(Series)

  # Series input: an empty Series has no last element to hand back.
  unless column.len > 0
    raise IndexError, "The series is empty, so no last value can be returned"
  end

  column[-1]
end
|
141
247
|
|
142
|
-
#
|
143
|
-
#
|
248
|
+
# Get the first `n` rows.
|
249
|
+
#
|
250
|
+
# @param column [Object]
|
251
|
+
# Column name or Series.
|
252
|
+
# @param n [Integer]
|
253
|
+
# Number of rows to return.
|
254
|
+
#
|
255
|
+
# @return [Object]
|
256
|
+
def head(column, n = 10)
  # A Series is used as-is; any other input is resolved to a column expression.
  target = column.is_a?(Series) ? column : col(column)
  target.head(n)
end
|
144
263
|
|
264
|
+
# Get the last `n` rows.
|
265
|
+
#
|
266
|
+
# @param column [Object]
|
267
|
+
# Column name or Series.
|
268
|
+
# @param n [Integer]
|
269
|
+
# Number of rows to return.
|
270
|
+
#
|
271
|
+
# @return [Object]
|
272
|
+
def tail(column, n = 10)
  # A Series is used as-is; any other input is resolved to a column expression.
  target = column.is_a?(Series) ? column : col(column)
  target.tail(n)
end
|
279
|
+
|
280
|
+
# Return an expression representing a literal value.
|
281
|
+
#
|
282
|
+
# @return [Expr]
|
145
283
|
def lit(value)
  # Non-Series values are handed to the native literal constructor unchanged.
  return Utils.wrap_expr(RbExpr.lit(value)) unless value.is_a?(Polars::Series)

  # Series input: the literal is built from the underlying `_s` object and
  # aliased to the Series name, unless that name is empty.
  series_name = value.name
  literal = Utils.wrap_expr(RbExpr.lit(value._s))
  series_name == "" ? literal : literal.alias(series_name)
end
|
148
296
|
|
149
297
|
# def cumsum
|
150
298
|
# end
|
151
299
|
|
152
|
-
#
|
153
|
-
#
|
300
|
+
# Compute the Spearman rank correlation between two columns.
|
301
|
+
#
|
302
|
+
# Missing data will be excluded from the computation.
|
303
|
+
#
|
304
|
+
# @param a [Object]
|
305
|
+
# Column name or Expression.
|
306
|
+
# @param b [Object]
|
307
|
+
# Column name or Expression.
|
308
|
+
# @param ddof [Integer]
|
309
|
+
# Delta degrees of freedom
|
310
|
+
# @param propagate_nans [Boolean]
|
311
|
+
# If `true` any `NaN` encountered will lead to `NaN` in the output.
|
312
|
+
# Defaults to `false` where `NaN` are regarded as larger than any finite number
|
313
|
+
# and thus lead to the highest rank.
|
314
|
+
#
|
315
|
+
# @return [Expr]
|
316
|
+
def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
  # String arguments are promoted to column expressions; anything else is
  # passed through untouched and must respond to `_rbexpr`.
  a = col(a) if a.is_a?(String)
  b = col(b) if b.is_a?(String)

  native = RbExpr.spearman_rank_corr(a._rbexpr, b._rbexpr, ddof, propagate_nans)
  Utils.wrap_expr(native)
end
|
154
325
|
|
155
|
-
#
|
156
|
-
#
|
326
|
+
# Compute the Pearson correlation between two columns.
|
327
|
+
#
|
328
|
+
# @param a [Object]
|
329
|
+
# Column name or Expression.
|
330
|
+
# @param b [Object]
|
331
|
+
# Column name or Expression.
|
332
|
+
# @param ddof [Integer]
|
333
|
+
# Delta degrees of freedom
|
334
|
+
#
|
335
|
+
# @return [Expr]
|
336
|
+
def pearson_corr(a, b, ddof: 1)
  # String arguments are promoted to column expressions; anything else is
  # passed through untouched and must respond to `_rbexpr`.
  a = col(a) if a.is_a?(String)
  b = col(b) if b.is_a?(String)

  native = RbExpr.pearson_corr(a._rbexpr, b._rbexpr, ddof)
  Utils.wrap_expr(native)
end
|
157
345
|
|
158
|
-
#
|
159
|
-
#
|
346
|
+
# Compute the covariance between two columns/expressions.
|
347
|
+
#
|
348
|
+
# @param a [Object]
|
349
|
+
# Column name or Expression.
|
350
|
+
# @param b [Object]
|
351
|
+
# Column name or Expression.
|
352
|
+
#
|
353
|
+
# @return [Expr]
|
354
|
+
def cov(a, b)
  # String arguments are promoted to column expressions; anything else is
  # passed through untouched and must respond to `_rbexpr`.
  a = col(a) if a.is_a?(String)
  b = col(b) if b.is_a?(String)

  native = RbExpr.cov(a._rbexpr, b._rbexpr)
  Utils.wrap_expr(native)
end
|
160
363
|
|
161
364
|
# def map
|
162
365
|
# end
|
@@ -164,6 +367,9 @@ module Polars
|
|
164
367
|
# def apply
|
165
368
|
# end
|
166
369
|
|
370
|
+
# Accumulate over multiple columns horizontally/row-wise with a left fold.
|
371
|
+
#
|
372
|
+
# @return [Expr]
|
167
373
|
def fold(acc, f, exprs)
|
168
374
|
acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
|
169
375
|
if exprs.is_a?(Expr)
|
@@ -189,6 +395,30 @@ module Polars
|
|
189
395
|
# def exclude
|
190
396
|
# end
|
191
397
|
|
398
|
+
# Do one of two things.
|
399
|
+
#
|
400
|
+
# * function can do a columnwise or elementwise AND operation
|
401
|
+
# * a wildcard column selection
|
402
|
+
#
|
403
|
+
# @param name [Object]
|
404
|
+
# If given this function will apply a bitwise & on the columns.
|
405
|
+
#
|
406
|
+
# @return [Expr]
|
407
|
+
#
|
408
|
+
# @example Sum all columns
|
409
|
+
# df = Polars::DataFrame.new(
|
410
|
+
# {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
|
411
|
+
# )
|
412
|
+
# df.select(Polars.all.sum)
|
413
|
+
# # =>
|
414
|
+
# # shape: (1, 3)
|
415
|
+
# # ┌─────┬──────┬─────┐
|
416
|
+
# # │ a ┆ b ┆ c │
|
417
|
+
# # │ --- ┆ --- ┆ --- │
|
418
|
+
# # │ i64 ┆ str ┆ i64 │
|
419
|
+
# # ╞═════╪══════╪═════╡
|
420
|
+
# # │ 6 ┆ null ┆ 3 │
|
421
|
+
# # └─────┴──────┴─────┘
|
192
422
|
def all(name = nil)
|
193
423
|
if name.nil?
|
194
424
|
col("*")
|
@@ -205,6 +435,26 @@ module Polars
|
|
205
435
|
# def quantile
|
206
436
|
# end
|
207
437
|
|
438
|
+
# Create a range expression (or Series).
|
439
|
+
#
|
440
|
+
# This can be used in a `select`, `with_column`, etc. Be sure that the resulting
|
441
|
+
# range size is equal to the length of the DataFrame you are collecting.
|
442
|
+
#
|
443
|
+
# @param low [Integer, Expr, Series]
|
444
|
+
# Lower bound of range.
|
445
|
+
# @param high [Integer, Expr, Series]
|
446
|
+
# Upper bound of range.
|
447
|
+
# @param step [Integer]
|
448
|
+
# Step size of the range.
|
449
|
+
# @param eager [Boolean]
|
450
|
+
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
451
|
+
# @param dtype [Symbol]
|
452
|
+
# Apply an explicit integer dtype to the resulting expression (default is Int64).
|
453
|
+
#
|
454
|
+
# @return [Expr, Series]
|
455
|
+
#
|
456
|
+
# @example
|
457
|
+
# df.lazy.filter(Polars.col("foo") < Polars.arange(0, 100)).collect
|
208
458
|
def arange(low, high, step: 1, eager: false, dtype: nil)
|
209
459
|
low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
|
210
460
|
high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
|
@@ -233,6 +483,9 @@ module Polars
|
|
233
483
|
# def format
|
234
484
|
# end
|
235
485
|
|
486
|
+
# Concat the arrays in a Series dtype List in linear time.
|
487
|
+
#
|
488
|
+
# @return [Expr]
|
236
489
|
def concat_list(exprs)
|
237
490
|
exprs = Utils.selection_to_rbexpr_list(exprs)
|
238
491
|
Utils.wrap_expr(RbExpr.concat_lst(exprs))
|
@@ -241,17 +494,132 @@ module Polars
|
|
241
494
|
# def collect_all
|
242
495
|
# end
|
243
496
|
|
244
|
-
#
|
245
|
-
#
|
497
|
+
# Run polars expressions without a context.
|
498
|
+
#
|
499
|
+
# @return [DataFrame]
|
500
|
+
def select(exprs)
  # The expressions are evaluated against a frame built from an empty array.
  empty_frame = DataFrame.new([])
  empty_frame.select(exprs)
end
|
246
503
|
|
247
|
-
#
|
248
|
-
#
|
504
|
+
# Collect several columns into a Series of dtype Struct.
|
505
|
+
#
|
506
|
+
# @param exprs [Object]
|
507
|
+
# Columns/Expressions to collect into a Struct
|
508
|
+
# @param eager [Boolean]
|
509
|
+
# Evaluate immediately
|
510
|
+
#
|
511
|
+
# @return [Object]
|
512
|
+
#
|
513
|
+
# @example
|
514
|
+
# Polars::DataFrame.new(
|
515
|
+
# {
|
516
|
+
# "int" => [1, 2],
|
517
|
+
# "str" => ["a", "b"],
|
518
|
+
# "bool" => [true, nil],
|
519
|
+
# "list" => [[1, 2], [3]],
|
520
|
+
# }
|
521
|
+
# ).select([Polars.struct(Polars.all).alias("my_struct")])
|
522
|
+
# # =>
|
523
|
+
# # shape: (2, 1)
|
524
|
+
# # ┌─────────────────────┐
|
525
|
+
# # │ my_struct │
|
526
|
+
# # │ --- │
|
527
|
+
# # │ struct[4] │
|
528
|
+
# # ╞═════════════════════╡
|
529
|
+
# # │ {1,"a",true,[1, 2]} │
|
530
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
531
|
+
# # │ {2,"b",null,[3]} │
|
532
|
+
# # └─────────────────────┘
|
533
|
+
#
|
534
|
+
# @example Only collect specific columns as a struct:
|
535
|
+
# df = Polars::DataFrame.new(
|
536
|
+
# {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
|
537
|
+
# )
|
538
|
+
# df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
|
539
|
+
# # =>
|
540
|
+
# # shape: (4, 4)
|
541
|
+
# # ┌─────┬───────┬─────┬─────────────┐
|
542
|
+
# # │ a ┆ b ┆ c ┆ a_and_b │
|
543
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
544
|
+
# # │ i64 ┆ str ┆ i64 ┆ struct[2] │
|
545
|
+
# # ╞═════╪═══════╪═════╪═════════════╡
|
546
|
+
# # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
|
547
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
548
|
+
# # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
|
549
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
550
|
+
# # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
|
551
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
552
|
+
# # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
|
553
|
+
# # └─────┴───────┴─────┴─────────────┘
|
554
|
+
def struct(exprs, eager: false)
  # Eager mode: build the lazy struct expression, evaluate it through
  # `Polars.select` and hand back the resulting Series. The explicit `return`
  # is essential — without it the evaluated Series is thrown away and the
  # method falls through to return an Expr regardless of `eager`.
  return Polars.select(struct(exprs, eager: false)).to_series if eager

  # Lazy mode: normalise the selection and wrap the native struct expression.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(_as_struct(rbexprs))
end
|
249
561
|
|
250
|
-
#
|
251
|
-
#
|
562
|
+
# Repeat a single value n times.
|
563
|
+
#
|
564
|
+
# @param value [Object]
|
565
|
+
# Value to repeat.
|
566
|
+
# @param n [Integer]
|
567
|
+
# Repeat `n` times.
|
568
|
+
# @param eager [Boolean]
|
569
|
+
# Run eagerly and collect into a `Series`.
|
570
|
+
# @param name [String]
|
571
|
+
# Only used in `eager` mode. As expression, use `alias`.
|
572
|
+
#
|
573
|
+
# @return [Expr]
|
574
|
+
def repeat(value, n, eager: false, name: nil)
  unless eager
    # Lazy path: an Integer count is lifted to a literal expression first.
    n = lit(n) if n.is_a?(Integer)
    return Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr))
  end

  # Eager path: an omitted name becomes the empty string.
  name = "" if name.nil?
  # NOTE(review): `py_type_to_dtype` and `type` are not defined in the visible
  # part of this file and look like untranslated Python — confirm that the
  # eager path actually runs before relying on it.
  dtype = py_type_to_dtype(type(value))
  Series._repeat(name, value, n, dtype)
end
|
252
588
|
|
253
|
-
#
|
254
|
-
#
|
589
|
+
# Return indices where `condition` evaluates `true`.
|
590
|
+
#
|
591
|
+
# @param condition [Expr]
|
592
|
+
# Boolean expression to evaluate
|
593
|
+
# @param eager [Boolean]
|
594
|
+
# Whether to apply this function eagerly (as opposed to lazily).
|
595
|
+
#
|
596
|
+
# @return [Expr, Series]
|
597
|
+
#
|
598
|
+
# @example
|
599
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
|
600
|
+
# df.select(
|
601
|
+
# [
|
602
|
+
# Polars.arg_where(Polars.col("a") % 2 == 0)
|
603
|
+
# ]
|
604
|
+
# ).to_series
|
605
|
+
# # =>
|
606
|
+
# # shape: (2,)
|
607
|
+
# # Series: 'a' [u32]
|
608
|
+
# # [
|
609
|
+
# # 1
|
610
|
+
# # 3
|
611
|
+
# # ]
|
612
|
+
def arg_where(condition, eager: false)
  unless eager
    # Lazy path: coerce the condition to an expression and wrap the native call.
    condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(_arg_where(condition._rbexpr))
  end

  # Eager path only accepts a Series.
  unless condition.is_a?(Series)
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
  end

  # Evaluate the lazy form against a one-column frame built from the Series.
  frame = condition.to_frame
  lazy_indices = arg_where(Polars.col(condition.name))
  frame.select(lazy_indices).to_series
end
|
255
623
|
|
256
624
|
# def coalesce
|
257
625
|
# end
|
@@ -259,6 +627,26 @@ module Polars
|
|
259
627
|
# def from_epoch
|
260
628
|
# end
|
261
629
|
|
630
|
+
# Start a "when, then, otherwise" expression.
|
631
|
+
#
|
632
|
+
# @return [When]
|
633
|
+
#
|
634
|
+
# @example
|
635
|
+
# df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
|
636
|
+
# df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
|
637
|
+
# # =>
|
638
|
+
# # shape: (3, 3)
|
639
|
+
# # ┌─────┬─────┬─────────┐
|
640
|
+
# # │ foo ┆ bar ┆ literal │
|
641
|
+
# # │ --- ┆ --- ┆ --- │
|
642
|
+
# # │ i64 ┆ i64 ┆ i32 │
|
643
|
+
# # ╞═════╪═════╪═════════╡
|
644
|
+
# # │ 1 ┆ 3 ┆ -1 │
|
645
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
646
|
+
# # │ 3 ┆ 4 ┆ 1 │
|
647
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
648
|
+
# # │ 4 ┆ 0 ┆ 1 │
|
649
|
+
# # └─────┴─────┴─────────┘
|
262
650
|
def when(expr)
|
263
651
|
expr = Utils.expr_to_lit_or_expr(expr)
|
264
652
|
pw = RbExpr.when(expr._rbexpr)
|
data/lib/polars/lazy_group_by.rb
CHANGED
@@ -1,13 +1,93 @@
|
|
1
1
|
module Polars
|
2
|
+
# Created by `df.lazy.groupby("foo")`.
|
2
3
|
class LazyGroupBy
  # @private
  def initialize(lgb, lazyframe_class)
    @lgb = lgb
    @lazyframe_class = lazyframe_class
  end

  # Describe the aggregations that need to be done on a group.
  #
  # @param aggs [Object]
  #   Aggregation selection; normalised with `Utils.selection_to_rbexpr_list`.
  #
  # @return [LazyFrame]
  def agg(aggs)
    rbexprs = Utils.selection_to_rbexpr_list(aggs)
    aggregated = @lgb.agg(rbexprs)
    @lazyframe_class._from_rbldf(aggregated)
  end

  # Get the first `n` rows of each group.
  #
  # @param n [Integer]
  #   Number of rows to return.
  #
  # @return [LazyFrame]
  #
  # @example
  #   df = Polars::DataFrame.new(
  #     {
  #       "letters" => ["c", "c", "a", "c", "a", "b"],
  #       "nrs" => [1, 2, 3, 4, 5, 6]
  #     }
  #   )
  #   df.lazy.groupby("letters").head(2).sort("letters").collect
  #   # =>
  #   # shape: (5, 2)
  #   # ┌─────────┬─────┐
  #   # │ letters ┆ nrs │
  #   # │ ---     ┆ --- │
  #   # │ str     ┆ i64 │
  #   # ╞═════════╪═════╡
  #   # │ a       ┆ 3   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ a       ┆ 5   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ b       ┆ 6   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ c       ┆ 1   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ c       ┆ 2   │
  #   # └─────────┴─────┘
  def head(n = 5)
    leading_rows = @lgb.head(n)
    @lazyframe_class._from_rbldf(leading_rows)
  end

  # Get the last `n` rows of each group.
  #
  # @param n [Integer]
  #   Number of rows to return.
  #
  # @return [LazyFrame]
  #
  # @example
  #   df = Polars::DataFrame.new(
  #     {
  #       "letters" => ["c", "c", "a", "c", "a", "b"],
  #       "nrs" => [1, 2, 3, 4, 5, 6]
  #     }
  #   )
  #   df.lazy.groupby("letters").tail(2).sort("letters").collect
  #   # =>
  #   # shape: (5, 2)
  #   # ┌─────────┬─────┐
  #   # │ letters ┆ nrs │
  #   # │ ---     ┆ --- │
  #   # │ str     ┆ i64 │
  #   # ╞═════════╪═════╡
  #   # │ a       ┆ 3   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ a       ┆ 5   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ b       ┆ 6   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ c       ┆ 2   │
  #   # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
  #   # │ c       ┆ 4   │
  #   # └─────────┴─────┘
  def tail(n = 5)
    trailing_rows = @lgb.tail(n)
    @lazyframe_class._from_rbldf(trailing_rows)
  end

  # def apply
  # end
end
|
13
93
|
end
|