polars-df 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,8 @@
1
1
  module Polars
2
2
  module LazyFunctions
3
+ # Return an expression representing a column in a DataFrame.
4
+ #
5
+ # @return [Expr]
3
6
  def col(name)
4
7
  if name.is_a?(Series)
5
8
  name = name.to_a
@@ -21,10 +24,42 @@ module Polars
21
24
  end
22
25
  end
23
26
 
27
+ # Alias for an element being evaluated in an `eval` expression.
28
+ #
29
+ # @return [Expr]
30
+ #
31
+ # @example A horizontal rank computation by taking the elements of a list
32
+ # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
33
+ # df.with_column(
34
+ # Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
35
+ # )
36
+ # # =>
37
+ # # shape: (3, 3)
38
+ # # ┌─────┬─────┬────────────┐
39
+ # # │ a ┆ b ┆ rank │
40
+ # # │ --- ┆ --- ┆ --- │
41
+ # # │ i64 ┆ i64 ┆ list[f32] │
42
+ # # ╞═════╪═════╪════════════╡
43
+ # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
44
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
45
+ # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
46
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
47
+ # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
48
+ # # └─────┴─────┴────────────┘
24
49
  def element
25
50
  col("")
26
51
  end
27
52
 
53
+ # Count the number of values in this column/context.
54
+ #
55
+ # @param column [String, Series, nil]
56
+ # If dtype is:
57
+ #
58
+ # * `Series` : count the values in the series.
59
+ # * `String` : count the values in this column.
60
+ # * `nil` : count the number of values in this context.
61
+ #
62
+ # @return [Expr, Integer]
28
63
  def count(column = nil)
29
64
  if column.nil?
30
65
  return Utils.wrap_expr(RbExpr.count)
@@ -37,9 +72,16 @@ module Polars
37
72
  end
38
73
  end
39
74
 
40
- # def to_list
41
- # end
75
+ # Aggregate to list.
76
+ #
77
+ # @return [Expr]
78
+ def to_list(name)
79
+ col(name).list
80
+ end
42
81
 
82
+ # Get the standard deviation.
83
+ #
84
+ # @return [Object]
43
85
  def std(column, ddof: 1)
44
86
  if column.is_a?(Series)
45
87
  column.std(ddof: ddof)
@@ -48,6 +90,9 @@ module Polars
48
90
  end
49
91
  end
50
92
 
93
+ # Get the variance.
94
+ #
95
+ # @return [Object]
51
96
  def var(column, ddof: 1)
52
97
  if column.is_a?(Series)
53
98
  column.var(ddof: ddof)
@@ -56,6 +101,16 @@ module Polars
56
101
  end
57
102
  end
58
103
 
104
+ # Get the maximum value.
105
+ #
106
+ # @param column [Object]
107
+ # Column(s) to be used in aggregation. Will lead to different behavior based on
108
+ # the input:
109
+ #
110
+ # - [String, Series] -> aggregate the maximum value of that column.
111
+ # - [Array<Expr>] -> aggregate the maximum value horizontally.
112
+ #
113
+ # @return [Expr, Object]
59
114
  def max(column)
60
115
  if column.is_a?(Series)
61
116
  column.max
@@ -68,6 +123,16 @@ module Polars
68
123
  end
69
124
  end
70
125
 
126
+ # Get the minimum value.
127
+ #
128
+ # @param column [Object]
129
+ # Column(s) to be used in aggregation. Will lead to different behavior based on
130
+ # the input:
131
+ #
132
+ # - [String, Series] -> aggregate the minimum value of that column.
133
+ # - [Array<Expr>] -> aggregate the minimum value horizontally.
134
+ #
135
+ # @return [Expr, Object]
71
136
  def min(column)
72
137
  if column.is_a?(Series)
73
138
  column.min
@@ -80,6 +145,9 @@ module Polars
80
145
  end
81
146
  end
82
147
 
148
+ # Sum values in a column/Series, or horizontally across list of columns/expressions.
149
+ #
150
+ # @return [Object]
83
151
  def sum(column)
84
152
  if column.is_a?(Series)
85
153
  column.sum
@@ -94,6 +162,9 @@ module Polars
94
162
  end
95
163
  end
96
164
 
165
+ # Get the mean value.
166
+ #
167
+ # @return [Expr, Float]
97
168
  def mean(column)
98
169
  if column.is_a?(Series)
99
170
  column.mean
@@ -102,10 +173,16 @@ module Polars
102
173
  end
103
174
  end
104
175
 
176
+ # Get the mean value.
177
+ #
178
+ # @return [Expr, Float]
105
179
  def avg(column)
106
180
  mean(column)
107
181
  end
108
182
 
183
+ # Get the median value.
184
+ #
185
+ # @return [Object]
109
186
  def median(column)
110
187
  if column.is_a?(Series)
111
188
  column.median
@@ -117,6 +194,9 @@ module Polars
117
194
  # def n_unique
118
195
  # end
119
196
 
197
+ # Get the first value.
198
+ #
199
+ # @return [Object]
120
200
  def first(column = nil)
121
201
  if column.nil?
122
202
  return Utils.wrap_expr(RbExpr.first)
@@ -142,7 +222,20 @@ module Polars
142
222
  # def tail
143
223
  # end
144
224
 
225
+ # Return an expression representing a literal value.
226
+ #
227
+ # @return [Expr]
145
228
  def lit(value)
229
+ if value.is_a?(Polars::Series)
230
+ name = value.name
231
+ value = value._s
232
+ e = Utils.wrap_expr(RbExpr.lit(value))
233
+ if name == ""
234
+ return e
235
+ end
236
+ return e.alias(name)
237
+ end
238
+
146
239
  Utils.wrap_expr(RbExpr.lit(value))
147
240
  end
148
241
 
@@ -164,6 +257,9 @@ module Polars
164
257
  # def apply
165
258
  # end
166
259
 
260
+ # Accumulate over multiple columns horizontally/row-wise with a left fold.
261
+ #
262
+ # @return [Expr]
167
263
  def fold(acc, f, exprs)
168
264
  acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
169
265
  if exprs.is_a?(Expr)
@@ -189,6 +285,30 @@ module Polars
189
285
  # def exclude
190
286
  # end
191
287
 
288
+ # Do one of two things.
289
+ #
290
+ # * function can do a columnwise or elementwise AND operation
291
+ # * a wildcard column selection
292
+ #
293
+ # @param name [Object]
294
+ # If given this function will apply a bitwise & on the columns.
295
+ #
296
+ # @return [Expr]
297
+ #
298
+ # @example Sum all columns
299
+ # df = Polars::DataFrame.new(
300
+ # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
301
+ # )
302
+ # df.select(Polars.all.sum)
303
+ # # =>
304
+ # # shape: (1, 3)
305
+ # # ┌─────┬──────┬─────┐
306
+ # # │ a ┆ b ┆ c │
307
+ # # │ --- ┆ --- ┆ --- │
308
+ # # │ i64 ┆ str ┆ i64 │
309
+ # # ╞═════╪══════╪═════╡
310
+ # # │ 6 ┆ null ┆ 3 │
311
+ # # └─────┴──────┴─────┘
192
312
  def all(name = nil)
193
313
  if name.nil?
194
314
  col("*")
@@ -205,6 +325,26 @@ module Polars
205
325
  # def quantile
206
326
  # end
207
327
 
328
+ # Create a range expression (or Series).
329
+ #
330
+ # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
331
+ # range size is equal to the length of the DataFrame you are collecting.
332
+ #
333
+ # @param low [Integer, Expr, Series]
334
+ # Lower bound of range.
335
+ # @param high [Integer, Expr, Series]
336
+ # Upper bound of range.
337
+ # @param step [Integer]
338
+ # Step size of the range.
339
+ # @param eager [Boolean]
340
+ # If eager evaluation is `true`, a Series is returned instead of an Expr.
341
+ # @param dtype [Symbol]
342
+ # Apply an explicit integer dtype to the resulting expression (default is Int64).
343
+ #
344
+ # @return [Expr, Series]
345
+ #
346
+ # @example
347
+ # df.lazy.filter(Polars.col("foo") < Polars.arange(0, 100)).collect
208
348
  def arange(low, high, step: 1, eager: false, dtype: nil)
209
349
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
210
350
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
@@ -233,6 +373,9 @@ module Polars
233
373
  # def format
234
374
  # end
235
375
 
376
+ # Concat the arrays in a Series dtype List in linear time.
377
+ #
378
+ # @return [Expr]
236
379
  def concat_list(exprs)
237
380
  exprs = Utils.selection_to_rbexpr_list(exprs)
238
381
  Utils.wrap_expr(RbExpr.concat_lst(exprs))
@@ -241,17 +384,132 @@ module Polars
241
384
  # def collect_all
242
385
  # end
243
386
 
244
- # def select
245
- # end
387
+ # Run polars expressions without a context.
388
+ #
389
+ # @return [DataFrame]
390
+ def select(exprs)
391
+ DataFrame.new([]).select(exprs)
392
+ end
246
393
 
247
- # def struct
248
- # end
394
+ # Collect several columns into a Series of dtype Struct.
395
+ #
396
+ # @param exprs [Object]
397
+ # Columns/Expressions to collect into a Struct
398
+ # @param eager [Boolean]
399
+ # Evaluate immediately
400
+ #
401
+ # @return [Object]
402
+ #
403
+ # @example
404
+ # Polars::DataFrame.new(
405
+ # {
406
+ # "int" => [1, 2],
407
+ # "str" => ["a", "b"],
408
+ # "bool" => [true, nil],
409
+ # "list" => [[1, 2], [3]],
410
+ # }
411
+ # ).select([Polars.struct(Polars.all()).alias("my_struct")])
412
+ # # =>
413
+ # # shape: (2, 1)
414
+ # # ┌─────────────────────┐
415
+ # # │ my_struct │
416
+ # # │ --- │
417
+ # # │ struct[4] │
418
+ # # ╞═════════════════════╡
419
+ # # │ {1,"a",true,[1, 2]} │
420
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
421
+ # # │ {2,"b",null,[3]} │
422
+ # # └─────────────────────┘
423
+ #
424
+ # @example Only collect specific columns as a struct:
425
+ # df = Polars::DataFrame.new(
426
+ # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
427
+ # )
428
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
429
+ # # =>
430
+ # # shape: (4, 4)
431
+ # # ┌─────┬───────┬─────┬─────────────┐
432
+ # # │ a ┆ b ┆ c ┆ a_and_b │
433
+ # # │ --- ┆ --- ┆ --- ┆ --- │
434
+ # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
435
+ # # ╞═════╪═══════╪═════╪═════════════╡
436
+ # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
437
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
438
+ # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
439
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
440
+ # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
441
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
442
+ # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
443
+ # # └─────┴───────┴─────┴─────────────┘
444
+ def struct(exprs, eager: false)
445
+ if eager
446
+ Polars.select(struct(exprs, eager: false)).to_series
447
+ end
448
+ exprs = Utils.selection_to_rbexpr_list(exprs)
449
+ Utils.wrap_expr(_as_struct(exprs))
450
+ end
249
451
 
250
- # def repeat
251
- # end
452
+ # Repeat a single value n times.
453
+ #
454
+ # @param value [Object]
455
+ # Value to repeat.
456
+ # @param n [Integer]
457
+ # Repeat `n` times.
458
+ # @param eager [Boolean]
459
+ # Run eagerly and collect into a `Series`.
460
+ # @param name [String]
461
+ # Only used in `eager` mode. As expression, use `alias`.
462
+ #
463
+ # @return [Expr, Series]
464
+ def repeat(value, n, eager: false, name: nil)
  # Two paths: eager mode builds its result through `Series._repeat`;
  # otherwise an expression is built with `RbExpr.repeat` and wrapped.
465
+ if eager
466
+ if name.nil?
  # No name supplied: fall back to the empty string.
467
+ name = ""
468
+ end
  # NOTE(review): neither `type` nor `py_type_to_dtype` is defined in the
  # visible source; both look like leftovers from the Python original.
  # Confirm they resolve at runtime, otherwise the eager path raises here.
469
+ dtype = py_type_to_dtype(type(value))
470
+ Series._repeat(name, value, n, dtype)
471
+ else
472
+ if n.is_a?(Integer)
  # An Integer count is turned into a literal expression so that `_rbexpr`
  # can be read from it on the next step.
473
+ n = lit(n)
474
+ end
  # `name` is not used on this path.
475
+ Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr))
476
+ end
477
+ end
252
478
 
253
- # def arg_where
254
- # end
479
+ # Return indices where `condition` evaluates `true`.
480
+ #
481
+ # @param condition [Expr]
482
+ # Boolean expression to evaluate
483
+ # @param eager [Boolean]
484
+ # Whether to apply this function eagerly (as opposed to lazily).
485
+ #
486
+ # @return [Expr, Series]
487
+ #
488
+ # @example
489
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
490
+ # df.select(
491
+ # [
492
+ # Polars.arg_where(Polars.col("a") % 2 == 0)
493
+ # ]
494
+ # ).to_series
495
+ # # =>
496
+ # # shape: (2,)
497
+ # # Series: 'a' [u32]
498
+ # # [
499
+ # # 1
500
+ # # 3
501
+ # # ]
502
+ def arg_where(condition, eager: false)
503
+ if eager
504
+ if !condition.is_a?(Series)
505
+ raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
506
+ end
507
+ condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
508
+ else
509
+ condition = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
510
+ Utils.wrap_expr(_arg_where(condition._rbexpr))
511
+ end
512
+ end
255
513
 
256
514
  # def coalesce
257
515
  # end
@@ -259,6 +517,26 @@ module Polars
259
517
  # def from_epoch
260
518
  # end
261
519
 
520
+ # Start a "when, then, otherwise" expression.
521
+ #
522
+ # @return [When]
523
+ #
524
+ # @example
525
+ # df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
526
+ # df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
527
+ # # =>
528
+ # # shape: (3, 3)
529
+ # # ┌─────┬─────┬─────────┐
530
+ # # │ foo ┆ bar ┆ literal │
531
+ # # │ --- ┆ --- ┆ --- │
532
+ # # │ i64 ┆ i64 ┆ i32 │
533
+ # # ╞═════╪═════╪═════════╡
534
+ # # │ 1 ┆ 3 ┆ -1 │
535
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
536
+ # # │ 3 ┆ 4 ┆ 1 │
537
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
538
+ # # │ 4 ┆ 0 ┆ 1 │
539
+ # # └─────┴─────┴─────────┘
262
540
  def when(expr)
263
541
  expr = Utils.expr_to_lit_or_expr(expr)
264
542
  pw = RbExpr.when(expr._rbexpr)
@@ -1,13 +1,92 @@
1
1
  module Polars
2
2
  class LazyGroupBy
3
+ # @private
3
4
  def initialize(lgb, lazyframe_class)
4
5
  @lgb = lgb
5
6
  @lazyframe_class = lazyframe_class
6
7
  end
7
8
 
9
+ # Describe the aggregations that need to be done on a group.
10
+ #
11
+ # @return [LazyFrame]
8
12
  def agg(aggs)
9
13
  rbexprs = Utils.selection_to_rbexpr_list(aggs)
10
14
  @lazyframe_class._from_rbldf(@lgb.agg(rbexprs))
11
15
  end
16
+
17
+ # Get the first `n` rows of each group.
18
+ #
19
+ # @param n [Integer]
20
+ # Number of rows to return.
21
+ #
22
+ # @return [LazyFrame]
23
+ #
24
+ # @example
25
+ # df = Polars::DataFrame.new(
26
+ # {
27
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
28
+ # "nrs" => [1, 2, 3, 4, 5, 6]
29
+ # }
30
+ # )
31
+ # df.groupby("letters").head(2).sort("letters")
32
+ # # =>
33
+ # # shape: (5, 2)
34
+ # # ┌─────────┬─────┐
35
+ # # │ letters ┆ nrs │
36
+ # # │ --- ┆ --- │
37
+ # # │ str ┆ i64 │
38
+ # # ╞═════════╪═════╡
39
+ # # │ a ┆ 3 │
40
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
41
+ # # │ a ┆ 5 │
42
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
43
+ # # │ b ┆ 6 │
44
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
45
+ # # │ c ┆ 1 │
46
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
47
+ # # │ c ┆ 2 │
48
+ # # └─────────┴─────┘
49
+ def head(n = 5)
50
+ @lazyframe_class._from_rbldf(@lgb.head(n))
51
+ end
52
+
53
+ # Get the last `n` rows of each group.
54
+ #
55
+ # @param n [Integer]
56
+ # Number of rows to return.
57
+ #
58
+ # @return [LazyFrame]
59
+ #
60
+ # @example
61
+ # df = Polars::DataFrame.new(
62
+ # {
63
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
64
+ # "nrs" => [1, 2, 3, 4, 5, 6]
65
+ # }
66
+ # )
67
+ # df.groupby("letters").tail(2).sort("letters")
68
+ # # =>
69
+ # # shape: (5, 2)
70
+ # # ┌─────────┬─────┐
71
+ # # │ letters ┆ nrs │
72
+ # # │ --- ┆ --- │
73
+ # # │ str ┆ i64 │
74
+ # # ╞═════════╪═════╡
75
+ # # │ a ┆ 3 │
76
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
77
+ # # │ a ┆ 5 │
78
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
79
+ # # │ b ┆ 6 │
80
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
81
+ # # │ c ┆ 2 │
82
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
83
+ # # │ c ┆ 4 │
84
+ # # └─────────┴─────┘
85
+ def tail(n = 5)
86
+ @lazyframe_class._from_rbldf(@lgb.tail(n))
87
+ end
88
+
89
+ # def apply
90
+ # end
12
91
  end
13
92
  end
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Namespace for list related expressions.
2
3
  class ListExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
@@ -41,6 +44,7 @@ module Polars
41
44
  # def concat
42
45
  # end
43
46
 
47
+ #
44
48
  def get(index)
45
49
  index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
46
50
  Utils.wrap_expr(_rbexpr.lst_get(index))
@@ -101,6 +105,7 @@ module Polars
101
105
  # Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
102
106
  # end
103
107
 
108
+ #
104
109
  def eval(expr, parallel: false)
105
110
  Utils.wrap_expr(_rbexpr.lst_eval(expr._rbexpr, parallel))
106
111
  end
@@ -1,31 +1,52 @@
1
1
  module Polars
2
+ # Namespace for expressions on a meta level.
2
3
  class MetaExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
8
11
 
12
+ # Equal.
13
+ #
14
+ # @return [Boolean]
9
15
  def ==(other)
10
16
  _rbexpr.meta_eq(other._rbexpr)
11
17
  end
12
18
 
19
+ # Not equal.
20
+ #
21
+ # @return [Boolean]
13
22
  def !=(other)
14
23
  !(self == other)
15
24
  end
16
25
 
26
+ # Pop the latest expression and return the input(s) of the popped expression.
27
+ #
28
+ # @return [Array]
17
29
  def pop
18
30
  _rbexpr.meta_pop.map { |e| Utils.wrap_expr(e) }
19
31
  end
20
32
 
33
+ # Get a list with the root column name.
34
+ #
35
+ # @return [Array]
21
36
  def root_names
22
37
  _rbexpr.meta_roots
23
38
  end
24
39
 
40
+ # Get the column name that this expression would produce.
41
+ #
42
+ # @return [String]
25
43
  def output_name
26
44
  _rbexpr.meta_output_name
27
45
  end
28
46
 
47
+ # Undo any renaming operation like `alias` or `keep_name`.
48
+ #
49
+ # @return [Expr]
29
50
  def undo_aliases
30
51
  Utils.wrap_expr(_rbexpr.meta_undo_aliases)
31
52
  end