polars-df 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,8 @@
1
1
  module Polars
2
2
  module LazyFunctions
3
+ # Return an expression representing a column in a DataFrame.
4
+ #
5
+ # @return [Expr]
3
6
  def col(name)
4
7
  if name.is_a?(Series)
5
8
  name = name.to_a
@@ -21,10 +24,42 @@ module Polars
21
24
  end
22
25
  end
23
26
 
27
+ # Alias for an element being evaluated in an `eval` expression.
28
+ #
29
+ # @return [Expr]
30
+ #
31
+ # @example A horizontal rank computation by taking the elements of a list
32
+ # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
33
+ # df.with_column(
34
+ # Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
35
+ # )
36
+ # # =>
37
+ # # shape: (3, 3)
38
+ # # ┌─────┬─────┬────────────┐
39
+ # # │ a ┆ b ┆ rank │
40
+ # # │ --- ┆ --- ┆ --- │
41
+ # # │ i64 ┆ i64 ┆ list[f32] │
42
+ # # ╞═════╪═════╪════════════╡
43
+ # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
44
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
45
+ # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
46
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
47
+ # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
48
+ # # └─────┴─────┴────────────┘
24
49
  def element
25
50
  col("")
26
51
  end
27
52
 
53
+ # Count the number of values in this column/context.
54
+ #
55
+ # @param column [String, Series, nil]
56
+ # If dtype is:
57
+ #
58
+ # * `Series` : count the values in the series.
59
+ # * `String` : count the values in this column.
60
+ # * `nil` : count the number of values in this context.
61
+ #
62
+ # @return [Expr, Integer]
28
63
  def count(column = nil)
29
64
  if column.nil?
30
65
  return Utils.wrap_expr(RbExpr.count)
@@ -37,9 +72,16 @@ module Polars
37
72
  end
38
73
  end
39
74
 
40
- # def to_list
41
- # end
75
+ # Aggregate to list.
76
+ #
77
+ # @return [Expr]
78
def to_list(name)
  # Build the column expression first, then aggregate it to a list.
  named_column = col(name)
  named_column.list
end
42
81
 
82
+ # Get the standard deviation.
83
+ #
84
+ # @return [Object]
43
85
  def std(column, ddof: 1)
44
86
  if column.is_a?(Series)
45
87
  column.std(ddof: ddof)
@@ -48,6 +90,9 @@ module Polars
48
90
  end
49
91
  end
50
92
 
93
+ # Get the variance.
94
+ #
95
+ # @return [Object]
51
96
  def var(column, ddof: 1)
52
97
  if column.is_a?(Series)
53
98
  column.var(ddof: ddof)
@@ -56,6 +101,16 @@ module Polars
56
101
  end
57
102
  end
58
103
 
104
+ # Get the maximum value.
105
+ #
106
+ # @param column [Object]
107
+ # Column(s) to be used in aggregation. Will lead to different behavior based on
108
+ # the input:
109
+ #
110
+ # - [String, Series] -> aggregate the maximum value of that column.
111
+ # - [Array<Expr>] -> aggregate the maximum value horizontally.
112
+ #
113
+ # @return [Expr, Object]
59
114
  def max(column)
60
115
  if column.is_a?(Series)
61
116
  column.max
@@ -68,6 +123,16 @@ module Polars
68
123
  end
69
124
  end
70
125
 
126
+ # Get the minimum value.
127
+ #
128
+ # @param column [Object]
129
+ # Column(s) to be used in aggregation. Will lead to different behavior based on
130
+ # the input:
131
+ #
132
+ # - [String, Series] -> aggregate the minimum value of that column.
133
+ # - [Array<Expr>] -> aggregate the minimum value horizontally.
134
+ #
135
+ # @return [Expr, Object]
71
136
  def min(column)
72
137
  if column.is_a?(Series)
73
138
  column.min
@@ -80,6 +145,9 @@ module Polars
80
145
  end
81
146
  end
82
147
 
148
+ # Sum values in a column/Series, or horizontally across list of columns/expressions.
149
+ #
150
+ # @return [Object]
83
151
  def sum(column)
84
152
  if column.is_a?(Series)
85
153
  column.sum
@@ -94,6 +162,9 @@ module Polars
94
162
  end
95
163
  end
96
164
 
165
+ # Get the mean value.
166
+ #
167
+ # @return [Expr, Float]
97
168
  def mean(column)
98
169
  if column.is_a?(Series)
99
170
  column.mean
@@ -102,10 +173,16 @@ module Polars
102
173
  end
103
174
  end
104
175
 
176
+ # Get the mean value (alias for `mean`).
177
+ #
178
+ # @return [Expr, Float]
105
179
  def avg(column)
106
180
  mean(column)
107
181
  end
108
182
 
183
+ # Get the median value.
184
+ #
185
+ # @return [Object]
109
186
  def median(column)
110
187
  if column.is_a?(Series)
111
188
  column.median
@@ -117,6 +194,9 @@ module Polars
117
194
  # def n_unique
118
195
  # end
119
196
 
197
+ # Get the first value.
198
+ #
199
+ # @return [Object]
120
200
  def first(column = nil)
121
201
  if column.nil?
122
202
  return Utils.wrap_expr(RbExpr.first)
@@ -142,7 +222,20 @@ module Polars
142
222
  # def tail
143
223
  # end
144
224
 
225
+ # Return an expression representing a literal value.
226
+ #
227
+ # @return [Expr]
145
228
  def lit(value)
229
+ if value.is_a?(Polars::Series)
230
+ name = value.name
231
+ value = value._s
232
+ e = Utils.wrap_expr(RbExpr.lit(value))
233
+ if name == ""
234
+ return e
235
+ end
236
+ return e.alias(name)
237
+ end
238
+
146
239
  Utils.wrap_expr(RbExpr.lit(value))
147
240
  end
148
241
 
@@ -164,6 +257,9 @@ module Polars
164
257
  # def apply
165
258
  # end
166
259
 
260
+ # Accumulate over multiple columns horizontally/row-wise with a left fold.
261
+ #
262
+ # @return [Expr]
167
263
  def fold(acc, f, exprs)
168
264
  acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
169
265
  if exprs.is_a?(Expr)
@@ -189,6 +285,30 @@ module Polars
189
285
  # def exclude
190
286
  # end
191
287
 
288
+ # Do one of two things.
289
+ #
290
+ # * function can do a columnwise or elementwise AND operation
291
+ # * a wildcard column selection
292
+ #
293
+ # @param name [Object]
294
+ # If given this function will apply a bitwise & on the columns.
295
+ #
296
+ # @return [Expr]
297
+ #
298
+ # @example Sum all columns
299
+ # df = Polars::DataFrame.new(
300
+ # {"a" => [1, 2, 3], "b" => ["hello", "foo", "bar"], "c" => [1, 1, 1]}
301
+ # )
302
+ # df.select(Polars.all.sum)
303
+ # # =>
304
+ # # shape: (1, 3)
305
+ # # ┌─────┬──────┬─────┐
306
+ # # │ a ┆ b ┆ c │
307
+ # # │ --- ┆ --- ┆ --- │
308
+ # # │ i64 ┆ str ┆ i64 │
309
+ # # ╞═════╪══════╪═════╡
310
+ # # │ 6 ┆ null ┆ 3 │
311
+ # # └─────┴──────┴─────┘
192
312
  def all(name = nil)
193
313
  if name.nil?
194
314
  col("*")
@@ -205,6 +325,26 @@ module Polars
205
325
  # def quantile
206
326
  # end
207
327
 
328
+ # Create a range expression (or Series).
329
+ #
330
+ # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
331
+ # range size is equal to the length of the DataFrame you are collecting.
332
+ #
333
+ # @param low [Integer, Expr, Series]
334
+ # Lower bound of range.
335
+ # @param high [Integer, Expr, Series]
336
+ # Upper bound of range.
337
+ # @param step [Integer]
338
+ # Step size of the range.
339
+ # @param eager [Boolean]
340
+ # If eager evaluation is `true`, a Series is returned instead of an Expr.
341
+ # @param dtype [Symbol]
342
+ # Apply an explicit integer dtype to the resulting expression (default is Int64).
343
+ #
344
+ # @return [Expr, Series]
345
+ #
346
+ # @example
347
+ # df.lazy.filter(Polars.col("foo") < Polars.arange(0, 100)).collect
208
348
  def arange(low, high, step: 1, eager: false, dtype: nil)
209
349
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
210
350
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
@@ -233,6 +373,9 @@ module Polars
233
373
  # def format
234
374
  # end
235
375
 
376
+ # Concat the arrays in a Series dtype List in linear time.
377
+ #
378
+ # @return [Expr]
236
379
  def concat_list(exprs)
237
380
  exprs = Utils.selection_to_rbexpr_list(exprs)
238
381
  Utils.wrap_expr(RbExpr.concat_lst(exprs))
@@ -241,17 +384,132 @@ module Polars
241
384
  # def collect_all
242
385
  # end
243
386
 
244
- # def select
245
- # end
387
+ # Run polars expressions without a context.
388
+ #
389
+ # @return [DataFrame]
390
def select(exprs)
  # There is no frame to select from, so start from an empty DataFrame.
  blank_frame = DataFrame.new([])
  blank_frame.select(exprs)
end
246
393
 
247
- # def struct
248
- # end
394
+ # Collect several columns into a Series of dtype Struct.
395
+ #
396
+ # @param exprs [Object]
397
+ # Columns/Expressions to collect into a Struct
398
+ # @param eager [Boolean]
399
+ # Evaluate immediately
400
+ #
401
+ # @return [Object]
402
+ #
403
+ # @example
404
+ # Polars::DataFrame.new(
405
+ # {
406
+ # "int" => [1, 2],
407
+ # "str" => ["a", "b"],
408
+ # "bool" => [true, nil],
409
+ # "list" => [[1, 2], [3]],
410
+ # }
411
+ # ).select([Polars.struct(Polars.all()).alias("my_struct")])
412
+ # # =>
413
+ # # shape: (2, 1)
414
+ # # ┌─────────────────────┐
415
+ # # │ my_struct │
416
+ # # │ --- │
417
+ # # │ struct[4] │
418
+ # # ╞═════════════════════╡
419
+ # # │ {1,"a",true,[1, 2]} │
420
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
421
+ # # │ {2,"b",null,[3]} │
422
+ # # └─────────────────────┘
423
+ #
424
+ # @example Only collect specific columns as a struct:
425
+ # df = Polars::DataFrame.new(
426
+ # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
427
+ # )
428
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
429
+ # # =>
430
+ # # shape: (4, 4)
431
+ # # ┌─────┬───────┬─────┬─────────────┐
432
+ # # │ a ┆ b ┆ c ┆ a_and_b │
433
+ # # │ --- ┆ --- ┆ --- ┆ --- │
434
+ # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
435
+ # # ╞═════╪═══════╪═════╪═════════════╡
436
+ # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
437
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
438
+ # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
439
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
440
+ # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
441
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
442
+ # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
443
+ # # └─────┴───────┴─────┴─────────────┘
444
def struct(exprs, eager: false)
  # Eager mode evaluates the lazily built struct expression right away and
  # hands back the resulting Series. The explicit `return` is required:
  # without it the Series is computed, discarded, and an Expr is returned.
  return Polars.select(struct(exprs, eager: false)).to_series if eager

  # Lazy mode: normalise the selection and wrap the struct expression.
  exprs = Utils.selection_to_rbexpr_list(exprs)
  Utils.wrap_expr(_as_struct(exprs))
end
249
451
 
250
- # def repeat
251
- # end
452
+ # Repeat a single value n times.
453
+ #
454
+ # @param value [Object]
455
+ # Value to repeat.
456
+ # @param n [Integer]
457
+ # Repeat `n` times.
458
+ # @param eager [Boolean]
459
+ # Run eagerly and collect into a `Series`.
460
+ # @param name [String]
461
+ # Only used in `eager` mode. As expression, use `alias`.
462
+ #
463
+ # @return [Expr, Series]
464
def repeat(value, n, eager: false, name: nil)
  if eager
    # Series names default to the empty string when none is given.
    name = "" if name.nil?
    # This line used to be an untranslated Python call,
    # `py_type_to_dtype(type(value))`. Ruby has no `type` method, so the
    # eager branch always raised NoMethodError.
    # NOTE(review): assumes Utils.rb_type_to_dtype maps a Ruby class to a
    # Polars dtype -- confirm against Utils.
    dtype = Utils.rb_type_to_dtype(value.class)
    Series._repeat(name, value, n, dtype)
  else
    # A plain Integer count is turned into a literal expression, since the
    # count is handed to RbExpr.repeat through `_rbexpr`.
    n = lit(n) if n.is_a?(Integer)
    Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr))
  end
end
252
478
 
253
- # def arg_where
254
- # end
479
+ # Return indices where `condition` evaluates `true`.
480
+ #
481
+ # @param condition [Expr]
482
+ # Boolean expression to evaluate
483
+ # @param eager [Boolean]
484
+ # Whether to apply this function eagerly (as opposed to lazily).
485
+ #
486
+ # @return [Expr, Series]
487
+ #
488
+ # @example
489
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
490
+ # df.select(
491
+ # [
492
+ # Polars.arg_where(Polars.col("a") % 2 == 0)
493
+ # ]
494
+ # ).to_series
495
+ # # =>
496
+ # # shape: (2,)
497
+ # # Series: 'a' [u32]
498
+ # # [
499
+ # # 1
500
+ # # 3
501
+ # # ]
502
def arg_where(condition, eager: false)
  # Lazy path: coerce the condition to an expression and wrap the result.
  unless eager
    predicate = Utils.expr_to_lit_or_expr(condition, str_to_lit: true)
    return Utils.wrap_expr(_arg_where(predicate._rbexpr))
  end

  # Eager path only accepts a Series.
  unless condition.is_a?(Series)
    raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager=True', got #{condition.class.name}"
  end

  # Put the Series in a frame and run the lazy form against its own column.
  frame = condition.to_frame
  frame.select(arg_where(Polars.col(condition.name))).to_series
end
255
513
 
256
514
  # def coalesce
257
515
  # end
@@ -259,6 +517,26 @@ module Polars
259
517
  # def from_epoch
260
518
  # end
261
519
 
520
+ # Start a "when, then, otherwise" expression.
521
+ #
522
+ # @return [When]
523
+ #
524
+ # @example
525
+ # df = Polars::DataFrame.new({"foo" => [1, 3, 4], "bar" => [3, 4, 0]})
526
+ # df.with_column(Polars.when(Polars.col("foo") > 2).then(Polars.lit(1)).otherwise(Polars.lit(-1)))
527
+ # # =>
528
+ # # shape: (3, 3)
529
+ # # ┌─────┬─────┬─────────┐
530
+ # # │ foo ┆ bar ┆ literal │
531
+ # # │ --- ┆ --- ┆ --- │
532
+ # # │ i64 ┆ i64 ┆ i32 │
533
+ # # ╞═════╪═════╪═════════╡
534
+ # # │ 1 ┆ 3 ┆ -1 │
535
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
536
+ # # │ 3 ┆ 4 ┆ 1 │
537
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
538
+ # # │ 4 ┆ 0 ┆ 1 │
539
+ # # └─────┴─────┴─────────┘
262
540
  def when(expr)
263
541
  expr = Utils.expr_to_lit_or_expr(expr)
264
542
  pw = RbExpr.when(expr._rbexpr)
@@ -1,13 +1,92 @@
1
1
  module Polars
2
2
  class LazyGroupBy
3
+ # @private
3
4
  def initialize(lgb, lazyframe_class)
4
5
  @lgb = lgb
5
6
  @lazyframe_class = lazyframe_class
6
7
  end
7
8
 
9
+ # Describe the aggregations that need to be done on a group.
10
+ #
11
+ # @return [LazyFrame]
8
12
  def agg(aggs)
9
13
  rbexprs = Utils.selection_to_rbexpr_list(aggs)
10
14
  @lazyframe_class._from_rbldf(@lgb.agg(rbexprs))
11
15
  end
16
+
17
+ # Get the first `n` rows of each group.
18
+ #
19
+ # @param n [Integer]
20
+ # Number of rows to return.
21
+ #
22
+ # @return [LazyFrame]
23
+ #
24
+ # @example
25
+ # df = Polars::DataFrame.new(
26
+ # {
27
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
28
+ # "nrs" => [1, 2, 3, 4, 5, 6]
29
+ # }
30
+ # )
31
+ # df.groupby("letters").head(2).sort("letters")
32
+ # # =>
33
+ # # shape: (5, 2)
34
+ # # ┌─────────┬─────┐
35
+ # # │ letters ┆ nrs │
36
+ # # │ --- ┆ --- │
37
+ # # │ str ┆ i64 │
38
+ # # ╞═════════╪═════╡
39
+ # # │ a ┆ 3 │
40
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
41
+ # # │ a ┆ 5 │
42
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
43
+ # # │ b ┆ 6 │
44
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
45
+ # # │ c ┆ 1 │
46
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
47
+ # # │ c ┆ 2 │
48
+ # # └─────────┴─────┘
49
def head(n = 5)
  # Ask the native group-by handle for the leading rows, then convert the
  # result through the lazy frame class.
  leading_rows = @lgb.head(n)
  @lazyframe_class._from_rbldf(leading_rows)
end
52
+
53
+ # Get the last `n` rows of each group.
54
+ #
55
+ # @param n [Integer]
56
+ # Number of rows to return.
57
+ #
58
+ # @return [LazyFrame]
59
+ #
60
+ # @example
61
+ # df = Polars::DataFrame.new(
62
+ # {
63
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
64
+ # "nrs" => [1, 2, 3, 4, 5, 6]
65
+ # }
66
+ # )
67
+ # df.groupby("letters").tail(2).sort("letters")
68
+ # # =>
69
+ # # shape: (5, 2)
70
+ # # ┌─────────┬─────┐
71
+ # # │ letters ┆ nrs │
72
+ # # │ --- ┆ --- │
73
+ # # │ str ┆ i64 │
74
+ # # ╞═════════╪═════╡
75
+ # # │ a ┆ 3 │
76
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
77
+ # # │ a ┆ 5 │
78
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
79
+ # # │ b ┆ 6 │
80
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
81
+ # # │ c ┆ 2 │
82
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
83
+ # # │ c ┆ 4 │
84
+ # # └─────────┴─────┘
85
def tail(n = 5)
  # Ask the native group-by handle for the trailing rows, then convert the
  # result through the lazy frame class.
  trailing_rows = @lgb.tail(n)
  @lazyframe_class._from_rbldf(trailing_rows)
end
88
+
89
+ # def apply
90
+ # end
12
91
  end
13
92
  end
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Namespace for list related expressions.
2
3
  class ListExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
@@ -41,6 +44,7 @@ module Polars
41
44
  # def concat
42
45
  # end
43
46
 
47
+ #
44
48
  def get(index)
45
49
  index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
46
50
  Utils.wrap_expr(_rbexpr.lst_get(index))
@@ -101,6 +105,7 @@ module Polars
101
105
  # Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
102
106
  # end
103
107
 
108
+ #
104
109
  def eval(expr, parallel: false)
105
110
  Utils.wrap_expr(_rbexpr.lst_eval(expr._rbexpr, parallel))
106
111
  end
@@ -1,31 +1,52 @@
1
1
  module Polars
2
+ # Namespace for expressions on a meta level.
2
3
  class MetaExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
8
11
 
12
+ # Equal.
13
+ #
14
+ # @return [Boolean]
9
15
  def ==(other)
10
16
  _rbexpr.meta_eq(other._rbexpr)
11
17
  end
12
18
 
19
+ # Not equal.
20
+ #
21
+ # @return [Boolean]
13
22
  def !=(other)
14
23
  !(self == other)
15
24
  end
16
25
 
26
+ # Pop the latest expression and return the input(s) of the popped expression.
27
+ #
28
+ # @return [Array]
17
29
  def pop
18
30
  _rbexpr.meta_pop.map { |e| Utils.wrap_expr(e) }
19
31
  end
20
32
 
33
+ # Get a list with the root column name.
34
+ #
35
+ # @return [Array]
21
36
  def root_names
22
37
  _rbexpr.meta_roots
23
38
  end
24
39
 
40
+ # Get the column name that this expression would produce.
41
+ #
42
+ # @return [String]
25
43
  def output_name
26
44
  _rbexpr.meta_output_name
27
45
  end
28
46
 
47
+ # Undo any renaming operation like `alias` or `keep_name`.
48
+ #
49
+ # @return [Expr]
29
50
  def undo_aliases
30
51
  Utils.wrap_expr(_rbexpr.meta_undo_aliases)
31
52
  end