polars-df 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,103 +9,600 @@ module Polars
9
9
  self._rbexpr = expr._rbexpr
10
10
  end
11
11
 
12
+ # Get the length of the arrays as `:u32`.
13
+ #
14
+ # @return [Expr]
15
+ #
16
+ # @example
17
+ # df = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [["a", "b"], ["c"]]})
18
+ # df.select(Polars.col("bar").arr.lengths)
19
+ # # =>
20
+ # # shape: (2, 1)
21
+ # # ┌─────┐
22
+ # # │ bar │
23
+ # # │ --- │
24
+ # # │ u32 │
25
+ # # ╞═════╡
26
+ # # │ 2 │
27
+ # # ├╌╌╌╌╌┤
28
+ # # │ 1 │
29
+ # # └─────┘
12
30
  def lengths
13
31
  Utils.wrap_expr(_rbexpr.arr_lengths)
14
32
  end
15
33
 
34
+ # Sum all the lists in the array.
35
+ #
36
+ # @return [Expr]
37
+ #
38
+ # @example
39
+ # df = Polars::DataFrame.new({"values" => [[1], [2, 3]]})
40
+ # df.select(Polars.col("values").arr.sum)
41
+ # # =>
42
+ # # shape: (2, 1)
43
+ # # ┌────────┐
44
+ # # │ values │
45
+ # # │ --- │
46
+ # # │ i64 │
47
+ # # ╞════════╡
48
+ # # │ 1 │
49
+ # # ├╌╌╌╌╌╌╌╌┤
50
+ # # │ 5 │
51
+ # # └────────┘
16
52
  def sum
17
53
  Utils.wrap_expr(_rbexpr.lst_sum)
18
54
  end
19
55
 
56
+ # Compute the max value of the lists in the array.
57
+ #
58
+ # @return [Expr]
59
+ #
60
+ # @example
61
+ # df = Polars::DataFrame.new({"values" => [[1], [2, 3]]})
62
+ # df.select(Polars.col("values").arr.max)
63
+ # # =>
64
+ # # shape: (2, 1)
65
+ # # ┌────────┐
66
+ # # │ values │
67
+ # # │ --- │
68
+ # # │ i64 │
69
+ # # ╞════════╡
70
+ # # │ 1 │
71
+ # # ├╌╌╌╌╌╌╌╌┤
72
+ # # │ 3 │
73
+ # # └────────┘
20
74
  def max
21
75
  Utils.wrap_expr(_rbexpr.lst_max)
22
76
  end
23
77
 
78
+ # Compute the min value of the lists in the array.
79
+ #
80
+ # @return [Expr]
81
+ #
82
+ # @example
83
+ # df = Polars::DataFrame.new({"values" => [[1], [2, 3]]})
84
+ # df.select(Polars.col("values").arr.min)
85
+ # # =>
86
+ # # shape: (2, 1)
87
+ # # ┌────────┐
88
+ # # │ values │
89
+ # # │ --- │
90
+ # # │ i64 │
91
+ # # ╞════════╡
92
+ # # │ 1 │
93
+ # # ├╌╌╌╌╌╌╌╌┤
94
+ # # │ 2 │
95
+ # # └────────┘
24
96
  def min
25
97
  Utils.wrap_expr(_rbexpr.lst_min)
26
98
  end
27
99
 
100
+ # Compute the mean value of the lists in the array.
101
+ #
102
+ # @return [Expr]
103
+ #
104
+ # @example
105
+ # df = Polars::DataFrame.new({"values" => [[1], [2, 3]]})
106
+ # df.select(Polars.col("values").arr.mean)
107
+ # # =>
108
+ # # shape: (2, 1)
109
+ # # ┌────────┐
110
+ # # │ values │
111
+ # # │ --- │
112
+ # # │ f64 │
113
+ # # ╞════════╡
114
+ # # │ 1.0 │
115
+ # # ├╌╌╌╌╌╌╌╌┤
116
+ # # │ 2.5 │
117
+ # # └────────┘
28
118
  def mean
29
119
  Utils.wrap_expr(_rbexpr.lst_mean)
30
120
  end
31
121
 
122
+ # Sort the arrays in the list.
123
+ #
124
+ # @return [Expr]
125
+ #
126
+ # @example
127
+ # df = Polars::DataFrame.new(
128
+ # {
129
+ # "a" => [[3, 2, 1], [9, 1, 2]]
130
+ # }
131
+ # )
132
+ # df.select(Polars.col("a").arr.sort)
133
+ # # =>
134
+ # # shape: (2, 1)
135
+ # # ┌───────────┐
136
+ # # │ a │
137
+ # # │ --- │
138
+ # # │ list[i64] │
139
+ # # ╞═══════════╡
140
+ # # │ [1, 2, 3] │
141
+ # # ├╌╌╌╌╌╌╌╌╌╌╌┤
142
+ # # │ [1, 2, 9] │
143
+ # # └───────────┘
32
144
  def sort(reverse: false)
33
145
  Utils.wrap_expr(_rbexpr.lst_sort(reverse))
34
146
  end
35
147
 
148
+ # Reverse the arrays in the list.
149
+ #
150
+ # @return [Expr]
151
+ #
152
+ # @example
153
+ # df = Polars::DataFrame.new(
154
+ # {
155
+ # "a" => [[3, 2, 1], [9, 1, 2]]
156
+ # }
157
+ # )
158
+ # df.select(Polars.col("a").arr.reverse)
159
+ # # =>
160
+ # # shape: (2, 1)
161
+ # # ┌───────────┐
162
+ # # │ a │
163
+ # # │ --- │
164
+ # # │ list[i64] │
165
+ # # ╞═══════════╡
166
+ # # │ [1, 2, 3] │
167
+ # # ├╌╌╌╌╌╌╌╌╌╌╌┤
168
+ # # │ [2, 1, 9] │
169
+ # # └───────────┘
36
170
  def reverse
37
171
  Utils.wrap_expr(_rbexpr.lst_reverse)
38
172
  end
39
173
 
174
+ # Get the unique/distinct values in the list.
175
+ #
176
+ # @return [Expr]
177
+ #
178
+ # @example
179
+ # df = Polars::DataFrame.new(
180
+ # {
181
+ # "a" => [[1, 1, 2]]
182
+ # }
183
+ # )
184
+ # df.select(Polars.col("a").arr.unique)
185
+ # # =>
186
+ # # shape: (1, 1)
187
+ # # ┌───────────┐
188
+ # # │ a │
189
+ # # │ --- │
190
+ # # │ list[i64] │
191
+ # # ╞═══════════╡
192
+ # # │ [1, 2] │
193
+ # # └───────────┘
40
194
  def unique
41
195
  Utils.wrap_expr(_rbexpr.lst_unique)
42
196
  end
43
197
 
44
- # def concat
45
- # end
198
+ # Concatenate the arrays in a Series of dtype List in linear time.
199
+ #
200
+ # @param other [Object]
201
+ # Columns to concat into a List Series
202
+ #
203
+ # @return [Expr]
204
+ #
205
+ # @example
206
+ # df = Polars::DataFrame.new(
207
+ # {
208
+ # "a" => [["a"], ["x"]],
209
+ # "b" => [["b", "c"], ["y", "z"]]
210
+ # }
211
+ # )
212
+ # df.select(Polars.col("a").arr.concat("b"))
213
+ # # =>
214
+ # # shape: (2, 1)
215
+ # # ┌─────────────────┐
216
+ # # │ a │
217
+ # # │ --- │
218
+ # # │ list[str] │
219
+ # # ╞═════════════════╡
220
+ # # │ ["a", "b", "c"] │
221
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
222
+ # # │ ["x", "y", "z"] │
223
+ # # └─────────────────┘
224
+ def concat(other)
225
+ if other.is_a?(Array) && ![Expr, String, Series].any? { |c| other[0].is_a?(c) }
226
+ return concat(Series.new([other]))
227
+ end
228
+
229
+ if !other.is_a?(Array)
230
+ other_list = [other]
231
+ else
232
+ other_list = other.dup
233
+ end
46
234
 
235
+ other_list.insert(0, Utils.wrap_expr(_rbexpr))
236
+ Polars.concat_list(other_list)
237
+ end
238
+
239
+ # Get the value by index in the sublists.
240
+ #
241
+ # Index `0` returns the first item of every sublist
242
+ # and index `-1` returns the last item of every sublist.
243
+ # If an index is out of bounds, the result for that sublist is `null`.
244
+ #
245
+ # @param index [Integer]
246
+ # Index to return per sublist
47
247
  #
248
+ # @return [Expr]
249
+ #
250
+ # @example
251
+ # df = Polars::DataFrame.new({"foo" => [[3, 2, 1], [], [1, 2]]})
252
+ # df.select(Polars.col("foo").arr.get(0))
253
+ # # =>
254
+ # # shape: (3, 1)
255
+ # # ┌──────┐
256
+ # # │ foo │
257
+ # # │ --- │
258
+ # # │ i64 │
259
+ # # ╞══════╡
260
+ # # │ 3 │
261
+ # # ├╌╌╌╌╌╌┤
262
+ # # │ null │
263
+ # # ├╌╌╌╌╌╌┤
264
+ # # │ 1 │
265
+ # # └──────┘
48
266
  def get(index)
49
267
  index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
50
268
  Utils.wrap_expr(_rbexpr.lst_get(index))
51
269
  end
52
270
 
271
+ # Get the value by index in the sublists.
272
+ #
273
+ # @return [Expr]
53
274
  def [](item)
54
275
  get(item)
55
276
  end
56
277
 
278
+ # Get the first value of the sublists.
279
+ #
280
+ # @return [Expr]
281
+ #
282
+ # @example
283
+ # df = Polars::DataFrame.new({"foo" => [[3, 2, 1], [], [1, 2]]})
284
+ # df.select(Polars.col("foo").arr.first)
285
+ # # =>
286
+ # # shape: (3, 1)
287
+ # # ┌──────┐
288
+ # # │ foo │
289
+ # # │ --- │
290
+ # # │ i64 │
291
+ # # ╞══════╡
292
+ # # │ 3 │
293
+ # # ├╌╌╌╌╌╌┤
294
+ # # │ null │
295
+ # # ├╌╌╌╌╌╌┤
296
+ # # │ 1 │
297
+ # # └──────┘
57
298
  def first
58
299
  get(0)
59
300
  end
60
301
 
302
+ # Get the last value of the sublists.
303
+ #
304
+ # @return [Expr]
305
+ #
306
+ # @example
307
+ # df = Polars::DataFrame.new({"foo" => [[3, 2, 1], [], [1, 2]]})
308
+ # df.select(Polars.col("foo").arr.last)
309
+ # # =>
310
+ # # shape: (3, 1)
311
+ # # ┌──────┐
312
+ # # │ foo │
313
+ # # │ --- │
314
+ # # │ i64 │
315
+ # # ╞══════╡
316
+ # # │ 1 │
317
+ # # ├╌╌╌╌╌╌┤
318
+ # # │ null │
319
+ # # ├╌╌╌╌╌╌┤
320
+ # # │ 2 │
321
+ # # └──────┘
61
322
  def last
62
323
  get(-1)
63
324
  end
64
325
 
326
+ # Check if sublists contain the given item.
327
+ #
328
+ # @param item [Object]
329
+ # Item that will be checked for membership
330
+ #
331
+ # @return [Expr]
332
+ #
333
+ # @example
334
+ # df = Polars::DataFrame.new({"foo" => [[3, 2, 1], [], [1, 2]]})
335
+ # df.select(Polars.col("foo").arr.contains(1))
336
+ # # =>
337
+ # # shape: (3, 1)
338
+ # # ┌───────┐
339
+ # # │ foo │
340
+ # # │ --- │
341
+ # # │ bool │
342
+ # # ╞═══════╡
343
+ # # │ true │
344
+ # # ├╌╌╌╌╌╌╌┤
345
+ # # │ false │
346
+ # # ├╌╌╌╌╌╌╌┤
347
+ # # │ true │
348
+ # # └───────┘
65
349
  def contains(item)
66
350
  Utils.wrap_expr(_rbexpr.arr_contains(Utils.expr_to_lit_or_expr(item)._rbexpr))
67
351
  end
68
352
 
353
+ # Join all string items in a sublist and place a separator between them.
354
+ #
355
+ # This raises an error if the inner type of the list is not `:str`.
356
+ #
357
+ # @param separator [String]
358
+ # String to separate the items with.
359
+ #
360
+ # @return [Expr]
361
+ #
362
+ # @example
363
+ # df = Polars::DataFrame.new({"s" => [["a", "b", "c"], ["x", "y"]]})
364
+ # df.select(Polars.col("s").arr.join(" "))
365
+ # # =>
366
+ # # shape: (2, 1)
367
+ # # ┌───────┐
368
+ # # │ s │
369
+ # # │ --- │
370
+ # # │ str │
371
+ # # ╞═══════╡
372
+ # # │ a b c │
373
+ # # ├╌╌╌╌╌╌╌┤
374
+ # # │ x y │
375
+ # # └───────┘
69
376
  def join(separator)
70
377
  Utils.wrap_expr(_rbexpr.lst_join(separator))
71
378
  end
72
379
 
380
+ # Retrieve the index of the minimal value in every sublist.
381
+ #
382
+ # @return [Expr]
383
+ #
384
+ # @example
385
+ # df = Polars::DataFrame.new(
386
+ # {
387
+ # "a" => [[1, 2], [2, 1]]
388
+ # }
389
+ # )
390
+ # df.select(Polars.col("a").arr.arg_min)
391
+ # # =>
392
+ # # shape: (2, 1)
393
+ # # ┌─────┐
394
+ # # │ a │
395
+ # # │ --- │
396
+ # # │ u32 │
397
+ # # ╞═════╡
398
+ # # │ 0 │
399
+ # # ├╌╌╌╌╌┤
400
+ # # │ 1 │
401
+ # # └─────┘
73
402
  def arg_min
74
403
  Utils.wrap_expr(_rbexpr.lst_arg_min)
75
404
  end
76
405
 
406
+ # Retrieve the index of the maximum value in every sublist.
407
+ #
408
+ # @return [Expr]
409
+ #
410
+ # @example
411
+ # df = Polars::DataFrame.new(
412
+ # {
413
+ # "a" => [[1, 2], [2, 1]]
414
+ # }
415
+ # )
416
+ # df.select(Polars.col("a").arr.arg_max)
417
+ # # =>
418
+ # # shape: (2, 1)
419
+ # # ┌─────┐
420
+ # # │ a │
421
+ # # │ --- │
422
+ # # │ u32 │
423
+ # # ╞═════╡
424
+ # # │ 1 │
425
+ # # ├╌╌╌╌╌┤
426
+ # # │ 0 │
427
+ # # └─────┘
77
428
  def arg_max
78
429
  Utils.wrap_expr(_rbexpr.lst_arg_max)
79
430
  end
80
431
 
432
+ # Calculate the n-th discrete difference of every sublist.
433
+ #
434
+ # @param n [Integer]
435
+ # Number of slots to shift.
436
+ # @param null_behavior ["ignore", "drop"]
437
+ # How to handle null values.
438
+ #
439
+ # @return [Expr]
440
+ #
441
+ # @example
442
+ # s = Polars::Series.new("a", [[1, 2, 3, 4], [10, 2, 1]])
443
+ # s.arr.diff
444
+ # # =>
445
+ # # shape: (2,)
446
+ # # Series: 'a' [list]
447
+ # # [
448
+ # # [null, 1, ... 1]
449
+ # # [null, -8, -1]
450
+ # # ]
81
451
  def diff(n: 1, null_behavior: "ignore")
82
452
  Utils.wrap_expr(_rbexpr.lst_diff(n, null_behavior))
83
453
  end
84
454
 
455
+ # Shift values by the given period.
456
+ #
457
+ # @param periods [Integer]
458
+ # Number of places to shift (may be negative).
459
+ #
460
+ # @return [Expr]
461
+ #
462
+ # @example
463
+ # s = Polars::Series.new("a", [[1, 2, 3, 4], [10, 2, 1]])
464
+ # s.arr.shift
465
+ # # =>
466
+ # # shape: (2,)
467
+ # # Series: 'a' [list]
468
+ # # [
469
+ # # [null, 1, ... 3]
470
+ # # [null, 10, 2]
471
+ # # ]
85
472
  def shift(periods = 1)
86
473
  Utils.wrap_expr(_rbexpr.lst_shift(periods))
87
474
  end
88
475
 
476
+ # Slice every sublist.
477
+ #
478
+ # @param offset [Integer]
479
+ # Start index. Negative indexing is supported.
480
+ # @param length [Integer]
481
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
482
+ # end of the list.
483
+ #
484
+ # @return [Expr]
485
+ #
486
+ # @example
487
+ # s = Polars::Series.new("a", [[1, 2, 3, 4], [10, 2, 1]])
488
+ # s.arr.slice(1, 2)
489
+ # # =>
490
+ # # shape: (2,)
491
+ # # Series: 'a' [list]
492
+ # # [
493
+ # # [2, 3]
494
+ # # [2, 1]
495
+ # # ]
89
496
  def slice(offset, length = nil)
90
497
  offset = Utils.expr_to_lit_or_expr(offset, str_to_lit: false)._rbexpr
91
498
  length = Utils.expr_to_lit_or_expr(length, str_to_lit: false)._rbexpr
92
499
  Utils.wrap_expr(_rbexpr.lst_slice(offset, length))
93
500
  end
94
501
 
502
+ # Slice the first `n` values of every sublist.
503
+ #
504
+ # @param n [Integer]
505
+ # Number of values to return for each sublist.
506
+ #
507
+ # @return [Expr]
508
+ #
509
+ # @example
510
+ # s = Polars::Series.new("a", [[1, 2, 3, 4], [10, 2, 1]])
511
+ # s.arr.head(2)
512
+ # # =>
513
+ # # shape: (2,)
514
+ # # Series: 'a' [list]
515
+ # # [
516
+ # # [1, 2]
517
+ # # [10, 2]
518
+ # # ]
95
519
  def head(n = 5)
96
520
  slice(0, n)
97
521
  end
98
522
 
523
+ # Slice the last `n` values of every sublist.
524
+ #
525
+ # @param n [Integer]
526
+ # Number of values to return for each sublist.
527
+ #
528
+ # @return [Expr]
529
+ #
530
+ # @example
531
+ # s = Polars::Series.new("a", [[1, 2, 3, 4], [10, 2, 1]])
532
+ # s.arr.tail(2)
533
+ # # =>
534
+ # # shape: (2,)
535
+ # # Series: 'a' [list]
536
+ # # [
537
+ # # [3, 4]
538
+ # # [2, 1]
539
+ # # ]
99
540
  def tail(n = 5)
100
541
  offset = -Utils.expr_to_lit_or_expr(n, str_to_lit: false)
101
542
  slice(offset, n)
102
543
  end
103
544
 
104
- # def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
105
- # Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
106
- # end
545
+ # Convert the series of type `List` to a series of type `Struct`.
546
+ #
547
+ # @param n_field_strategy ["first_non_null", "max_width"]
548
+ # Strategy to determine the number of fields of the struct.
549
+ # @param name_generator [Object]
550
+ # A custom function that can be used to generate the field names.
551
+ # Default field names are `field_0, field_1 .. field_n`
552
+ #
553
+ # @return [Expr]
554
+ #
555
+ # @example
556
+ # df = Polars::DataFrame.new({"a" => [[1, 2, 3], [1, 2]]})
557
+ # df.select([Polars.col("a").arr.to_struct])
558
+ # # =>
559
+ # # shape: (2, 1)
560
+ # # ┌────────────┐
561
+ # # │ a │
562
+ # # │ --- │
563
+ # # │ struct[3] │
564
+ # # ╞════════════╡
565
+ # # │ {1,2,3} │
566
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
567
+ # # │ {1,2,null} │
568
+ # # └────────────┘
569
+ def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
570
+ raise Todo if name_generator
571
+ Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
572
+ end
107
573
 
574
+ # Run any polars expression against the lists' elements.
575
+ #
576
+ # @param expr [Expr]
577
+ # Expression to run. Note that you can select an element with `Polars.first`, or
578
+ # `Polars.col`
579
+ # @param parallel [Boolean]
580
+ # Run all expressions in parallel. Don't activate this blindly.
581
+ # Parallelism is worth it if there is enough work to do per thread.
582
+ #
583
+ # This likely should not be used in the groupby context, because we already
584
+ # have parallel execution per group.
585
+ #
586
+ # @return [Expr]
108
587
  #
588
+ # @example
589
+ # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
590
+ # df.with_column(
591
+ # Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
592
+ # )
593
+ # # =>
594
+ # # shape: (3, 3)
595
+ # # ┌─────┬─────┬────────────┐
596
+ # # │ a ┆ b ┆ rank │
597
+ # # │ --- ┆ --- ┆ --- │
598
+ # # │ i64 ┆ i64 ┆ list[f32] │
599
+ # # ╞═════╪═════╪════════════╡
600
+ # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
601
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
602
+ # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
603
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
604
+ # # │ 3 ┆ 2 ┆ [2.0, 1.0] │
605
+ # # └─────┴─────┴────────────┘
109
606
  def eval(expr, parallel: false)
110
607
  Utils.wrap_expr(_rbexpr.lst_eval(expr._rbexpr, parallel))
111
608
  end