polars-df 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/Cargo.lock +142 -11
  4. data/Cargo.toml +5 -0
  5. data/ext/polars/Cargo.toml +17 -1
  6. data/ext/polars/src/apply/dataframe.rs +292 -0
  7. data/ext/polars/src/apply/mod.rs +254 -0
  8. data/ext/polars/src/apply/series.rs +1173 -0
  9. data/ext/polars/src/conversion.rs +180 -5
  10. data/ext/polars/src/dataframe.rs +146 -1
  11. data/ext/polars/src/error.rs +12 -0
  12. data/ext/polars/src/lazy/apply.rs +34 -2
  13. data/ext/polars/src/lazy/dataframe.rs +74 -3
  14. data/ext/polars/src/lazy/dsl.rs +136 -0
  15. data/ext/polars/src/lib.rs +199 -1
  16. data/ext/polars/src/list_construction.rs +100 -0
  17. data/ext/polars/src/series.rs +331 -0
  18. data/ext/polars/src/utils.rs +25 -0
  19. data/lib/polars/cat_name_space.rb +54 -0
  20. data/lib/polars/convert.rb +100 -0
  21. data/lib/polars/data_frame.rb +1558 -60
  22. data/lib/polars/date_time_expr.rb +2 -2
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/dynamic_group_by.rb +49 -0
  25. data/lib/polars/expr.rb +4072 -107
  26. data/lib/polars/expr_dispatch.rb +8 -0
  27. data/lib/polars/functions.rb +192 -3
  28. data/lib/polars/group_by.rb +44 -3
  29. data/lib/polars/io.rb +20 -4
  30. data/lib/polars/lazy_frame.rb +800 -26
  31. data/lib/polars/lazy_functions.rb +687 -43
  32. data/lib/polars/lazy_group_by.rb +1 -0
  33. data/lib/polars/list_expr.rb +502 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/rolling_group_by.rb +35 -0
  36. data/lib/polars/series.rb +934 -62
  37. data/lib/polars/string_expr.rb +189 -13
  38. data/lib/polars/string_name_space.rb +690 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +44 -0
  41. data/lib/polars/version.rb +1 -1
  42. data/lib/polars.rb +14 -1
  43. metadata +15 -3
@@ -158,7 +158,7 @@ module Polars
158
158
  # TODO
159
159
  Utils.wrap_expr(_sum_exprs(exprs))
160
160
  else
161
- raise Todo
161
+ fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
162
162
  end
163
163
  end
164
164
 
@@ -191,8 +191,16 @@ module Polars
191
191
  end
192
192
  end
193
193
 
194
- # def n_unique
195
- # end
194
+ # Count unique values.
195
+ #
196
+ # @return [Object]
197
+ def n_unique(column)
198
+ if column.is_a?(Series)
199
+ column.n_unique
200
+ else
201
+ col(column).n_unique
202
+ end
203
+ end
196
204
 
197
205
  # Get the first value.
198
206
  #
@@ -213,14 +221,61 @@ module Polars
213
221
  end
214
222
  end
215
223
 
216
- # def last
217
- # end
224
+ # Get the last value.
225
+ #
226
+ # Depending on the input type this function does different things:
227
+ #
228
+ # - nil -> expression to take last column of a context.
229
+ # - String -> syntactic sugar for `Polars.col(..).last`
230
+ # - Series -> Take last value in `Series`
231
+ #
232
+ # @return [Object]
233
+ def last(column = nil)
234
+ if column.nil?
235
+ return Utils.wrap_expr(_last)
236
+ end
218
237
 
219
- # def head
220
- # end
238
+ if column.is_a?(Series)
239
+ if column.len > 0
240
+ return column[-1]
241
+ else
242
+ raise IndexError, "The series is empty, so no last value can be returned"
243
+ end
244
+ end
245
+ col(column).last
246
+ end
221
247
 
222
- # def tail
223
- # end
248
+ # Get the first `n` rows.
249
+ #
250
+ # @param column [Object]
251
+ # Column name or Series.
252
+ # @param n [Integer]
253
+ # Number of rows to return.
254
+ #
255
+ # @return [Object]
256
+ def head(column, n = 10)
257
+ if column.is_a?(Series)
258
+ column.head(n)
259
+ else
260
+ col(column).head(n)
261
+ end
262
+ end
263
+
264
+ # Get the last `n` rows.
265
+ #
266
+ # @param column [Object]
267
+ # Column name or Series.
268
+ # @param n [Integer]
269
+ # Number of rows to return.
270
+ #
271
+ # @return [Object]
272
+ def tail(column, n = 10)
273
+ if column.is_a?(Series)
274
+ column.tail(n)
275
+ else
276
+ col(column).tail(n)
277
+ end
278
+ end
224
279
 
225
280
  # Return an expression representing a literal value.
226
281
  #
@@ -239,17 +294,133 @@ module Polars
239
294
  Utils.wrap_expr(RbExpr.lit(value))
240
295
  end
241
296
 
242
- # def cumsum
243
- # end
297
+ # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
298
+ #
299
+ # @param column [Object]
300
+ # Column(s) to be used in aggregation.
301
+ #
302
+ # @return [Object]
303
+ #
304
+ # @example
305
+ # df = Polars::DataFrame.new(
306
+ # {
307
+ # "a" => [1, 2],
308
+ # "b" => [3, 4],
309
+ # "c" => [5, 6]
310
+ # }
311
+ # )
312
+ # # =>
313
+ # # shape: (2, 3)
314
+ # # ┌─────┬─────┬─────┐
315
+ # # │ a ┆ b ┆ c │
316
+ # # │ --- ┆ --- ┆ --- │
317
+ # # │ i64 ┆ i64 ┆ i64 │
318
+ # # ╞═════╪═════╪═════╡
319
+ # # │ 1 ┆ 3 ┆ 5 │
320
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
321
+ # # │ 2 ┆ 4 ┆ 6 │
322
+ # # └─────┴─────┴─────┘
323
+ #
324
+ # @example Cumulatively sum a column by name:
325
+ # df.select(Polars.cumsum("a"))
326
+ # # =>
327
+ # # shape: (2, 1)
328
+ # # ┌─────┐
329
+ # # │ a │
330
+ # # │ --- │
331
+ # # │ i64 │
332
+ # # ╞═════╡
333
+ # # │ 1 │
334
+ # # ├╌╌╌╌╌┤
335
+ # # │ 3 │
336
+ # # └─────┘
337
+ #
338
+ # @example Cumulatively sum a list of columns/expressions horizontally:
339
+ # df.with_column(Polars.cumsum(["a", "c"]))
340
+ # # =>
341
+ # # shape: (2, 4)
342
+ # # ┌─────┬─────┬─────┬───────────┐
343
+ # # │ a ┆ b ┆ c ┆ cumsum │
344
+ # # │ --- ┆ --- ┆ --- ┆ --- │
345
+ # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
346
+ # # ╞═════╪═════╪═════╪═══════════╡
347
+ # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
348
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
350
+ # # └─────┴─────┴─────┴───────────┘
351
+ def cumsum(column)
352
+ if column.is_a?(Series)
353
+ column.cumsum
354
+ elsif column.is_a?(String)
355
+ col(column).cumsum
356
+ else
357
+ cumfold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("cumsum")
358
+ end
359
+ end
244
360
 
245
- # def spearman_rank_corr
246
- # end
361
+ # Compute the Spearman rank correlation between two columns.
362
+ #
363
+ # Missing data will be excluded from the computation.
364
+ #
365
+ # @param a [Object]
366
+ # Column name or Expression.
367
+ # @param b [Object]
368
+ # Column name or Expression.
369
+ # @param ddof [Integer]
370
+ # Delta degrees of freedom
371
+ # @param propagate_nans [Boolean]
372
+ # If `true`, any `NaN` encountered will lead to `NaN` in the output.
373
+ # Defaults to `false`, where `NaN` values are regarded as larger than any finite number
374
+ # and thus lead to the highest rank.
375
+ #
376
+ # @return [Expr]
377
+ def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
378
+ if a.is_a?(String)
379
+ a = col(a)
380
+ end
381
+ if b.is_a?(String)
382
+ b = col(b)
383
+ end
384
+ Utils.wrap_expr(RbExpr.spearman_rank_corr(a._rbexpr, b._rbexpr, ddof, propagate_nans))
385
+ end
247
386
 
248
- # def pearson_corr
249
- # end
387
+ # Compute the Pearson correlation between two columns.
388
+ #
389
+ # @param a [Object]
390
+ # Column name or Expression.
391
+ # @param b [Object]
392
+ # Column name or Expression.
393
+ # @param ddof [Integer]
394
+ # Delta degrees of freedom
395
+ #
396
+ # @return [Expr]
397
+ def pearson_corr(a, b, ddof: 1)
398
+ if a.is_a?(String)
399
+ a = col(a)
400
+ end
401
+ if b.is_a?(String)
402
+ b = col(b)
403
+ end
404
+ Utils.wrap_expr(RbExpr.pearson_corr(a._rbexpr, b._rbexpr, ddof))
405
+ end
250
406
 
251
- # def cov
252
- # end
407
+ # Compute the covariance between two columns/expressions.
408
+ #
409
+ # @param a [Object]
410
+ # Column name or Expression.
411
+ # @param b [Object]
412
+ # Column name or Expression.
413
+ #
414
+ # @return [Expr]
415
+ def cov(a, b)
416
+ if a.is_a?(String)
417
+ a = col(a)
418
+ end
419
+ if b.is_a?(String)
420
+ b = col(b)
421
+ end
422
+ Utils.wrap_expr(RbExpr.cov(a._rbexpr, b._rbexpr))
423
+ end
253
424
 
254
425
  # def map
255
426
  # end
@@ -257,7 +428,7 @@ module Polars
257
428
  # def apply
258
429
  # end
259
430
 
260
- # Accumulate over multiple columns horizontally/ row wise with a left fold.
431
+ # Accumulate over multiple columns horizontally/row wise with a left fold.
261
432
  #
262
433
  # @return [Expr]
263
434
  def fold(acc, f, exprs)
@@ -273,17 +444,118 @@ module Polars
273
444
  # def reduce
274
445
  # end
275
446
 
276
- # def cumfold
277
- # end
447
+ # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
448
+ #
449
+ # Every cumulative result is added as a separate field in a Struct column.
450
+ #
451
+ # @param acc [Object]
452
+ # Accumulator Expression. This is the value that will be initialized when the fold
453
+ # starts. For a sum this could for instance be lit(0).
454
+ # @param f [Object]
455
+ # Function to apply over the accumulator and the value.
456
+ # Fn(acc, value) -> new_value
457
+ # @param exprs [Object]
458
+ # Expressions to aggregate over. May also be a wildcard expression.
459
+ # @param include_init [Boolean]
460
+ # Include the initial accumulator state as struct field.
461
+ #
462
+ # @return [Object]
463
+ #
464
+ # @note
465
+ # If you simply want the first encountered expression as accumulator,
466
+ # consider using `cumreduce`.
467
+ def cumfold(acc, f, exprs, include_init: false)
468
+ acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
469
+ if exprs.is_a?(Expr)
470
+ exprs = [exprs]
471
+ end
472
+
473
+ exprs = Utils.selection_to_rbexpr_list(exprs)
474
+ Utils.wrap_expr(RbExpr.cumfold(acc._rbexpr, f, exprs, include_init))
475
+ end
278
476
 
279
477
  # def cumreduce
280
478
  # end
281
479
 
282
- # def any
283
- # end
480
+ # Evaluate columnwise or elementwise with a bitwise OR operation.
481
+ #
482
+ # @return [Expr]
483
+ def any(name)
484
+ if name.is_a?(String)
485
+ col(name).any
486
+ else
487
+ fold(lit(false), ->(a, b) { a.cast(:bool) | b.cast(:bool) }, name).alias("any")
488
+ end
489
+ end
284
490
 
285
- # def exclude
286
- # end
491
+ # Exclude certain columns from a wildcard/regex selection.
492
+ #
493
+ # @param columns [Object]
494
+ # Column(s) to exclude from selection
495
+ # This can be:
496
+ #
497
+ # - a column name, or multiple column names
498
+ # - a regular expression starting with `^` and ending with `$`
499
+ # - a dtype or multiple dtypes
500
+ #
501
+ # @return [Object]
502
+ #
503
+ # @example
504
+ # df = Polars::DataFrame.new(
505
+ # {
506
+ # "aa" => [1, 2, 3],
507
+ # "ba" => ["a", "b", nil],
508
+ # "cc" => [nil, 2.5, 1.5]
509
+ # }
510
+ # )
511
+ # # =>
512
+ # # shape: (3, 3)
513
+ # # ┌─────┬──────┬──────┐
514
+ # # │ aa ┆ ba ┆ cc │
515
+ # # │ --- ┆ --- ┆ --- │
516
+ # # │ i64 ┆ str ┆ f64 │
517
+ # # ╞═════╪══════╪══════╡
518
+ # # │ 1 ┆ a ┆ null │
519
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
520
+ # # │ 2 ┆ b ┆ 2.5 │
521
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
522
+ # # │ 3 ┆ null ┆ 1.5 │
523
+ # # └─────┴──────┴──────┘
524
+ #
525
+ # @example Exclude by column name(s):
526
+ # df.select(Polars.exclude("ba"))
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌─────┬──────┐
530
+ # # │ aa ┆ cc │
531
+ # # │ --- ┆ --- │
532
+ # # │ i64 ┆ f64 │
533
+ # # ╞═════╪══════╡
534
+ # # │ 1 ┆ null │
535
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
536
+ # # │ 2 ┆ 2.5 │
537
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
538
+ # # │ 3 ┆ 1.5 │
539
+ # # └─────┴──────┘
540
+ #
541
+ # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
542
+ # df.select(Polars.exclude("^.*a$"))
543
+ # # =>
544
+ # # shape: (3, 1)
545
+ # # ┌──────┐
546
+ # # │ cc │
547
+ # # │ --- │
548
+ # # │ f64 │
549
+ # # ╞══════╡
550
+ # # │ null │
551
+ # # ├╌╌╌╌╌╌┤
552
+ # # │ 2.5 │
553
+ # # ├╌╌╌╌╌╌┤
554
+ # # │ 1.5 │
555
+ # # └──────┘
556
+ def exclude(columns)
557
+ col("*").exclude(columns)
558
+ end
287
559
 
288
560
  # Do one of two things.
289
561
  #
@@ -319,11 +591,26 @@ module Polars
319
591
  end
320
592
  end
321
593
 
322
- # def groups
323
- # end
594
+ # Syntactic sugar for `Polars.col("foo").agg_groups`.
595
+ #
596
+ # @return [Object]
597
+ def groups(column)
598
+ col(column).agg_groups
599
+ end
324
600
 
325
- # def quantile
326
- # end
601
+ # Syntactic sugar for `Polars.col("foo").quantile(...)`.
602
+ #
603
+ # @param column [String]
604
+ # Column name.
605
+ # @param quantile [Float]
606
+ # Quantile between 0.0 and 1.0.
607
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
608
+ # Interpolation method.
609
+ #
610
+ # @return [Expr]
611
+ def quantile(column, quantile, interpolation: "nearest")
612
+ col(column).quantile(quantile, interpolation: interpolation)
613
+ end
327
614
 
328
615
  # Create a range expression (or Series).
329
616
  #
@@ -339,7 +626,7 @@ module Polars
339
626
  # @param eager [Boolean]
340
627
  # If eager evaluation is `true`, a Series is returned instead of an Expr.
341
628
  # @param dtype [Symbol]
342
- # Apply an explicit integer dtype to the resulting expression (default is Int64).
629
+ # Apply an explicit integer dtype to the resulting expression (default is `:i64`).
343
630
  #
344
631
  # @return [Expr, Series]
345
632
  #
@@ -364,14 +651,212 @@ module Polars
364
651
  end
365
652
  end
366
653
 
367
- # def argsort_by
368
- # end
654
+ # Find the indexes that would sort the columns.
655
+ #
656
+ # Argsort by multiple columns. The first column will be used for the ordering.
657
+ # If there are duplicates in the first column, the second column will be used to
658
+ # determine the ordering and so on.
659
+ #
660
+ # @param exprs [Object]
661
+ # Columns used to determine the ordering.
662
+ # @param reverse [Boolean]
663
+ # Default is ascending.
664
+ #
665
+ # @return [Expr]
666
+ def argsort_by(exprs, reverse: false)
667
+ if !exprs.is_a?(Array)
668
+ exprs = [exprs]
669
+ end
670
+ if reverse == true || reverse == false
671
+ reverse = [reverse] * exprs.length
672
+ end
673
+ exprs = Utils.selection_to_rbexpr_list(exprs)
674
+ Utils.wrap_expr(RbExpr.argsort_by(exprs, reverse))
675
+ end
369
676
 
370
- # def duration
371
- # end
677
+ # Create polars `Duration` from distinct time components.
678
+ #
679
+ # @return [Expr]
680
+ #
681
+ # @example
682
+ # df = Polars::DataFrame.new(
683
+ # {
684
+ # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
685
+ # "add" => [1, 2]
686
+ # }
687
+ # )
688
+ # df.select(
689
+ # [
690
+ # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
691
+ # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
692
+ # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
693
+ # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
694
+ # "add_milliseconds"
695
+ # ),
696
+ # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
697
+ # ]
698
+ # )
699
+ # # =>
700
+ # # shape: (2, 5)
701
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
702
+ # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
703
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
704
+ # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
705
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
706
+ # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
707
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
708
+ # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
709
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
710
+ def duration(
711
+ days: nil,
712
+ seconds: nil,
713
+ nanoseconds: nil,
714
+ microseconds: nil,
715
+ milliseconds: nil,
716
+ minutes: nil,
717
+ hours: nil,
718
+ weeks: nil
719
+ )
720
+ if !hours.nil?
721
+ hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
722
+ end
723
+ if !minutes.nil?
724
+ minutes = Utils.expr_to_lit_or_expr(minutes, str_to_lit: false)._rbexpr
725
+ end
726
+ if !seconds.nil?
727
+ seconds = Utils.expr_to_lit_or_expr(seconds, str_to_lit: false)._rbexpr
728
+ end
729
+ if !milliseconds.nil?
730
+ milliseconds = Utils.expr_to_lit_or_expr(milliseconds, str_to_lit: false)._rbexpr
731
+ end
732
+ if !microseconds.nil?
733
+ microseconds = Utils.expr_to_lit_or_expr(microseconds, str_to_lit: false)._rbexpr
734
+ end
735
+ if !nanoseconds.nil?
736
+ nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
737
+ end
738
+ if !days.nil?
739
+ days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
740
+ end
741
+ if !weeks.nil?
742
+ weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
743
+ end
372
744
 
373
- # def format
374
- # end
745
+ Utils.wrap_expr(
746
+ _rb_duration(
747
+ days,
748
+ seconds,
749
+ nanoseconds,
750
+ microseconds,
751
+ milliseconds,
752
+ minutes,
753
+ hours,
754
+ weeks
755
+ )
756
+ )
757
+ end
758
+
759
+ # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
760
+ #
761
+ # @param exprs [Object]
762
+ # Columns to concat into a Utf8 Series.
763
+ # @param sep [String]
764
+ # String value that will be used to separate the values.
765
+ #
766
+ # @return [Expr]
767
+ #
768
+ # @example
769
+ # df = Polars::DataFrame.new(
770
+ # {
771
+ # "a" => [1, 2, 3],
772
+ # "b" => ["dogs", "cats", nil],
773
+ # "c" => ["play", "swim", "walk"]
774
+ # }
775
+ # )
776
+ # df.with_columns(
777
+ # [
778
+ # Polars.concat_str(
779
+ # [
780
+ # Polars.col("a") * 2,
781
+ # Polars.col("b"),
782
+ # Polars.col("c")
783
+ # ],
784
+ # sep: " "
785
+ # ).alias("full_sentence")
786
+ # ]
787
+ # )
788
+ # # =>
789
+ # # shape: (3, 4)
790
+ # # ┌─────┬──────┬──────┬───────────────┐
791
+ # # │ a ┆ b ┆ c ┆ full_sentence │
792
+ # # │ --- ┆ --- ┆ --- ┆ --- │
793
+ # # │ i64 ┆ str ┆ str ┆ str │
794
+ # # ╞═════╪══════╪══════╪═══════════════╡
795
+ # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
796
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
797
+ # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
798
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
799
+ # # │ 3 ┆ null ┆ walk ┆ null │
800
+ # # └─────┴──────┴──────┴───────────────┘
801
+ def concat_str(exprs, sep: "")
802
+ exprs = Utils.selection_to_rbexpr_list(exprs)
803
+ return Utils.wrap_expr(RbExpr.concat_str(exprs, sep))
804
+ end
805
+
806
+ # Format expressions as a string.
807
+ #
808
+ # @param fstring [String]
809
+ # A string with placeholders.
810
+ # For example: "hello_{}" or "{}_world".
811
+ # @param args [Object]
812
+ # Expression(s) that fill the placeholders
813
+ #
814
+ # @return [Expr]
815
+ #
816
+ # @example
817
+ # df = Polars::DataFrame.new(
818
+ # {
819
+ # "a": ["a", "b", "c"],
820
+ # "b": [1, 2, 3]
821
+ # }
822
+ # )
823
+ # df.select(
824
+ # [
825
+ # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
826
+ # ]
827
+ # )
828
+ # # =>
829
+ # # shape: (3, 1)
830
+ # # ┌─────────────┐
831
+ # # │ fmt │
832
+ # # │ --- │
833
+ # # │ str │
834
+ # # ╞═════════════╡
835
+ # # │ foo_a_bar_1 │
836
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
837
+ # # │ foo_b_bar_2 │
838
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
839
+ # # │ foo_c_bar_3 │
840
+ # # └─────────────┘
841
+ def format(fstring, *args)
842
+ if fstring.scan("{}").length != args.length
843
+ raise ArgumentError, "number of placeholders should equal the number of arguments"
844
+ end
845
+
846
+ exprs = []
847
+
848
+ arguments = args.each
849
+ fstring.split(/(\{\})/).each do |s|
850
+ if s == "{}"
851
+ e = Utils.expr_to_lit_or_expr(arguments.next, str_to_lit: false)
852
+ exprs << e
853
+ elsif s.length > 0
854
+ exprs << lit(s)
855
+ end
856
+ end
857
+
858
+ concat_str(exprs, sep: "")
859
+ end
375
860
 
376
861
  # Concat the arrays in a Series dtype List in linear time.
377
862
  #
@@ -381,8 +866,73 @@ module Polars
381
866
  Utils.wrap_expr(RbExpr.concat_lst(exprs))
382
867
  end
383
868
 
384
- # def collect_all
385
- # end
869
+ # Collect multiple LazyFrames at the same time.
870
+ #
871
+ # This runs all the computation graphs in parallel on the Polars thread pool.
872
+ #
873
+ # @param lazy_frames [Array]
874
+ # A list of LazyFrames to collect.
875
+ # @param type_coercion [Boolean]
876
+ # Do type coercion optimization.
877
+ # @param predicate_pushdown [Boolean]
878
+ # Do predicate pushdown optimization.
879
+ # @param projection_pushdown [Boolean]
880
+ # Do projection pushdown optimization.
881
+ # @param simplify_expression [Boolean]
882
+ # Run simplify expressions optimization.
883
+ # @param string_cache [Boolean]
884
+ # This argument is deprecated and will be ignored
885
+ # @param no_optimization [Boolean]
886
+ # Turn off optimizations.
887
+ # @param slice_pushdown [Boolean]
888
+ # Slice pushdown optimization.
889
+ # @param common_subplan_elimination [Boolean]
890
+ # Will try to cache branching subplans that occur on self-joins or unions.
891
+ # @param allow_streaming [Boolean]
892
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
893
+ #
894
+ # @return [Array]
895
+ def collect_all(
896
+ lazy_frames,
897
+ type_coercion: true,
898
+ predicate_pushdown: true,
899
+ projection_pushdown: true,
900
+ simplify_expression: true,
901
+ string_cache: false,
902
+ no_optimization: false,
903
+ slice_pushdown: true,
904
+ common_subplan_elimination: true,
905
+ allow_streaming: false
906
+ )
907
+ if no_optimization
908
+ predicate_pushdown = false
909
+ projection_pushdown = false
910
+ slice_pushdown = false
911
+ common_subplan_elimination = false
912
+ end
913
+
914
+ prepared = []
915
+
916
+ lazy_frames.each do |lf|
917
+ ldf = lf._ldf.optimization_toggle(
918
+ type_coercion,
919
+ predicate_pushdown,
920
+ projection_pushdown,
921
+ simplify_expression,
922
+ slice_pushdown,
923
+ common_subplan_elimination,
924
+ allow_streaming
925
+ )
926
+ prepared << ldf
927
+ end
928
+
929
+ out = _collect_all(prepared)
930
+
931
+ # wrap the rbdataframes into dataframe
932
+ result = out.map { |rbdf| Utils.wrap_df(rbdf) }
933
+
934
+ result
935
+ end
386
936
 
387
937
  # Run polars expressions without a context.
388
938
  #
@@ -408,7 +958,7 @@ module Polars
408
958
  # "bool" => [true, nil],
409
959
  # "list" => [[1, 2], [3]],
410
960
  # }
411
- # ).select([Polars.struct(Polars.all()).alias("my_struct")])
961
+ # ).select([Polars.struct(Polars.all).alias("my_struct")])
412
962
  # # =>
413
963
  # # shape: (2, 1)
414
964
  # # ┌─────────────────────┐
@@ -425,7 +975,7 @@ module Polars
425
975
  # df = Polars::DataFrame.new(
426
976
  # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
427
977
  # )
428
- # df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
978
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
429
979
  # # =>
430
980
  # # shape: (4, 4)
431
981
  # # ┌─────┬───────┬─────┬─────────────┐
@@ -511,11 +1061,105 @@ module Polars
511
1061
  end
512
1062
  end
513
1063
 
514
- # def coalesce
515
- # end
1064
+ # Folds the expressions from left to right, keeping the first non-null value.
1065
+ #
1066
+ # @param exprs [Object]
1067
+ # Expressions to coalesce.
1068
+ #
1069
+ # @return [Expr]
1070
+ #
1071
+ # @example
1072
+ # df = Polars::DataFrame.new(
1073
+ # [
1074
+ # [nil, 1.0, 1.0],
1075
+ # [nil, 2.0, 2.0],
1076
+ # [nil, nil, 3.0],
1077
+ # [nil, nil, nil]
1078
+ # ],
1079
+ # columns: [["a", :f64], ["b", :f64], ["c", :f64]]
1080
+ # )
1081
+ # df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
1082
+ # # =>
1083
+ # # shape: (4, 4)
1084
+ # # ┌──────┬──────┬──────┬──────┐
1085
+ # # │ a ┆ b ┆ c ┆ d │
1086
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1087
+ # # │ f64 ┆ f64 ┆ f64 ┆ f64 │
1088
+ # # ╞══════╪══════╪══════╪══════╡
1089
+ # # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
1090
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1091
+ # # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
1092
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1093
+ # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1094
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1095
+ # # │ null ┆ null ┆ null ┆ 99.9 │
1096
+ # # └──────┴──────┴──────┴──────┘
1097
+ def coalesce(exprs)
1098
+ exprs = Utils.selection_to_rbexpr_list(exprs)
1099
+ Utils.wrap_expr(_coalesce_exprs(exprs))
1100
+ end
516
1101
 
517
- # def from_epoch
518
- # end
1102
+ # Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
1103
+ #
1104
+ # Depending on the `unit` provided, this function will return a different dtype:
1105
+ # - unit: "d" returns pl.Date
1106
+ # - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
1107
+ # - unit: "ms" returns pl.Datetime["ms"]
1108
+ # - unit: "us" returns pl.Datetime["us"]
1109
+ # - unit: "ns" returns pl.Datetime["ns"]
1110
+ #
1111
+ # @param column [Object]
1112
+ # Series or expression to parse integers to pl.Datetime.
1113
+ # @param unit [String]
1114
+ # The unit of the timesteps since epoch time.
1115
+ # @param eager [Boolean]
1116
+ # If eager evaluation is `true`, a Series is returned instead of an Expr.
1117
+ #
1118
+ # @return [Object]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
1122
+ # df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
1123
+ # # =>
1124
+ # # shape: (2, 1)
1125
+ # # ┌─────────────────────┐
1126
+ # # │ timestamp │
1127
+ # # │ --- │
1128
+ # # │ datetime[μs] │
1129
+ # # ╞═════════════════════╡
1130
+ # # │ 2022-10-25 07:31:17 │
1131
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1132
+ # # │ 2022-10-25 07:31:39 │
1133
+ # # └─────────────────────┘
1134
+ def from_epoch(column, unit: "s", eager: false)
1135
+ if column.is_a?(String)
1136
+ column = col(column)
1137
+ elsif !column.is_a?(Series) && !column.is_a?(Expr)
1138
+ column = Series.new(column)
1139
+ end
1140
+
1141
+ if unit == "d"
1142
+ expr = column.cast(:date)
1143
+ elsif unit == "s"
1144
+ raise Todo
1145
+ # expr = (column.cast(:i64) * 1_000_000).cast(Datetime("us"))
1146
+ elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1147
+ raise Todo
1148
+ # expr = column.cast(Datetime(unit))
1149
+ else
1150
+ raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1151
+ end
1152
+
1153
+ if eager
1154
+ if !column.is_a?(Series)
1155
+ raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
1156
+ else
1157
+ column.to_frame.select(expr).to_series
1158
+ end
1159
+ else
1160
+ expr
1161
+ end
1162
+ end
519
1163
 
520
1164
  # Start a "when, then, otherwise" expression.
521
1165
  #