polars-df 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/Cargo.lock +142 -11
  4. data/Cargo.toml +5 -0
  5. data/ext/polars/Cargo.toml +17 -1
  6. data/ext/polars/src/apply/dataframe.rs +292 -0
  7. data/ext/polars/src/apply/mod.rs +254 -0
  8. data/ext/polars/src/apply/series.rs +1173 -0
  9. data/ext/polars/src/conversion.rs +180 -5
  10. data/ext/polars/src/dataframe.rs +146 -1
  11. data/ext/polars/src/error.rs +12 -0
  12. data/ext/polars/src/lazy/apply.rs +34 -2
  13. data/ext/polars/src/lazy/dataframe.rs +74 -3
  14. data/ext/polars/src/lazy/dsl.rs +136 -0
  15. data/ext/polars/src/lib.rs +199 -1
  16. data/ext/polars/src/list_construction.rs +100 -0
  17. data/ext/polars/src/series.rs +331 -0
  18. data/ext/polars/src/utils.rs +25 -0
  19. data/lib/polars/cat_name_space.rb +54 -0
  20. data/lib/polars/convert.rb +100 -0
  21. data/lib/polars/data_frame.rb +1558 -60
  22. data/lib/polars/date_time_expr.rb +2 -2
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/dynamic_group_by.rb +49 -0
  25. data/lib/polars/expr.rb +4072 -107
  26. data/lib/polars/expr_dispatch.rb +8 -0
  27. data/lib/polars/functions.rb +192 -3
  28. data/lib/polars/group_by.rb +44 -3
  29. data/lib/polars/io.rb +20 -4
  30. data/lib/polars/lazy_frame.rb +800 -26
  31. data/lib/polars/lazy_functions.rb +687 -43
  32. data/lib/polars/lazy_group_by.rb +1 -0
  33. data/lib/polars/list_expr.rb +502 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/rolling_group_by.rb +35 -0
  36. data/lib/polars/series.rb +934 -62
  37. data/lib/polars/string_expr.rb +189 -13
  38. data/lib/polars/string_name_space.rb +690 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +44 -0
  41. data/lib/polars/version.rb +1 -1
  42. data/lib/polars.rb +14 -1
  43. metadata +15 -3
@@ -158,7 +158,7 @@ module Polars
158
158
  # TODO
159
159
  Utils.wrap_expr(_sum_exprs(exprs))
160
160
  else
161
- raise Todo
161
+ fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
162
162
  end
163
163
  end
164
164
 
@@ -191,8 +191,16 @@ module Polars
191
191
  end
192
192
  end
193
193
 
194
- # def n_unique
195
- # end
194
+ # Count unique values.
195
+ #
196
+ # @return [Object]
197
+ def n_unique(column)
198
+ if column.is_a?(Series)
199
+ column.n_unique
200
+ else
201
+ col(column).n_unique
202
+ end
203
+ end
196
204
 
197
205
  # Get the first value.
198
206
  #
@@ -213,14 +221,61 @@ module Polars
213
221
  end
214
222
  end
215
223
 
216
- # def last
217
- # end
224
+ # Get the last value.
225
+ #
226
+ # Depending on the input type this function does different things:
227
+ #
228
+ # - nil -> expression to take last column of a context.
229
+ # - String -> syntactic sugar for `Polars.col(..).last`
230
+ # - Series -> Take last value in `Series`
231
+ #
232
+ # @return [Object]
233
+ def last(column = nil)
234
+ if column.nil?
235
+ return Utils.wrap_expr(_last)
236
+ end
218
237
 
219
- # def head
220
- # end
238
+ if column.is_a?(Series)
239
+ if column.len > 0
240
+ return column[-1]
241
+ else
242
+ raise IndexError, "The series is empty, so no last value can be returned"
243
+ end
244
+ end
245
+ col(column).last
246
+ end
221
247
 
222
- # def tail
223
- # end
248
+ # Get the first `n` rows.
249
+ #
250
+ # @param column [Object]
251
+ # Column name or Series.
252
+ # @param n [Integer]
253
+ # Number of rows to return.
254
+ #
255
+ # @return [Object]
256
+ def head(column, n = 10)
257
+ if column.is_a?(Series)
258
+ column.head(n)
259
+ else
260
+ col(column).head(n)
261
+ end
262
+ end
263
+
264
+ # Get the last `n` rows.
265
+ #
266
+ # @param column [Object]
267
+ # Column name or Series.
268
+ # @param n [Integer]
269
+ # Number of rows to return.
270
+ #
271
+ # @return [Object]
272
+ def tail(column, n = 10)
273
+ if column.is_a?(Series)
274
+ column.tail(n)
275
+ else
276
+ col(column).tail(n)
277
+ end
278
+ end
224
279
 
225
280
  # Return an expression representing a literal value.
226
281
  #
@@ -239,17 +294,133 @@ module Polars
239
294
  Utils.wrap_expr(RbExpr.lit(value))
240
295
  end
241
296
 
242
- # def cumsum
243
- # end
297
+ # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
298
+ #
299
+ # @param column [Object]
300
+ # Column(s) to be used in aggregation.
301
+ #
302
+ # @return [Object]
303
+ #
304
+ # @example
305
+ # df = Polars::DataFrame.new(
306
+ # {
307
+ # "a" => [1, 2],
308
+ # "b" => [3, 4],
309
+ # "c" => [5, 6]
310
+ # }
311
+ # )
312
+ # # =>
313
+ # # shape: (2, 3)
314
+ # # ┌─────┬─────┬─────┐
315
+ # # │ a ┆ b ┆ c │
316
+ # # │ --- ┆ --- ┆ --- │
317
+ # # │ i64 ┆ i64 ┆ i64 │
318
+ # # ╞═════╪═════╪═════╡
319
+ # # │ 1 ┆ 3 ┆ 5 │
320
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
321
+ # # │ 2 ┆ 4 ┆ 6 │
322
+ # # └─────┴─────┴─────┘
323
+ #
324
+ # @example Cumulatively sum a column by name:
325
+ # df.select(Polars.cumsum("a"))
326
+ # # =>
327
+ # # shape: (2, 1)
328
+ # # ┌─────┐
329
+ # # │ a │
330
+ # # │ --- │
331
+ # # │ i64 │
332
+ # # ╞═════╡
333
+ # # │ 1 │
334
+ # # ├╌╌╌╌╌┤
335
+ # # │ 3 │
336
+ # # └─────┘
337
+ #
338
+ # @example Cumulatively sum a list of columns/expressions horizontally:
339
+ # df.with_column(Polars.cumsum(["a", "c"]))
340
+ # # =>
341
+ # # shape: (2, 4)
342
+ # # ┌─────┬─────┬─────┬───────────┐
343
+ # # │ a ┆ b ┆ c ┆ cumsum │
344
+ # # │ --- ┆ --- ┆ --- ┆ --- │
345
+ # # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
346
+ # # ╞═════╪═════╪═════╪═══════════╡
347
+ # # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
348
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
350
+ # # └─────┴─────┴─────┴───────────┘
351
+ def cumsum(column)
352
+ if column.is_a?(Series)
353
+ column.cumsum
354
+ elsif column.is_a?(String)
355
+ col(column).cumsum
356
+ else
357
+ cumfold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("cumsum")
358
+ end
359
+ end
244
360
 
245
- # def spearman_rank_corr
246
- # end
361
+ # Compute the Spearman rank correlation between two columns.
362
+ #
363
+ # Missing data will be excluded from the computation.
364
+ #
365
+ # @param a [Object]
366
+ # Column name or Expression.
367
+ # @param b [Object]
368
+ # Column name or Expression.
369
+ # @param ddof [Integer]
370
+ # Delta degrees of freedom
371
+ # @param propagate_nans [Boolean]
372
+ # If `true` any `NaN` encountered will lead to `NaN` in the output.
373
+ # Defaults to `false` where `NaN` are regarded as larger than any finite number
374
+ # and thus lead to the highest rank.
375
+ #
376
+ # @return [Expr]
377
+ def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
378
+ if a.is_a?(String)
379
+ a = col(a)
380
+ end
381
+ if b.is_a?(String)
382
+ b = col(b)
383
+ end
384
+ Utils.wrap_expr(RbExpr.spearman_rank_corr(a._rbexpr, b._rbexpr, ddof, propagate_nans))
385
+ end
247
386
 
248
- # def pearson_corr
249
- # end
387
+ # Compute the Pearson correlation between two columns.
388
+ #
389
+ # @param a [Object]
390
+ # Column name or Expression.
391
+ # @param b [Object]
392
+ # Column name or Expression.
393
+ # @param ddof [Integer]
394
+ # Delta degrees of freedom
395
+ #
396
+ # @return [Expr]
397
+ def pearson_corr(a, b, ddof: 1)
398
+ if a.is_a?(String)
399
+ a = col(a)
400
+ end
401
+ if b.is_a?(String)
402
+ b = col(b)
403
+ end
404
+ Utils.wrap_expr(RbExpr.pearson_corr(a._rbexpr, b._rbexpr, ddof))
405
+ end
250
406
 
251
- # def cov
252
- # end
407
+ # Compute the covariance between two columns/expressions.
408
+ #
409
+ # @param a [Object]
410
+ # Column name or Expression.
411
+ # @param b [Object]
412
+ # Column name or Expression.
413
+ #
414
+ # @return [Expr]
415
+ def cov(a, b)
416
+ if a.is_a?(String)
417
+ a = col(a)
418
+ end
419
+ if b.is_a?(String)
420
+ b = col(b)
421
+ end
422
+ Utils.wrap_expr(RbExpr.cov(a._rbexpr, b._rbexpr))
423
+ end
253
424
 
254
425
  # def map
255
426
  # end
@@ -257,7 +428,7 @@ module Polars
257
428
  # def apply
258
429
  # end
259
430
 
260
- # Accumulate over multiple columns horizontally/ row wise with a left fold.
431
+ # Accumulate over multiple columns horizontally/row wise with a left fold.
261
432
  #
262
433
  # @return [Expr]
263
434
  def fold(acc, f, exprs)
@@ -273,17 +444,118 @@ module Polars
273
444
  # def reduce
274
445
  # end
275
446
 
276
- # def cumfold
277
- # end
447
+ # Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
448
+ #
449
+ # Every cumulative result is added as a separate field in a Struct column.
450
+ #
451
+ # @param acc [Object]
452
+ # Accumulator Expression. This is the value that will be initialized when the fold
453
+ # starts. For a sum this could for instance be lit(0).
454
+ # @param f [Object]
455
+ # Function to apply over the accumulator and the value.
456
+ # Fn(acc, value) -> new_value
457
+ # @param exprs [Object]
458
+ # Expressions to aggregate over. May also be a wildcard expression.
459
+ # @param include_init [Boolean]
460
+ # Include the initial accumulator state as struct field.
461
+ #
462
+ # @return [Object]
463
+ #
464
+ # @note
465
+ # If you simply want the first encountered expression as accumulator,
466
+ # consider using `cumreduce`.
467
+ def cumfold(acc, f, exprs, include_init: false)
468
+ acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
469
+ if exprs.is_a?(Expr)
470
+ exprs = [exprs]
471
+ end
472
+
473
+ exprs = Utils.selection_to_rbexpr_list(exprs)
474
+ Utils.wrap_expr(RbExpr.cumfold(acc._rbexpr, f, exprs, include_init))
475
+ end
278
476
 
279
477
  # def cumreduce
280
478
  # end
281
479
 
282
- # def any
283
- # end
480
+ # Evaluate columnwise or elementwise with a bitwise OR operation.
481
+ #
482
+ # @return [Expr]
483
+ def any(name)
484
+ if name.is_a?(String)
485
+ col(name).any
486
+ else
487
+ fold(lit(false), ->(a, b) { a.cast(:bool) | b.cast(:bool) }, name).alias("any")
488
+ end
489
+ end
284
490
 
285
- # def exclude
286
- # end
491
+ # Exclude certain columns from a wildcard/regex selection.
492
+ #
493
+ # @param columns [Object]
494
+ # Column(s) to exclude from selection
495
+ # This can be:
496
+ #
497
+ # - a column name, or multiple column names
498
+ # - a regular expression starting with `^` and ending with `$`
499
+ # - a dtype or multiple dtypes
500
+ #
501
+ # @return [Object]
502
+ #
503
+ # @example
504
+ # df = Polars::DataFrame.new(
505
+ # {
506
+ # "aa" => [1, 2, 3],
507
+ # "ba" => ["a", "b", nil],
508
+ # "cc" => [nil, 2.5, 1.5]
509
+ # }
510
+ # )
511
+ # # =>
512
+ # # shape: (3, 3)
513
+ # # ┌─────┬──────┬──────┐
514
+ # # │ aa ┆ ba ┆ cc │
515
+ # # │ --- ┆ --- ┆ --- │
516
+ # # │ i64 ┆ str ┆ f64 │
517
+ # # ╞═════╪══════╪══════╡
518
+ # # │ 1 ┆ a ┆ null │
519
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
520
+ # # │ 2 ┆ b ┆ 2.5 │
521
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
522
+ # # │ 3 ┆ null ┆ 1.5 │
523
+ # # └─────┴──────┴──────┘
524
+ #
525
+ # @example Exclude by column name(s):
526
+ # df.select(Polars.exclude("ba"))
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌─────┬──────┐
530
+ # # │ aa ┆ cc │
531
+ # # │ --- ┆ --- │
532
+ # # │ i64 ┆ f64 │
533
+ # # ╞═════╪══════╡
534
+ # # │ 1 ┆ null │
535
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
536
+ # # │ 2 ┆ 2.5 │
537
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
538
+ # # │ 3 ┆ 1.5 │
539
+ # # └─────┴──────┘
540
+ #
541
+ # @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
542
+ # df.select(Polars.exclude("^.*a$"))
543
+ # # =>
544
+ # # shape: (3, 1)
545
+ # # ┌──────┐
546
+ # # │ cc │
547
+ # # │ --- │
548
+ # # │ f64 │
549
+ # # ╞══════╡
550
+ # # │ null │
551
+ # # ├╌╌╌╌╌╌┤
552
+ # # │ 2.5 │
553
+ # # ├╌╌╌╌╌╌┤
554
+ # # │ 1.5 │
555
+ # # └──────┘
556
+ def exclude(columns)
557
+ col("*").exclude(columns)
558
+ end
287
559
 
288
560
  # Do one of two things.
289
561
  #
@@ -319,11 +591,26 @@ module Polars
319
591
  end
320
592
  end
321
593
 
322
- # def groups
323
- # end
594
+ # Syntactic sugar for `Polars.col("foo").agg_groups`.
595
+ #
596
+ # @return [Object]
597
+ def groups(column)
598
+ col(column).agg_groups
599
+ end
324
600
 
325
- # def quantile
326
- # end
601
+ # Syntactic sugar for `Polars.col("foo").quantile(...)`.
602
+ #
603
+ # @param column [String]
604
+ # Column name.
605
+ # @param quantile [Float]
606
+ # Quantile between 0.0 and 1.0.
607
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
608
+ # Interpolation method.
609
+ #
610
+ # @return [Expr]
611
+ def quantile(column, quantile, interpolation: "nearest")
612
+ col(column).quantile(quantile, interpolation: interpolation)
613
+ end
327
614
 
328
615
  # Create a range expression (or Series).
329
616
  #
@@ -339,7 +626,7 @@ module Polars
339
626
  # @param eager [Boolean]
340
627
  # If eager evaluation is `True`, a Series is returned instead of an Expr.
341
628
  # @param dtype [Symbol]
342
- # Apply an explicit integer dtype to the resulting expression (default is Int64).
629
+ # Apply an explicit integer dtype to the resulting expression (default is `:i64`).
343
630
  #
344
631
  # @return [Expr, Series]
345
632
  #
@@ -364,14 +651,212 @@ module Polars
364
651
  end
365
652
  end
366
653
 
367
- # def argsort_by
368
- # end
654
+ # Find the indexes that would sort the columns.
655
+ #
656
+ # Argsort by multiple columns. The first column will be used for the ordering.
657
+ # If there are duplicates in the first column, the second column will be used to
658
+ # determine the ordering and so on.
659
+ #
660
+ # @param exprs [Object]
661
+ # Columns used to determine the ordering.
662
+ # @param reverse [Boolean]
663
+ # Default is ascending.
664
+ #
665
+ # @return [Expr]
666
+ def argsort_by(exprs, reverse: false)
667
+ if !exprs.is_a?(Array)
668
+ exprs = [exprs]
669
+ end
670
+ if reverse == true || reverse == false
671
+ reverse = [reverse] * exprs.length
672
+ end
673
+ exprs = Utils.selection_to_rbexpr_list(exprs)
674
+ Utils.wrap_expr(RbExpr.argsort_by(exprs, reverse))
675
+ end
369
676
 
370
- # def duration
371
- # end
677
+ # Create polars `Duration` from distinct time components.
678
+ #
679
+ # @return [Expr]
680
+ #
681
+ # @example
682
+ # df = Polars::DataFrame.new(
683
+ # {
684
+ # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
685
+ # "add" => [1, 2]
686
+ # }
687
+ # )
688
+ # df.select(
689
+ # [
690
+ # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
691
+ # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
692
+ # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
693
+ # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
694
+ # "add_milliseconds"
695
+ # ),
696
+ # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
697
+ # ]
698
+ # )
699
+ # # =>
700
+ # # shape: (2, 5)
701
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
702
+ # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
703
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
704
+ # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
705
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
706
+ # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
707
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
708
+ # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
709
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
710
+ def duration(
711
+ days: nil,
712
+ seconds: nil,
713
+ nanoseconds: nil,
714
+ microseconds: nil,
715
+ milliseconds: nil,
716
+ minutes: nil,
717
+ hours: nil,
718
+ weeks: nil
719
+ )
720
+ if !hours.nil?
721
+ hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
722
+ end
723
+ if !minutes.nil?
724
+ minutes = Utils.expr_to_lit_or_expr(minutes, str_to_lit: false)._rbexpr
725
+ end
726
+ if !seconds.nil?
727
+ seconds = Utils.expr_to_lit_or_expr(seconds, str_to_lit: false)._rbexpr
728
+ end
729
+ if !milliseconds.nil?
730
+ milliseconds = Utils.expr_to_lit_or_expr(milliseconds, str_to_lit: false)._rbexpr
731
+ end
732
+ if !microseconds.nil?
733
+ microseconds = Utils.expr_to_lit_or_expr(microseconds, str_to_lit: false)._rbexpr
734
+ end
735
+ if !nanoseconds.nil?
736
+ nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
737
+ end
738
+ if !days.nil?
739
+ days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
740
+ end
741
+ if !weeks.nil?
742
+ weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
743
+ end
372
744
 
373
- # def format
374
- # end
745
+ Utils.wrap_expr(
746
+ _rb_duration(
747
+ days,
748
+ seconds,
749
+ nanoseconds,
750
+ microseconds,
751
+ milliseconds,
752
+ minutes,
753
+ hours,
754
+ weeks
755
+ )
756
+ )
757
+ end
758
+
759
+ # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
760
+ #
761
+ # @param exprs [Object]
762
+ # Columns to concat into a Utf8 Series.
763
+ # @param sep [String]
764
+ # String value that will be used to separate the values.
765
+ #
766
+ # @return [Expr]
767
+ #
768
+ # @example
769
+ # df = Polars::DataFrame.new(
770
+ # {
771
+ # "a" => [1, 2, 3],
772
+ # "b" => ["dogs", "cats", nil],
773
+ # "c" => ["play", "swim", "walk"]
774
+ # }
775
+ # )
776
+ # df.with_columns(
777
+ # [
778
+ # Polars.concat_str(
779
+ # [
780
+ # Polars.col("a") * 2,
781
+ # Polars.col("b"),
782
+ # Polars.col("c")
783
+ # ],
784
+ # sep: " "
785
+ # ).alias("full_sentence")
786
+ # ]
787
+ # )
788
+ # # =>
789
+ # # shape: (3, 4)
790
+ # # ┌─────┬──────┬──────┬───────────────┐
791
+ # # │ a ┆ b ┆ c ┆ full_sentence │
792
+ # # │ --- ┆ --- ┆ --- ┆ --- │
793
+ # # │ i64 ┆ str ┆ str ┆ str │
794
+ # # ╞═════╪══════╪══════╪═══════════════╡
795
+ # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
796
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
797
+ # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
798
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
799
+ # # │ 3 ┆ null ┆ walk ┆ null │
800
+ # # └─────┴──────┴──────┴───────────────┘
801
+ def concat_str(exprs, sep: "")
802
+ exprs = Utils.selection_to_rbexpr_list(exprs)
803
+ return Utils.wrap_expr(RbExpr.concat_str(exprs, sep))
804
+ end
805
+
806
+ # Format expressions as a string.
807
+ #
808
+ # @param fstring [String]
809
+ # A string with placeholders.
810
+ # For example: "hello_{}" or "{}_world"
811
+ # @param args [Object]
812
+ # Expression(s) that fill the placeholders
813
+ #
814
+ # @return [Expr]
815
+ #
816
+ # @example
817
+ # df = Polars::DataFrame.new(
818
+ # {
819
+ # "a": ["a", "b", "c"],
820
+ # "b": [1, 2, 3]
821
+ # }
822
+ # )
823
+ # df.select(
824
+ # [
825
+ # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
826
+ # ]
827
+ # )
828
+ # # =>
829
+ # # shape: (3, 1)
830
+ # # ┌─────────────┐
831
+ # # │ fmt │
832
+ # # │ --- │
833
+ # # │ str │
834
+ # # ╞═════════════╡
835
+ # # │ foo_a_bar_1 │
836
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
837
+ # # │ foo_b_bar_2 │
838
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
839
+ # # │ foo_c_bar_3 │
840
+ # # └─────────────┘
841
+ def format(fstring, *args)
842
+ if fstring.scan("{}").length != args.length
843
+ raise ArgumentError, "number of placeholders should equal the number of arguments"
844
+ end
845
+
846
+ exprs = []
847
+
848
+ arguments = args.each
849
+ fstring.split(/(\{\})/).each do |s|
850
+ if s == "{}"
851
+ e = Utils.expr_to_lit_or_expr(arguments.next, str_to_lit: false)
852
+ exprs << e
853
+ elsif s.length > 0
854
+ exprs << lit(s)
855
+ end
856
+ end
857
+
858
+ concat_str(exprs, sep: "")
859
+ end
375
860
 
376
861
  # Concat the arrays in a Series dtype List in linear time.
377
862
  #
@@ -381,8 +866,73 @@ module Polars
381
866
  Utils.wrap_expr(RbExpr.concat_lst(exprs))
382
867
  end
383
868
 
384
- # def collect_all
385
- # end
869
+ # Collect multiple LazyFrames at the same time.
870
+ #
871
+ # This runs all the computation graphs in parallel on the Polars thread pool.
872
+ #
873
+ # @param lazy_frames [Array]
874
+ # A list of LazyFrames to collect.
875
+ # @param type_coercion [Boolean]
876
+ # Do type coercion optimization.
877
+ # @param predicate_pushdown [Boolean]
878
+ # Do predicate pushdown optimization.
879
+ # @param projection_pushdown [Boolean]
880
+ # Do projection pushdown optimization.
881
+ # @param simplify_expression [Boolean]
882
+ # Run simplify expressions optimization.
883
+ # @param string_cache [Boolean]
884
+ # This argument is deprecated and will be ignored
885
+ # @param no_optimization [Boolean]
886
+ # Turn off optimizations.
887
+ # @param slice_pushdown [Boolean]
888
+ # Slice pushdown optimization.
889
+ # @param common_subplan_elimination [Boolean]
890
+ # Will try to cache branching subplans that occur on self-joins or unions.
891
+ # @param allow_streaming [Boolean]
892
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
893
+ #
894
+ # @return [Array]
895
+ def collect_all(
896
+ lazy_frames,
897
+ type_coercion: true,
898
+ predicate_pushdown: true,
899
+ projection_pushdown: true,
900
+ simplify_expression: true,
901
+ string_cache: false,
902
+ no_optimization: false,
903
+ slice_pushdown: true,
904
+ common_subplan_elimination: true,
905
+ allow_streaming: false
906
+ )
907
+ if no_optimization
908
+ predicate_pushdown = false
909
+ projection_pushdown = false
910
+ slice_pushdown = false
911
+ common_subplan_elimination = false
912
+ end
913
+
914
+ prepared = []
915
+
916
+ lazy_frames.each do |lf|
917
+ ldf = lf._ldf.optimization_toggle(
918
+ type_coercion,
919
+ predicate_pushdown,
920
+ projection_pushdown,
921
+ simplify_expression,
922
+ slice_pushdown,
923
+ common_subplan_elimination,
924
+ allow_streaming
925
+ )
926
+ prepared << ldf
927
+ end
928
+
929
+ out = _collect_all(prepared)
930
+
931
+ # wrap the rbdataframes into dataframe
932
+ result = out.map { |rbdf| Utils.wrap_df(rbdf) }
933
+
934
+ result
935
+ end
386
936
 
387
937
  # Run polars expressions without a context.
388
938
  #
@@ -408,7 +958,7 @@ module Polars
408
958
  # "bool" => [true, nil],
409
959
  # "list" => [[1, 2], [3]],
410
960
  # }
411
- # ).select([Polars.struct(Polars.all()).alias("my_struct")])
961
+ # ).select([Polars.struct(Polars.all).alias("my_struct")])
412
962
  # # =>
413
963
  # # shape: (2, 1)
414
964
  # # ┌─────────────────────┐
@@ -425,7 +975,7 @@ module Polars
425
975
  # df = Polars::DataFrame.new(
426
976
  # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
427
977
  # )
428
- # df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
978
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
429
979
  # # =>
430
980
  # # shape: (4, 4)
431
981
  # # ┌─────┬───────┬─────┬─────────────┐
@@ -511,11 +1061,105 @@ module Polars
511
1061
  end
512
1062
  end
513
1063
 
514
- # def coalesce
515
- # end
1064
+ # Folds the expressions from left to right, keeping the first non-null value.
1065
+ #
1066
+ # @param exprs [Object]
1067
+ # Expressions to coalesce.
1068
+ #
1069
+ # @return [Expr]
1070
+ #
1071
+ # @example
1072
+ # df = Polars::DataFrame.new(
1073
+ # [
1074
+ # [nil, 1.0, 1.0],
1075
+ # [nil, 2.0, 2.0],
1076
+ # [nil, nil, 3.0],
1077
+ # [nil, nil, nil]
1078
+ # ],
1079
+ # columns: [["a", :f64], ["b", :f64], ["c", :f64]]
1080
+ # )
1081
+ # df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
1082
+ # # =>
1083
+ # # shape: (4, 4)
1084
+ # # ┌──────┬──────┬──────┬──────┐
1085
+ # # │ a ┆ b ┆ c ┆ d │
1086
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1087
+ # # │ f64 ┆ f64 ┆ f64 ┆ f64 │
1088
+ # # ╞══════╪══════╪══════╪══════╡
1089
+ # # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
1090
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1091
+ # # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
1092
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1093
+ # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1094
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
1095
+ # # │ null ┆ null ┆ null ┆ 99.9 │
1096
+ # # └──────┴──────┴──────┴──────┘
1097
+ def coalesce(exprs)
1098
+ exprs = Utils.selection_to_rbexpr_list(exprs)
1099
+ Utils.wrap_expr(_coalesce_exprs(exprs))
1100
+ end
516
1101
 
517
- # def from_epoch
518
- # end
1102
+ # Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
1103
+ #
1104
+ # Depending on the `unit` provided, this function will return a different dtype:
1105
+ # - unit: "d" returns pl.Date
1106
+ # - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
1107
+ # - unit: "ms" returns pl.Datetime["ms"]
1108
+ # - unit: "us" returns pl.Datetime["us"]
1109
+ # - unit: "ns" returns pl.Datetime["ns"]
1110
+ #
1111
+ # @param column [Object]
1112
+ # Series or expression to parse integers to pl.Datetime.
1113
+ # @param unit [String]
1114
+ # The unit of the timesteps since epoch time.
1115
+ # @param eager [Boolean]
1116
+ # If eager evaluation is `true`, a Series is returned instead of an Expr.
1117
+ #
1118
+ # @return [Object]
1119
+ #
1120
+ # @example
1121
+ # df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
1122
+ # df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
1123
+ # # =>
1124
+ # # shape: (2, 1)
1125
+ # # ┌─────────────────────┐
1126
+ # # │ timestamp │
1127
+ # # │ --- │
1128
+ # # │ datetime[μs] │
1129
+ # # ╞═════════════════════╡
1130
+ # # │ 2022-10-25 07:31:17 │
1131
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1132
+ # # │ 2022-10-25 07:31:39 │
1133
+ # # └─────────────────────┘
1134
+ def from_epoch(column, unit: "s", eager: false)
1135
+ if column.is_a?(String)
1136
+ column = col(column)
1137
+ elsif !column.is_a?(Series) && !column.is_a?(Expr)
1138
+ column = Series.new(column)
1139
+ end
1140
+
1141
+ if unit == "d"
1142
+ expr = column.cast(:date)
1143
+ elsif unit == "s"
1144
+ raise Todo
1145
+ # expr = (column.cast(:i64) * 1_000_000).cast(Datetime("us"))
1146
+ elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1147
+ raise Todo
1148
+ # expr = column.cast(Datetime(unit))
1149
+ else
1150
+ raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1151
+ end
1152
+
1153
+ if eager
1154
+ if !column.is_a?(Series)
1155
+ raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
1156
+ else
1157
+ column.to_frame.select(expr).to_series
1158
+ end
1159
+ else
1160
+ expr
1161
+ end
1162
+ end
519
1163
 
520
1164
  # Start a "when, then, otherwise" expression.
521
1165
  #