polars-df 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.lock +430 -217
- data/Cargo.toml +2 -0
- data/LICENSE.txt +1 -1
- data/README.md +0 -2
- data/ext/polars/Cargo.toml +9 -3
- data/ext/polars/src/apply/dataframe.rs +303 -0
- data/ext/polars/src/apply/mod.rs +253 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +254 -35
- data/ext/polars/src/dataframe.rs +151 -6
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +80 -3
- data/ext/polars/src/lazy/dsl.rs +84 -10
- data/ext/polars/src/lib.rs +180 -8
- data/ext/polars/src/series.rs +328 -10
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1480 -77
- data/lib/polars/data_types.rb +122 -0
- data/lib/polars/date_time_expr.rb +10 -10
- data/lib/polars/date_time_name_space.rb +8 -8
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/expr.rb +262 -12
- data/lib/polars/functions.rb +194 -5
- data/lib/polars/group_by.rb +76 -36
- data/lib/polars/io.rb +19 -3
- data/lib/polars/lazy_frame.rb +798 -25
- data/lib/polars/lazy_functions.rb +569 -30
- data/lib/polars/list_expr.rb +1 -1
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +192 -27
- data/lib/polars/string_expr.rb +6 -5
- data/lib/polars/string_name_space.rb +1 -1
- data/lib/polars/utils.rb +25 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +38 -29
- metadata +11 -4
@@ -8,13 +8,18 @@ module Polars
|
|
8
8
|
name = name.to_a
|
9
9
|
end
|
10
10
|
|
11
|
-
if name.is_a?(
|
11
|
+
if name.is_a?(Class) && name < DataType
|
12
|
+
name = [name]
|
13
|
+
end
|
14
|
+
|
15
|
+
if name.is_a?(DataType)
|
16
|
+
Utils.wrap_expr(_dtype_cols([name]))
|
17
|
+
elsif name.is_a?(Array)
|
12
18
|
if name.length == 0 || name[0].is_a?(String) || name[0].is_a?(Symbol)
|
13
19
|
name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
|
14
20
|
Utils.wrap_expr(RbExpr.cols(name))
|
15
21
|
elsif Utils.is_polars_dtype(name[0])
|
16
|
-
|
17
|
-
# Utils.wrap_expr(_dtype_cols(name))
|
22
|
+
Utils.wrap_expr(_dtype_cols(name))
|
18
23
|
else
|
19
24
|
raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
|
20
25
|
end
|
@@ -158,7 +163,7 @@ module Polars
|
|
158
163
|
# TODO
|
159
164
|
Utils.wrap_expr(_sum_exprs(exprs))
|
160
165
|
else
|
161
|
-
|
166
|
+
fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
|
162
167
|
end
|
163
168
|
end
|
164
169
|
|
@@ -294,8 +299,69 @@ module Polars
|
|
294
299
|
Utils.wrap_expr(RbExpr.lit(value))
|
295
300
|
end
|
296
301
|
|
297
|
-
#
|
298
|
-
#
|
302
|
+
# Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
|
303
|
+
#
|
304
|
+
# @param column [Object]
|
305
|
+
# Column(s) to be used in aggregation.
|
306
|
+
#
|
307
|
+
# @return [Object]
|
308
|
+
#
|
309
|
+
# @example
|
310
|
+
# df = Polars::DataFrame.new(
|
311
|
+
# {
|
312
|
+
# "a" => [1, 2],
|
313
|
+
# "b" => [3, 4],
|
314
|
+
# "c" => [5, 6]
|
315
|
+
# }
|
316
|
+
# )
|
317
|
+
# # =>
|
318
|
+
# # shape: (2, 3)
|
319
|
+
# # ┌─────┬─────┬─────┐
|
320
|
+
# # │ a ┆ b ┆ c │
|
321
|
+
# # │ --- ┆ --- ┆ --- │
|
322
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
323
|
+
# # ╞═════╪═════╪═════╡
|
324
|
+
# # │ 1 ┆ 3 ┆ 5 │
|
325
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
326
|
+
# # │ 2 ┆ 4 ┆ 6 │
|
327
|
+
# # └─────┴─────┴─────┘
|
328
|
+
#
|
329
|
+
# @example Cumulatively sum a column by name:
|
330
|
+
# df.select(Polars.cumsum("a"))
|
331
|
+
# # =>
|
332
|
+
# # shape: (2, 1)
|
333
|
+
# # ┌─────┐
|
334
|
+
# # │ a │
|
335
|
+
# # │ --- │
|
336
|
+
# # │ i64 │
|
337
|
+
# # ╞═════╡
|
338
|
+
# # │ 1 │
|
339
|
+
# # ├╌╌╌╌╌┤
|
340
|
+
# # │ 3 │
|
341
|
+
# # └─────┘
|
342
|
+
#
|
343
|
+
# @example Cumulatively sum a list of columns/expressions horizontally:
|
344
|
+
# df.with_column(Polars.cumsum(["a", "c"]))
|
345
|
+
# # =>
|
346
|
+
# # shape: (2, 4)
|
347
|
+
# # ┌─────┬─────┬─────┬───────────┐
|
348
|
+
# # │ a ┆ b ┆ c ┆ cumsum │
|
349
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
350
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ struct[2] │
|
351
|
+
# # ╞═════╪═════╪═════╪═══════════╡
|
352
|
+
# # │ 1 ┆ 3 ┆ 5 ┆ {1,6} │
|
353
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
354
|
+
# # │ 2 ┆ 4 ┆ 6 ┆ {2,8} │
|
355
|
+
# # └─────┴─────┴─────┴───────────┘
|
356
|
+
def cumsum(column)
|
357
|
+
if column.is_a?(Series)
|
358
|
+
column.cumsum
|
359
|
+
elsif column.is_a?(String)
|
360
|
+
col(column).cumsum
|
361
|
+
else
|
362
|
+
cumfold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("cumsum")
|
363
|
+
end
|
364
|
+
end
|
299
365
|
|
300
366
|
# Compute the spearman rank correlation between two columns.
|
301
367
|
#
|
@@ -367,7 +433,7 @@ module Polars
|
|
367
433
|
# def apply
|
368
434
|
# end
|
369
435
|
|
370
|
-
# Accumulate over multiple columns horizontally/
|
436
|
+
# Accumulate over multiple columns horizontally/row wise with a left fold.
|
371
437
|
#
|
372
438
|
# @return [Expr]
|
373
439
|
def fold(acc, f, exprs)
|
@@ -383,17 +449,118 @@ module Polars
|
|
383
449
|
# def reduce
|
384
450
|
# end
|
385
451
|
|
386
|
-
#
|
387
|
-
#
|
452
|
+
# Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
|
453
|
+
#
|
454
|
+
# Every cumulative result is added as a separate field in a Struct column.
|
455
|
+
#
|
456
|
+
# @param acc [Object]
|
457
|
+
# Accumulator Expression. This is the value that will be initialized when the fold
|
458
|
+
# starts. For a sum this could for instance be lit(0).
|
459
|
+
# @param f [Object]
|
460
|
+
# Function to apply over the accumulator and the value.
|
461
|
+
# Fn(acc, value) -> new_value
|
462
|
+
# @param exprs [Object]
|
463
|
+
# Expressions to aggregate over. May also be a wildcard expression.
|
464
|
+
# @param include_init [Boolean]
|
465
|
+
# Include the initial accumulator state as struct field.
|
466
|
+
#
|
467
|
+
# @return [Object]
|
468
|
+
#
|
469
|
+
# @note
|
470
|
+
# If you simply want the first encountered expression as accumulator,
|
471
|
+
# consider using `cumreduce`.
|
472
|
+
def cumfold(acc, f, exprs, include_init: false)
|
473
|
+
acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
|
474
|
+
if exprs.is_a?(Expr)
|
475
|
+
exprs = [exprs]
|
476
|
+
end
|
477
|
+
|
478
|
+
exprs = Utils.selection_to_rbexpr_list(exprs)
|
479
|
+
Utils.wrap_expr(RbExpr.cumfold(acc._rbexpr, f, exprs, include_init))
|
480
|
+
end
|
388
481
|
|
389
482
|
# def cumreduce
|
390
483
|
# end
|
391
484
|
|
392
|
-
#
|
393
|
-
#
|
485
|
+
# Evaluate columnwise or elementwise with a bitwise OR operation.
|
486
|
+
#
|
487
|
+
# @return [Expr]
|
488
|
+
def any(name)
|
489
|
+
if name.is_a?(String)
|
490
|
+
col(name).any
|
491
|
+
else
|
492
|
+
fold(lit(false), ->(a, b) { a.cast(:bool) | b.cast(:bool) }, name).alias("any")
|
493
|
+
end
|
494
|
+
end
|
394
495
|
|
395
|
-
#
|
396
|
-
#
|
496
|
+
# Exclude certain columns from a wildcard/regex selection.
|
497
|
+
#
|
498
|
+
# @param columns [Object]
|
499
|
+
# Column(s) to exclude from selection.
|
500
|
+
# This can be:
|
501
|
+
#
|
502
|
+
# - a column name, or multiple column names
|
503
|
+
# - a regular expression starting with `^` and ending with `$`
|
504
|
+
# - a dtype or multiple dtypes
|
505
|
+
#
|
506
|
+
# @return [Object]
|
507
|
+
#
|
508
|
+
# @example
|
509
|
+
# df = Polars::DataFrame.new(
|
510
|
+
# {
|
511
|
+
# "aa" => [1, 2, 3],
|
512
|
+
# "ba" => ["a", "b", nil],
|
513
|
+
# "cc" => [nil, 2.5, 1.5]
|
514
|
+
# }
|
515
|
+
# )
|
516
|
+
# # =>
|
517
|
+
# # shape: (3, 3)
|
518
|
+
# # ┌─────┬──────┬──────┐
|
519
|
+
# # │ aa ┆ ba ┆ cc │
|
520
|
+
# # │ --- ┆ --- ┆ --- │
|
521
|
+
# # │ i64 ┆ str ┆ f64 │
|
522
|
+
# # ╞═════╪══════╪══════╡
|
523
|
+
# # │ 1 ┆ a ┆ null │
|
524
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
525
|
+
# # │ 2 ┆ b ┆ 2.5 │
|
526
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
527
|
+
# # │ 3 ┆ null ┆ 1.5 │
|
528
|
+
# # └─────┴──────┴──────┘
|
529
|
+
#
|
530
|
+
# @example Exclude by column name(s):
|
531
|
+
# df.select(Polars.exclude("ba"))
|
532
|
+
# # =>
|
533
|
+
# # shape: (3, 2)
|
534
|
+
# # ┌─────┬──────┐
|
535
|
+
# # │ aa ┆ cc │
|
536
|
+
# # │ --- ┆ --- │
|
537
|
+
# # │ i64 ┆ f64 │
|
538
|
+
# # ╞═════╪══════╡
|
539
|
+
# # │ 1 ┆ null │
|
540
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
541
|
+
# # │ 2 ┆ 2.5 │
|
542
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
543
|
+
# # │ 3 ┆ 1.5 │
|
544
|
+
# # └─────┴──────┘
|
545
|
+
#
|
546
|
+
# @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
|
547
|
+
# df.select(Polars.exclude("^.*a$"))
|
548
|
+
# # =>
|
549
|
+
# # shape: (3, 1)
|
550
|
+
# # ┌──────┐
|
551
|
+
# # │ cc │
|
552
|
+
# # │ --- │
|
553
|
+
# # │ f64 │
|
554
|
+
# # ╞══════╡
|
555
|
+
# # │ null │
|
556
|
+
# # ├╌╌╌╌╌╌┤
|
557
|
+
# # │ 2.5 │
|
558
|
+
# # ├╌╌╌╌╌╌┤
|
559
|
+
# # │ 1.5 │
|
560
|
+
# # └──────┘
|
561
|
+
def exclude(columns)
|
562
|
+
col("*").exclude(columns)
|
563
|
+
end
|
397
564
|
|
398
565
|
# Do one of two things.
|
399
566
|
#
|
@@ -429,11 +596,26 @@ module Polars
|
|
429
596
|
end
|
430
597
|
end
|
431
598
|
|
432
|
-
#
|
433
|
-
#
|
599
|
+
# Syntactic sugar for `Polars.col("foo").agg_groups`.
|
600
|
+
#
|
601
|
+
# @return [Object]
|
602
|
+
def groups(column)
|
603
|
+
col(column).agg_groups
|
604
|
+
end
|
434
605
|
|
435
|
-
#
|
436
|
-
#
|
606
|
+
# Syntactic sugar for `Polars.col("foo").quantile(...)`.
|
607
|
+
#
|
608
|
+
# @param column [String]
|
609
|
+
# Column name.
|
610
|
+
# @param quantile [Float]
|
611
|
+
# Quantile between 0.0 and 1.0.
|
612
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
613
|
+
# Interpolation method.
|
614
|
+
#
|
615
|
+
# @return [Expr]
|
616
|
+
def quantile(column, quantile, interpolation: "nearest")
|
617
|
+
col(column).quantile(quantile, interpolation: interpolation)
|
618
|
+
end
|
437
619
|
|
438
620
|
# Create a range expression (or Series).
|
439
621
|
#
|
@@ -449,7 +631,7 @@ module Polars
|
|
449
631
|
# @param eager [Boolean]
|
450
632
|
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
451
633
|
# @param dtype [Symbol]
|
452
|
-
# Apply an explicit integer dtype to the resulting expression (default is
|
634
|
+
# Apply an explicit integer dtype to the resulting expression (default is `:i64`).
|
453
635
|
#
|
454
636
|
# @return [Expr, Series]
|
455
637
|
#
|
@@ -474,14 +656,212 @@ module Polars
|
|
474
656
|
end
|
475
657
|
end
|
476
658
|
|
477
|
-
#
|
478
|
-
#
|
659
|
+
# Find the indexes that would sort the columns.
|
660
|
+
#
|
661
|
+
# Argsort by multiple columns. The first column will be used for the ordering.
|
662
|
+
# If there are duplicates in the first column, the second column will be used to
|
663
|
+
# determine the ordering and so on.
|
664
|
+
#
|
665
|
+
# @param exprs [Object]
|
666
|
+
# Columns used to determine the ordering.
|
667
|
+
# @param reverse [Boolean]
|
668
|
+
# Default is ascending.
|
669
|
+
#
|
670
|
+
# @return [Expr]
|
671
|
+
def argsort_by(exprs, reverse: false)
|
672
|
+
if !exprs.is_a?(Array)
|
673
|
+
exprs = [exprs]
|
674
|
+
end
|
675
|
+
if reverse == true || reverse == false
|
676
|
+
reverse = [reverse] * exprs.length
|
677
|
+
end
|
678
|
+
exprs = Utils.selection_to_rbexpr_list(exprs)
|
679
|
+
Utils.wrap_expr(RbExpr.argsort_by(exprs, reverse))
|
680
|
+
end
|
479
681
|
|
480
|
-
#
|
481
|
-
#
|
682
|
+
# Create polars `Duration` from distinct time components.
|
683
|
+
#
|
684
|
+
# @return [Expr]
|
685
|
+
#
|
686
|
+
# @example
|
687
|
+
# df = Polars::DataFrame.new(
|
688
|
+
# {
|
689
|
+
# "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
|
690
|
+
# "add" => [1, 2]
|
691
|
+
# }
|
692
|
+
# )
|
693
|
+
# df.select(
|
694
|
+
# [
|
695
|
+
# (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
|
696
|
+
# (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
|
697
|
+
# (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
|
698
|
+
# (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
|
699
|
+
# "add_milliseconds"
|
700
|
+
# ),
|
701
|
+
# (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
|
702
|
+
# ]
|
703
|
+
# )
|
704
|
+
# # =>
|
705
|
+
# # shape: (2, 5)
|
706
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
|
707
|
+
# # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
|
708
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
709
|
+
# # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
|
710
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
|
711
|
+
# # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
|
712
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
713
|
+
# # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
|
714
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
|
715
|
+
def duration(
|
716
|
+
days: nil,
|
717
|
+
seconds: nil,
|
718
|
+
nanoseconds: nil,
|
719
|
+
microseconds: nil,
|
720
|
+
milliseconds: nil,
|
721
|
+
minutes: nil,
|
722
|
+
hours: nil,
|
723
|
+
weeks: nil
|
724
|
+
)
|
725
|
+
if !hours.nil?
|
726
|
+
hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
|
727
|
+
end
|
728
|
+
if !minutes.nil?
|
729
|
+
minutes = Utils.expr_to_lit_or_expr(minutes, str_to_lit: false)._rbexpr
|
730
|
+
end
|
731
|
+
if !seconds.nil?
|
732
|
+
seconds = Utils.expr_to_lit_or_expr(seconds, str_to_lit: false)._rbexpr
|
733
|
+
end
|
734
|
+
if !milliseconds.nil?
|
735
|
+
milliseconds = Utils.expr_to_lit_or_expr(milliseconds, str_to_lit: false)._rbexpr
|
736
|
+
end
|
737
|
+
if !microseconds.nil?
|
738
|
+
microseconds = Utils.expr_to_lit_or_expr(microseconds, str_to_lit: false)._rbexpr
|
739
|
+
end
|
740
|
+
if !nanoseconds.nil?
|
741
|
+
nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
|
742
|
+
end
|
743
|
+
if !days.nil?
|
744
|
+
days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
|
745
|
+
end
|
746
|
+
if !weeks.nil?
|
747
|
+
weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
|
748
|
+
end
|
482
749
|
|
483
|
-
|
484
|
-
|
750
|
+
Utils.wrap_expr(
|
751
|
+
_rb_duration(
|
752
|
+
days,
|
753
|
+
seconds,
|
754
|
+
nanoseconds,
|
755
|
+
microseconds,
|
756
|
+
milliseconds,
|
757
|
+
minutes,
|
758
|
+
hours,
|
759
|
+
weeks
|
760
|
+
)
|
761
|
+
)
|
762
|
+
end
|
763
|
+
|
764
|
+
# Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
|
765
|
+
#
|
766
|
+
# @param exprs [Object]
|
767
|
+
# Columns to concat into a Utf8 Series.
|
768
|
+
# @param sep [String]
|
769
|
+
# String value that will be used to separate the values.
|
770
|
+
#
|
771
|
+
# @return [Expr]
|
772
|
+
#
|
773
|
+
# @example
|
774
|
+
# df = Polars::DataFrame.new(
|
775
|
+
# {
|
776
|
+
# "a" => [1, 2, 3],
|
777
|
+
# "b" => ["dogs", "cats", nil],
|
778
|
+
# "c" => ["play", "swim", "walk"]
|
779
|
+
# }
|
780
|
+
# )
|
781
|
+
# df.with_columns(
|
782
|
+
# [
|
783
|
+
# Polars.concat_str(
|
784
|
+
# [
|
785
|
+
# Polars.col("a") * 2,
|
786
|
+
# Polars.col("b"),
|
787
|
+
# Polars.col("c")
|
788
|
+
# ],
|
789
|
+
# sep: " "
|
790
|
+
# ).alias("full_sentence")
|
791
|
+
# ]
|
792
|
+
# )
|
793
|
+
# # =>
|
794
|
+
# # shape: (3, 4)
|
795
|
+
# # ┌─────┬──────┬──────┬───────────────┐
|
796
|
+
# # │ a ┆ b ┆ c ┆ full_sentence │
|
797
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
798
|
+
# # │ i64 ┆ str ┆ str ┆ str │
|
799
|
+
# # ╞═════╪══════╪══════╪═══════════════╡
|
800
|
+
# # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
|
801
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
802
|
+
# # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
|
803
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
804
|
+
# # │ 3 ┆ null ┆ walk ┆ null │
|
805
|
+
# # └─────┴──────┴──────┴───────────────┘
|
806
|
+
def concat_str(exprs, sep: "")
|
807
|
+
exprs = Utils.selection_to_rbexpr_list(exprs)
|
808
|
+
return Utils.wrap_expr(RbExpr.concat_str(exprs, sep))
|
809
|
+
end
|
810
|
+
|
811
|
+
# Format expressions as a string.
|
812
|
+
#
|
813
|
+
# @param fstring [String]
|
814
|
+
# A string with placeholders.
|
815
|
+
# For example: "hello_{}" or "{}_world".
|
816
|
+
# @param args [Object]
|
817
|
+
# Expression(s) that fill the placeholders
|
818
|
+
#
|
819
|
+
# @return [Expr]
|
820
|
+
#
|
821
|
+
# @example
|
822
|
+
# df = Polars::DataFrame.new(
|
823
|
+
# {
|
824
|
+
# "a": ["a", "b", "c"],
|
825
|
+
# "b": [1, 2, 3]
|
826
|
+
# }
|
827
|
+
# )
|
828
|
+
# df.select(
|
829
|
+
# [
|
830
|
+
# Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
|
831
|
+
# ]
|
832
|
+
# )
|
833
|
+
# # =>
|
834
|
+
# # shape: (3, 1)
|
835
|
+
# # ┌─────────────┐
|
836
|
+
# # │ fmt │
|
837
|
+
# # │ --- │
|
838
|
+
# # │ str │
|
839
|
+
# # ╞═════════════╡
|
840
|
+
# # │ foo_a_bar_1 │
|
841
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
842
|
+
# # │ foo_b_bar_2 │
|
843
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
844
|
+
# # │ foo_c_bar_3 │
|
845
|
+
# # └─────────────┘
|
846
|
+
def format(fstring, *args)
|
847
|
+
if fstring.scan("{}").length != args.length
|
848
|
+
raise ArgumentError, "number of placeholders should equal the number of arguments"
|
849
|
+
end
|
850
|
+
|
851
|
+
exprs = []
|
852
|
+
|
853
|
+
arguments = args.each
|
854
|
+
fstring.split(/(\{\})/).each do |s|
|
855
|
+
if s == "{}"
|
856
|
+
e = Utils.expr_to_lit_or_expr(arguments.next, str_to_lit: false)
|
857
|
+
exprs << e
|
858
|
+
elsif s.length > 0
|
859
|
+
exprs << lit(s)
|
860
|
+
end
|
861
|
+
end
|
862
|
+
|
863
|
+
concat_str(exprs, sep: "")
|
864
|
+
end
|
485
865
|
|
486
866
|
# Concat the arrays in a Series dtype List in linear time.
|
487
867
|
#
|
@@ -491,8 +871,73 @@ module Polars
|
|
491
871
|
Utils.wrap_expr(RbExpr.concat_lst(exprs))
|
492
872
|
end
|
493
873
|
|
494
|
-
#
|
495
|
-
#
|
874
|
+
# Collect multiple LazyFrames at the same time.
|
875
|
+
#
|
876
|
+
# This runs all the computation graphs in parallel on the Polars thread pool.
|
877
|
+
#
|
878
|
+
# @param lazy_frames [Array]
|
879
|
+
# A list of LazyFrames to collect.
|
880
|
+
# @param type_coercion [Boolean]
|
881
|
+
# Do type coercion optimization.
|
882
|
+
# @param predicate_pushdown [Boolean]
|
883
|
+
# Do predicate pushdown optimization.
|
884
|
+
# @param projection_pushdown [Boolean]
|
885
|
+
# Do projection pushdown optimization.
|
886
|
+
# @param simplify_expression [Boolean]
|
887
|
+
# Run simplify expressions optimization.
|
888
|
+
# @param string_cache [Boolean]
|
889
|
+
# This argument is deprecated and will be ignored.
|
890
|
+
# @param no_optimization [Boolean]
|
891
|
+
# Turn off optimizations.
|
892
|
+
# @param slice_pushdown [Boolean]
|
893
|
+
# Slice pushdown optimization.
|
894
|
+
# @param common_subplan_elimination [Boolean]
|
895
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
896
|
+
# @param allow_streaming [Boolean]
|
897
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
898
|
+
#
|
899
|
+
# @return [Array]
|
900
|
+
def collect_all(
|
901
|
+
lazy_frames,
|
902
|
+
type_coercion: true,
|
903
|
+
predicate_pushdown: true,
|
904
|
+
projection_pushdown: true,
|
905
|
+
simplify_expression: true,
|
906
|
+
string_cache: false,
|
907
|
+
no_optimization: false,
|
908
|
+
slice_pushdown: true,
|
909
|
+
common_subplan_elimination: true,
|
910
|
+
allow_streaming: false
|
911
|
+
)
|
912
|
+
if no_optimization
|
913
|
+
predicate_pushdown = false
|
914
|
+
projection_pushdown = false
|
915
|
+
slice_pushdown = false
|
916
|
+
common_subplan_elimination = false
|
917
|
+
end
|
918
|
+
|
919
|
+
prepared = []
|
920
|
+
|
921
|
+
lazy_frames.each do |lf|
|
922
|
+
ldf = lf._ldf.optimization_toggle(
|
923
|
+
type_coercion,
|
924
|
+
predicate_pushdown,
|
925
|
+
projection_pushdown,
|
926
|
+
simplify_expression,
|
927
|
+
slice_pushdown,
|
928
|
+
common_subplan_elimination,
|
929
|
+
allow_streaming
|
930
|
+
)
|
931
|
+
prepared << ldf
|
932
|
+
end
|
933
|
+
|
934
|
+
out = _collect_all(prepared)
|
935
|
+
|
936
|
+
# wrap the rbdataframes into dataframe
|
937
|
+
result = out.map { |rbdf| Utils.wrap_df(rbdf) }
|
938
|
+
|
939
|
+
result
|
940
|
+
end
|
496
941
|
|
497
942
|
# Run polars expressions without a context.
|
498
943
|
#
|
@@ -621,11 +1066,105 @@ module Polars
|
|
621
1066
|
end
|
622
1067
|
end
|
623
1068
|
|
624
|
-
#
|
625
|
-
#
|
1069
|
+
# Folds the expressions from left to right, keeping the first non-null value.
|
1070
|
+
#
|
1071
|
+
# @param exprs [Object]
|
1072
|
+
# Expressions to coalesce.
|
1073
|
+
#
|
1074
|
+
# @return [Expr]
|
1075
|
+
#
|
1076
|
+
# @example
|
1077
|
+
# df = Polars::DataFrame.new(
|
1078
|
+
# [
|
1079
|
+
# [nil, 1.0, 1.0],
|
1080
|
+
# [nil, 2.0, 2.0],
|
1081
|
+
# [nil, nil, 3.0],
|
1082
|
+
# [nil, nil, nil]
|
1083
|
+
# ],
|
1084
|
+
# columns: [["a", :f64], ["b", :f64], ["c", :f64]]
|
1085
|
+
# )
|
1086
|
+
# df.with_column(Polars.coalesce(["a", "b", "c", 99.9]).alias("d"))
|
1087
|
+
# # =>
|
1088
|
+
# # shape: (4, 4)
|
1089
|
+
# # ┌──────┬──────┬──────┬──────┐
|
1090
|
+
# # │ a ┆ b ┆ c ┆ d │
|
1091
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1092
|
+
# # │ f64 ┆ f64 ┆ f64 ┆ f64 │
|
1093
|
+
# # ╞══════╪══════╪══════╪══════╡
|
1094
|
+
# # │ null ┆ 1.0 ┆ 1.0 ┆ 1.0 │
|
1095
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1096
|
+
# # │ null ┆ 2.0 ┆ 2.0 ┆ 2.0 │
|
1097
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1098
|
+
# # │ null ┆ null ┆ 3.0 ┆ 3.0 │
|
1099
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1100
|
+
# # │ null ┆ null ┆ null ┆ 99.9 │
|
1101
|
+
# # └──────┴──────┴──────┴──────┘
|
1102
|
+
def coalesce(exprs)
|
1103
|
+
exprs = Utils.selection_to_rbexpr_list(exprs)
|
1104
|
+
Utils.wrap_expr(_coalesce_exprs(exprs))
|
1105
|
+
end
|
626
1106
|
|
627
|
-
#
|
628
|
-
#
|
1107
|
+
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
|
1108
|
+
#
|
1109
|
+
# Depending on the `unit` provided, this function will return a different dtype:
|
1110
|
+
# - unit: "d" returns pl.Date
|
1111
|
+
# - unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
|
1112
|
+
# - unit: "ms" returns pl.Datetime["ms"]
|
1113
|
+
# - unit: "us" returns pl.Datetime["us"]
|
1114
|
+
# - unit: "ns" returns pl.Datetime["ns"]
|
1115
|
+
#
|
1116
|
+
# @param column [Object]
|
1117
|
+
# Series or expression to parse integers to pl.Datetime.
|
1118
|
+
# @param unit [String]
|
1119
|
+
# The unit of the timesteps since epoch time.
|
1120
|
+
# @param eager [Boolean]
|
1121
|
+
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
1122
|
+
#
|
1123
|
+
# @return [Object]
|
1124
|
+
#
|
1125
|
+
# @example
|
1126
|
+
# df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
|
1127
|
+
# df.select(Polars.from_epoch(Polars.col("timestamp"), unit: "s")).collect
|
1128
|
+
# # =>
|
1129
|
+
# # shape: (2, 1)
|
1130
|
+
# # ┌─────────────────────┐
|
1131
|
+
# # │ timestamp │
|
1132
|
+
# # │ --- │
|
1133
|
+
# # │ datetime[μs] │
|
1134
|
+
# # ╞═════════════════════╡
|
1135
|
+
# # │ 2022-10-25 07:31:17 │
|
1136
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1137
|
+
# # │ 2022-10-25 07:31:39 │
|
1138
|
+
# # └─────────────────────┘
|
1139
|
+
def from_epoch(column, unit: "s", eager: false)
|
1140
|
+
if column.is_a?(String)
|
1141
|
+
column = col(column)
|
1142
|
+
elsif !column.is_a?(Series) && !column.is_a?(Expr)
|
1143
|
+
column = Series.new(column)
|
1144
|
+
end
|
1145
|
+
|
1146
|
+
if unit == "d"
|
1147
|
+
expr = column.cast(:date)
|
1148
|
+
elsif unit == "s"
|
1149
|
+
raise Todo
|
1150
|
+
# expr = (column.cast(:i64) * 1_000_000).cast(Datetime("us"))
|
1151
|
+
elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
|
1152
|
+
raise Todo
|
1153
|
+
# expr = column.cast(Datetime(unit))
|
1154
|
+
else
|
1155
|
+
raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
|
1156
|
+
end
|
1157
|
+
|
1158
|
+
if eager
|
1159
|
+
if !column.is_a?(Series)
|
1160
|
+
raise ArgumentError, "expected Series or Array if eager: true, got #{column.class.name}"
|
1161
|
+
else
|
1162
|
+
column.to_frame.select(expr).to_series
|
1163
|
+
end
|
1164
|
+
else
|
1165
|
+
expr
|
1166
|
+
end
|
1167
|
+
end
|
629
1168
|
|
630
1169
|
# Start a "when, then, otherwise" expression.
|
631
1170
|
#
|
data/lib/polars/list_expr.rb
CHANGED
@@ -568,7 +568,7 @@ module Polars
|
|
568
568
|
# # └────────────┘
|
569
569
|
def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
|
570
570
|
raise Todo if name_generator
|
571
|
-
Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
|
571
|
+
Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator, 0))
|
572
572
|
end
|
573
573
|
|
574
574
|
# Run any polars expression against the lists' elements.
|