polars-df 0.8.0 → 0.9.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -1
- data/Cargo.lock +107 -59
- data/Cargo.toml +0 -3
- data/LICENSE.txt +1 -1
- data/README.md +2 -2
- data/ext/polars/Cargo.toml +15 -7
- data/ext/polars/src/batched_csv.rs +4 -4
- data/ext/polars/src/conversion/anyvalue.rs +185 -0
- data/ext/polars/src/conversion/chunked_array.rs +140 -0
- data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
- data/ext/polars/src/dataframe.rs +69 -53
- data/ext/polars/src/expr/array.rs +74 -0
- data/ext/polars/src/expr/datetime.rs +22 -56
- data/ext/polars/src/expr/general.rs +61 -33
- data/ext/polars/src/expr/list.rs +52 -4
- data/ext/polars/src/expr/meta.rs +48 -0
- data/ext/polars/src/expr/rolling.rs +1 -0
- data/ext/polars/src/expr/string.rs +59 -8
- data/ext/polars/src/expr/struct.rs +8 -4
- data/ext/polars/src/functions/aggregation.rs +6 -0
- data/ext/polars/src/functions/lazy.rs +103 -48
- data/ext/polars/src/functions/meta.rs +45 -1
- data/ext/polars/src/functions/string_cache.rs +14 -0
- data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
- data/ext/polars/src/lib.rs +226 -168
- data/ext/polars/src/series/aggregation.rs +20 -0
- data/ext/polars/src/series/mod.rs +25 -4
- data/lib/polars/array_expr.rb +449 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +179 -43
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +31 -14
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +866 -186
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +27 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +18 -25
- data/lib/polars/lazy_frame.rb +367 -53
- data/lib/polars/list_expr.rb +152 -6
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +273 -34
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +412 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +52 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -2
- metadata +35 -5
- data/lib/polars/lazy_functions.rb +0 -1181
data/lib/polars/lazy_frame.rb
CHANGED
@@ -308,7 +308,7 @@ module Polars
     #   end
     #
     #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
-    #   df.pipe(cast_str_to_int, col_name: "b").collect
+    #   df.pipe(cast_str_to_int, col_name: "b").collect
     #   # =>
     #   # shape: (4, 2)
     #   # ┌─────┬─────┐
@@ -342,6 +342,7 @@ module Polars
       simplify_expression: true,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       ldf = _ldf.optimization_toggle(
@@ -351,6 +352,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -469,6 +471,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false,
       _eager: false
     )
@@ -477,6 +480,7 @@ module Polars
         projection_pushdown = false
        slice_pushdown = false
         common_subplan_elimination = false
+        comm_subexpr_elim = false
       end

       if allow_streaming
@@ -490,6 +494,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         _eager
       )
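The new `comm_subexpr_elim` toggle (common subexpression elimination) is threaded through the optimization keyword lists above, so it can be switched per query. A rough sketch of how the flag might be used from `collect`; the frame and expressions are illustrative, not taken from the gem's docs:

```ruby
require "polars-df"

lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})

# The same subexpression appears twice; with comm_subexpr_elim enabled (the
# default) the engine can evaluate it once and reuse the result.
shared = Polars.col("a") * Polars.col("b")

with_cse = lf.with_columns((shared + 1).alias("x"), (shared + 2).alias("y")).collect
no_cse = lf.with_columns((shared + 1).alias("x"), (shared + 2).alias("y")).collect(comm_subexpr_elim: false)
```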
@@ -559,6 +564,268 @@ module Polars
       simplify_expression: true,
       no_optimization: false,
       slice_pushdown: true
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an IPC file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ipc("out.arrow")
+    def sink_ipc(
+      path,
+      compression: "zstd",
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_ipc(
+        path,
+        compression,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to a CSV file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param include_bom [Boolean]
+    #   Whether to include UTF-8 BOM in the CSV output.
+    # @param include_header [Boolean]
+    #   Whether to include header in the CSV output.
+    # @param separator [String]
+    #   Separate CSV fields with this symbol.
+    # @param line_terminator [String]
+    #   String used to end each row.
+    # @param quote_char [String]
+    #   Byte to use as quoting character.
+    # @param batch_size [Integer]
+    #   Number of rows that will be processed per thread.
+    # @param datetime_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate. If no format specified, the default fractional-second
+    #   precision is inferred from the maximum timeunit found in the frame's
+    #   Datetime cols (if any).
+    # @param date_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param time_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param float_precision [Integer]
+    #   Number of decimal places to write, applied to both `Float32` and
+    #   `Float64` datatypes.
+    # @param null_value [String]
+    #   A string representing null values (defaulting to the empty string).
+    # @param quote_style ["necessary", "always", "non_numeric", "never"]
+    #   Determines the quoting strategy used.
+    #
+    #   - necessary (default): This puts quotes around fields only when necessary.
+    #     They are necessary when fields contain a quote,
+    #     delimiter or record terminator.
+    #     Quotes are also necessary when writing an empty record
+    #     (which is indistinguishable from a record with one empty field).
+    #     This is the default.
+    #   - always: This puts quotes around every field. Always.
+    #   - never: This never puts quotes around fields, even if that results in
+    #     invalid CSV data (e.g.: by not quoting strings containing the
+    #     separator).
+    #   - non_numeric: This puts quotes around all fields that are non-numeric.
+    #     Namely, when writing a field that does not parse as a valid float
+    #     or integer, then quotes will be used even if they aren't strictly
+    #     necessary.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_csv("out.csv")
+    def sink_csv(
+      path,
+      include_bom: false,
+      include_header: true,
+      separator: ",",
+      line_terminator: "\n",
+      quote_char: '"',
+      batch_size: 1024,
+      datetime_format: nil,
+      date_format: nil,
+      time_format: nil,
+      float_precision: nil,
+      null_value: nil,
+      quote_style: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      Utils._check_arg_is_1byte("separator", separator, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_csv(
+        path,
+        include_bom,
+        include_header,
+        separator.ord,
+        line_terminator,
+        quote_char.ord,
+        batch_size,
+        datetime_format,
+        date_format,
+        time_format,
+        float_precision,
+        null_value,
+        quote_style,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an NDJSON file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ndjson("out.ndjson")
+    def sink_ndjson(
+      path,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_json(path, maintain_order)
+    end
+
+    # @private
+    def _set_sink_optimizations(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
     )
       if no_optimization
         predicate_pushdown = false
@@ -566,25 +833,17 @@ module Polars
         slice_pushdown = false
       end

-      lf = _ldf.optimization_toggle(
+      _ldf.optimization_toggle(
         type_coercion,
         predicate_pushdown,
         projection_pushdown,
         simplify_expression,
         slice_pushdown,
         false,
+        false,
         true,
         false
       )
-      lf.sink_parquet(
-        path,
-        compression,
-        compression_level,
-        statistics,
-        row_group_size,
-        data_pagesize_limit,
-        maintain_order
-      )
     end

     # Collect a small number of rows for debugging purposes.
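Together with the existing `sink_parquet`, the new `sink_ipc`, `sink_csv`, and `sink_ndjson` methods stream a query's result to disk via the shared `_set_sink_optimizations` helper, so results larger than RAM never need to be collected. A hedged usage sketch; the paths and column names below are placeholders:

```ruby
require "polars-df"

lf = Polars
  .scan_csv("events.csv")  # placeholder input path
  .filter(Polars.col("status") == "ok")
  .select(Polars.col("user_id"), Polars.col("amount"))

# Arrow IPC with the default zstd compression shown in the diff.
lf.sink_ipc("events.arrow")

# CSV; separator and quote_char are validated to be single-byte strings.
lf.sink_csv("events_clean.csv", separator: ";")

# Newline-delimited JSON.
lf.sink_ndjson("events.ndjson")
```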
@@ -650,6 +909,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       if no_optimization
@@ -666,6 +926,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -699,6 +960,10 @@ module Polars
       _from_rbldf(_ldf.cache)
     end

+    # TODO
+    # def cast
+    # end
+
     # Create an empty copy of the current LazyFrame.
     #
     # The copy has an identical schema but no data.
@@ -706,14 +971,14 @@ module Polars
     # @return [LazyFrame]
     #
     # @example
-    #
+    #   lf = Polars::LazyFrame.new(
     #     {
     #       "a" => [nil, 2, 3, 4],
     #       "b" => [0.5, nil, 2.5, 13],
     #       "c" => [true, true, false, nil],
     #     }
     #   ).lazy
-    #
+    #   lf.clear.fetch
     #   # =>
     #   # shape: (0, 3)
     #   # ┌─────┬─────┬──────┐
@@ -722,9 +987,23 @@ module Polars
     #   # │ i64 ┆ f64 ┆ bool │
     #   # ╞═════╪═════╪══════╡
     #   # └─────┴─────┴──────┘
-
-
-
+    #
+    # @example
+    #   lf.clear(2).fetch
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌──────┬──────┬──────┐
+    #   # │ a    ┆ b    ┆ c    │
+    #   # │ ---  ┆ ---  ┆ ---  │
+    #   # │ i64  ┆ f64  ┆ bool │
+    #   # ╞══════╪══════╪══════╡
+    #   # │ null ┆ null ┆ null │
+    #   # │ null ┆ null ┆ null │
+    #   # └──────┴──────┴──────┘
+    def clear(n = 0)
+      DataFrame.new(columns: schema).clear(n).lazy
+    end
+    alias_method :cleared, :clear

     # Filter the rows in the DataFrame based on a predicate expression.
     #
@@ -774,8 +1053,13 @@ module Polars

     # Select columns from this DataFrame.
     #
-    # @param exprs [
-    #   Column
+    # @param exprs [Array]
+    #   Column(s) to select, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names,
+    #   other non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to select, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [LazyFrame]
     #
@@ -855,9 +1139,13 @@ module Polars
     #   # │ 0       │
     #   # │ 10      │
     #   # └─────────┘
-    def select(exprs)
-
-
+    def select(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+      rbexprs = Utils.parse_as_list_of_expressions(
+        *exprs, **named_exprs, __structify: structify
+      )
+      _from_rbldf(_ldf.select(rbexprs))
     end

     # Start a group by operation.
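With the new `*exprs, **named_exprs` signature, `select` takes expressions positionally and uses keyword arguments as output column names. A short sketch with illustrative data:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})

# Strings are parsed as column names; other inputs pass through as expressions.
lf.select("a", Polars.col("b") * 2).collect

# Keyword arguments rename the selected expression to the keyword.
lf.select(total: Polars.col("a") + Polars.col("b")).collect
```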
@@ -967,7 +1255,7 @@ module Polars
     #   df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
     #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     #   )
-    #   df.
+    #   df.rolling(index_column: "dt", period: "2d").agg(
     #     [
     #       Polars.sum("a").alias("sum_a"),
     #       Polars.min("a").alias("min_a"),
@@ -988,7 +1276,7 @@ module Polars
     #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
     #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
     #   # └─────────────────────┴───────┴───────┴───────┘
-    def
+    def rolling(
       index_column:,
       period:,
       offset: nil,
@@ -1005,12 +1293,13 @@ module Polars
       period = Utils._timedelta_to_pl_duration(period)
       offset = Utils._timedelta_to_pl_duration(offset)

-      lgb = _ldf.
+      lgb = _ldf.rolling(
         index_column, period, offset, closed, rbexprs_by, check_sorted
       )
       LazyGroupBy.new(lgb)
     end
-    alias_method :
+    alias_method :group_by_rolling, :rolling
+    alias_method :groupby_rolling, :rolling

     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
@@ -1440,6 +1729,8 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param join_nulls [Boolean]
+    #   Join on null values. By default null values will never produce matches.
     # @param allow_parallel [Boolean]
     #   Allow the physical plan to optionally evaluate the computation of both
     #   DataFrames up to the join in parallel.
@@ -1535,6 +1826,7 @@ module Polars
       on: nil,
       how: "inner",
       suffix: "_right",
+      join_nulls: false,
       allow_parallel: true,
       force_parallel: false
     )
@@ -1568,6 +1860,7 @@ module Polars
         rbexprs_right,
         allow_parallel,
         force_parallel,
+        join_nulls,
         how,
         suffix,
       )
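`join` gains a `join_nulls` flag that is passed straight through to the Rust side; by default null keys never match. A minimal sketch with made-up frames:

```ruby
left = Polars::LazyFrame.new({"key" => ["a", nil], "x" => [1, 2]})
right = Polars::LazyFrame.new({"key" => ["a", nil], "y" => [10, 20]})

# Default: the nil keys do not match each other.
left.join(right, on: "key").collect

# join_nulls: true treats the nil keys as equal, producing an extra matched row.
left.join(right, on: "key", join_nulls: true).collect
```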
@@ -1608,27 +1901,9 @@ module Polars
     #   # │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
     #   # │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
     #   # └─────┴──────┴───────┴──────┴──────┴───────┘
-    def with_columns(exprs)
-      exprs =
-        if exprs.nil?
-          []
-        elsif exprs.is_a?(Expr)
-          [exprs]
-        else
-          exprs.to_a
-        end
-
-      rbexprs = []
-      exprs.each do |e|
-        case e
-        when Expr
-          rbexprs << e._rbexpr
-        when Series
-          rbexprs << Utils.lit(e)._rbexpr
-        else
-          raise ArgumentError, "Expected an expression, got #{e}"
-        end
-      end
+    def with_columns(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+      rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)

       _from_rbldf(_ldf.with_columns(rbexprs))
     end
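Like `select`, `with_columns` now accepts positional expressions plus keyword arguments and routes everything through `Utils.parse_as_list_of_expressions`, replacing the old manual Expr/Series case statement. A small sketch with illustrative data:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2], "b" => [0.5, 4.0]})

lf.with_columns(
  (Polars.col("a") * 2).alias("a2"),  # positional expression
  b_plus1: Polars.col("b") + 1        # keyword argument names the new column
).collect
```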
@@ -1725,7 +2000,7 @@ module Polars
       if columns.is_a?(::String)
         columns = [columns]
       end
-      _from_rbldf(_ldf.
+      _from_rbldf(_ldf.drop(columns))
     end

     # Rename column names.
@@ -1955,7 +2230,7 @@ module Polars
     #       "b" => [2, 4, 6]
     #     }
     #   ).lazy
-    #   df.
+    #   df.with_row_index.collect
     #   # =>
     #   # shape: (3, 3)
     #   # ┌────────┬─────┬─────┐
@@ -1967,9 +2242,10 @@ module Polars
     #   # │ 1      ┆ 3   ┆ 4   │
     #   # │ 2      ┆ 5   ┆ 6   │
     #   # └────────┴─────┴─────┘
-    def
-      _from_rbldf(_ldf.
+    def with_row_index(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_index(name, offset))
     end
+    alias_method :with_row_count, :with_row_index

     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
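`with_row_index` replaces the old row-count method and takes a configurable column name and starting offset; `with_row_count` is kept as an alias. A quick sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]})

# Default index column "row_nr", starting at 0.
lf.with_row_index.collect

# Custom name and offset; the old alias still works.
lf.with_row_count(name: "id", offset: 1).collect
```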
@@ -2470,9 +2746,47 @@ module Polars
       _from_rbldf(_ldf.unnest(names))
     end

-    #
-    #
-    #
+    # Take two sorted DataFrames and merge them by the sorted key.
+    #
+    # The output of this operation will also be sorted.
+    # It is the callers responsibility that the frames are sorted
+    # by that key otherwise the output will not make sense.
+    #
+    # The schemas of both LazyFrames must be equal.
+    #
+    # @param other [DataFrame]
+    #   Other DataFrame that must be merged
+    # @param key [String]
+    #   Key that is sorted.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df0 = Polars::LazyFrame.new(
+    #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+    #   ).sort("age")
+    #   df1 = Polars::LazyFrame.new(
+    #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+    #   ).sort("age")
+    #   df0.merge_sorted(df1, "age").collect
+    #   # =>
+    #   # shape: (7, 2)
+    #   # ┌────────┬─────┐
+    #   # │ name   ┆ age │
+    #   # │ ---    ┆ --- │
+    #   # │ str    ┆ i64 │
+    #   # ╞════════╪═════╡
+    #   # │ bob    ┆ 18  │
+    #   # │ thomas ┆ 20  │
+    #   # │ anna   ┆ 21  │
+    #   # │ megan  ┆ 33  │
+    #   # │ steve  ┆ 42  │
+    #   # │ steve  ┆ 42  │
+    #   # │ elise  ┆ 44  │
+    #   # └────────┴─────┘
+    def merge_sorted(other, key)
+      _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+    end

     # Indicate that one or multiple columns are sorted.
     #