polars-df 0.7.0-x86_64-darwin → 0.9.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/Cargo.lock +353 -237
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +4014 -3495
- data/LICENSE.txt +1 -1
- data/README.md +2 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +449 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +248 -108
- data/lib/polars/data_types.rb +195 -29
- data/lib/polars/date_time_expr.rb +41 -24
- data/lib/polars/date_time_name_space.rb +12 -12
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +1080 -195
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +27 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +3 -3
- data/lib/polars/io.rb +21 -28
- data/lib/polars/lazy_frame.rb +390 -76
- data/lib/polars/list_expr.rb +152 -6
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +557 -59
- data/lib/polars/sql_context.rb +1 -1
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +412 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_expr.rb +1 -1
- data/lib/polars/struct_name_space.rb +1 -1
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +64 -20
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -2
- metadata +36 -7
- data/lib/polars/lazy_functions.rb +0 -1197
data/lib/polars/lazy_frame.rb
CHANGED
@@ -218,7 +218,7 @@ module Polars
     # }
     # ).lazy
     # lf.dtypes
-    # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
+    # # => [Polars::Int64, Polars::Float64, Polars::String]
     def dtypes
       _ldf.dtypes
     end
@@ -236,7 +236,7 @@ module Polars
     # }
     # ).lazy
     # lf.schema
-    # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
+    # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
     def schema
       _ldf.schema
     end
@@ -308,7 +308,7 @@ module Polars
     # end
     #
     # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
-    # df.pipe(cast_str_to_int, col_name: "b").collect
+    # df.pipe(cast_str_to_int, col_name: "b").collect
     # # =>
     # # shape: (4, 2)
     # # ┌─────┬─────┐
@@ -342,6 +342,7 @@ module Polars
       simplify_expression: true,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       ldf = _ldf.optimization_toggle(
@@ -351,6 +352,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -399,7 +401,7 @@ module Polars
     # # │ 1 ┆ 6.0 ┆ a │
     # # └─────┴─────┴─────┘
     def sort(by, reverse: false, nulls_last: false, maintain_order: false)
-      if by.is_a?(String)
+      if by.is_a?(::String)
        return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
      end
      if Utils.bool?(reverse)
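A note on the recurring `String` → `::String` change in this file: 0.9.0 ships a `Polars::String` data type (the `Utf8` rename visible in the dtypes/schema hunks above), so inside the `Polars` namespace an unqualified `String` would presumably resolve to the data type rather than Ruby's core class, breaking these `is_a?` checks. A minimal sketch of that constant-lookup behavior, using a hypothetical `Demo` module rather than the gem's actual code:

```ruby
module Demo
  # Stand-in for Polars::String; it shadows Ruby's ::String inside this namespace.
  class String; end

  def self.string_arg?(value)
    # Bare String resolves lexically to Demo::String; ::String is the core class.
    [value.is_a?(String), value.is_a?(::String)]
  end
end

p Demo.string_arg?("ham")  # => [false, true]
```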
@@ -469,6 +471,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false,
       _eager: false
     )
@@ -477,6 +480,7 @@ module Polars
         projection_pushdown = false
         slice_pushdown = false
         common_subplan_elimination = false
+        comm_subexpr_elim = false
       end

       if allow_streaming
@@ -490,6 +494,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         _eager
       )
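These `comm_subexpr_elim` additions thread a common-subexpression-elimination toggle through `optimization_toggle`, next to the existing `common_subplan_elimination` flag; the `_eager:` keyword in this signature suggests it belongs to `collect`, but that attribution is an assumption. A hedged sketch of a query that benefits, treating the new keyword as an assumed `collect` option:

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]}).lazy

# The sub-expression (a + b) appears twice; with common subexpression
# elimination enabled it can be computed once and reused.
shared = Polars.col("a") + Polars.col("b")
query = lf.select((shared * 2).alias("x"), (shared + 1).alias("y"))

query.collect                            # new flag defaults to true
query.collect(comm_subexpr_elim: false)  # assumed opt-out keyword
```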
@@ -559,6 +564,268 @@ module Polars
       simplify_expression: true,
       no_optimization: false,
       slice_pushdown: true
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an IPC file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ipc("out.arrow")
+    def sink_ipc(
+      path,
+      compression: "zstd",
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_ipc(
+        path,
+        compression,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to a CSV file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param include_bom [Boolean]
+    #   Whether to include UTF-8 BOM in the CSV output.
+    # @param include_header [Boolean]
+    #   Whether to include header in the CSV output.
+    # @param separator [String]
+    #   Separate CSV fields with this symbol.
+    # @param line_terminator [String]
+    #   String used to end each row.
+    # @param quote_char [String]
+    #   Byte to use as quoting character.
+    # @param batch_size [Integer]
+    #   Number of rows that will be processed per thread.
+    # @param datetime_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate. If no format specified, the default fractional-second
+    #   precision is inferred from the maximum timeunit found in the frame's
+    #   Datetime cols (if any).
+    # @param date_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param time_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param float_precision [Integer]
+    #   Number of decimal places to write, applied to both `Float32` and
+    #   `Float64` datatypes.
+    # @param null_value [String]
+    #   A string representing null values (defaulting to the empty string).
+    # @param quote_style ["necessary", "always", "non_numeric", "never"]
+    #   Determines the quoting strategy used.
+    #
+    #   - necessary (default): This puts quotes around fields only when necessary.
+    #     They are necessary when fields contain a quote,
+    #     delimiter or record terminator.
+    #     Quotes are also necessary when writing an empty record
+    #     (which is indistinguishable from a record with one empty field).
+    #     This is the default.
+    #   - always: This puts quotes around every field. Always.
+    #   - never: This never puts quotes around fields, even if that results in
+    #     invalid CSV data (e.g.: by not quoting strings containing the
+    #     separator).
+    #   - non_numeric: This puts quotes around all fields that are non-numeric.
+    #     Namely, when writing a field that does not parse as a valid float
+    #     or integer, then quotes will be used even if they aren't strictly
+    #     necessary.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_csv("out.csv")
+    def sink_csv(
+      path,
+      include_bom: false,
+      include_header: true,
+      separator: ",",
+      line_terminator: "\n",
+      quote_char: '"',
+      batch_size: 1024,
+      datetime_format: nil,
+      date_format: nil,
+      time_format: nil,
+      float_precision: nil,
+      null_value: nil,
+      quote_style: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      Utils._check_arg_is_1byte("separator", separator, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_csv(
+        path,
+        include_bom,
+        include_header,
+        separator.ord,
+        line_terminator,
+        quote_char.ord,
+        batch_size,
+        datetime_format,
+        date_format,
+        time_format,
+        float_precision,
+        null_value,
+        quote_style,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an NDJSON file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ndjson("out.ndjson")
+    def sink_ndjson(
+      path,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_json(path, maintain_order)
+    end
+
+    # @private
+    def _set_sink_optimizations(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
     )
       if no_optimization
         predicate_pushdown = false
@@ -566,25 +833,17 @@ module Polars
         slice_pushdown = false
       end

-      lf = _ldf.optimization_toggle(
+      _ldf.optimization_toggle(
         type_coercion,
         predicate_pushdown,
         projection_pushdown,
         simplify_expression,
         slice_pushdown,
         false,
+        false,
         true,
         false
       )
-      lf.sink_parquet(
-        path,
-        compression,
-        compression_level,
-        statistics,
-        row_group_size,
-        data_pagesize_limit,
-        maintain_order
-      )
     end

     # Collect a small number of rows for debugging purposes.
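Net effect of the two hunks above: the optimization plumbing that `sink_parquet` used to do inline moves into the private `_set_sink_optimizations` helper, and `sink_ipc`, `sink_csv` and `sink_ndjson` are added on top of it. A hedged usage sketch based on the docstring examples (paths are placeholders):

```ruby
require "polars-df"

# Stream a larger-than-RAM scan straight to disk in several formats.
lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")

lf.sink_parquet("out.parquet")
lf.sink_ipc("out.arrow", compression: "zstd")
lf.sink_csv("out.csv", separator: ";", quote_style: "non_numeric")
lf.sink_ndjson("out.ndjson", maintain_order: false)
```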
@@ -650,6 +909,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       if no_optimization
@@ -666,6 +926,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -699,6 +960,10 @@ module Polars
       _from_rbldf(_ldf.cache)
     end

+    # TODO
+    # def cast
+    # end
+
     # Create an empty copy of the current LazyFrame.
     #
     # The copy has an identical schema but no data.
@@ -706,14 +971,14 @@ module Polars
     # @return [LazyFrame]
     #
     # @example
-    #
+    # lf = Polars::LazyFrame.new(
     # {
     # "a" => [nil, 2, 3, 4],
     # "b" => [0.5, nil, 2.5, 13],
     # "c" => [true, true, false, nil],
     # }
     # ).lazy
-    #
+    # lf.clear.fetch
     # # =>
     # # shape: (0, 3)
     # # ┌─────┬─────┬──────┐
@@ -722,9 +987,23 @@ module Polars
     # # │ i64 ┆ f64 ┆ bool │
     # # ╞═════╪═════╪══════╡
     # # └─────┴─────┴──────┘
-
-
-
+    #
+    # @example
+    # lf.clear(2).fetch
+    # # =>
+    # # shape: (2, 3)
+    # # ┌──────┬──────┬──────┐
+    # # │ a ┆ b ┆ c │
+    # # │ --- ┆ --- ┆ --- │
+    # # │ i64 ┆ f64 ┆ bool │
+    # # ╞══════╪══════╪══════╡
+    # # │ null ┆ null ┆ null │
+    # # │ null ┆ null ┆ null │
+    # # └──────┴──────┴──────┘
+    def clear(n = 0)
+      DataFrame.new(columns: schema).clear(n).lazy
+    end
+    alias_method :cleared, :clear

     # Filter the rows in the DataFrame based on a predicate expression.
     #
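The old zero-row copy helper becomes `clear(n = 0)` with `cleared` kept as an alias, and it can now also produce an n-row frame of nulls. A small sketch mirroring the docstring:

```ruby
require "polars-df"

lf = Polars::DataFrame.new(
  {"a" => [nil, 2, 3, 4], "b" => [0.5, nil, 2.5, 13], "c" => [true, true, false, nil]}
).lazy

lf.clear.fetch     # same schema, zero rows
lf.clear(2).fetch  # same schema, two all-null rows
lf.cleared.fetch   # old name still works via alias_method
```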
@@ -774,8 +1053,13 @@ module Polars

     # Select columns from this DataFrame.
     #
-    # @param exprs [
-    # Column
+    # @param exprs [Array]
+    #   Column(s) to select, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names,
+    #   other non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to select, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [LazyFrame]
     #
@@ -855,9 +1139,13 @@ module Polars
     # # │ 0 │
     # # │ 10 │
     # # └─────────┘
-    def select(exprs)
-
-
+    def select(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+      rbexprs = Utils.parse_as_list_of_expressions(
+        *exprs, **named_exprs, __structify: structify
+      )
+      _from_rbldf(_ldf.select(rbexprs))
     end

     # Start a group by operation.
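`select` switches from a single array argument to positional expressions plus keyword arguments, parsed through `Utils.parse_as_list_of_expressions`; strings become column names, keywords become result-column names, and struct auto-nesting sits behind the `POLARS_AUTO_STRUCTIFY` environment variable. A hedged sketch of the new call styles:

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]}).lazy

# Positional input: strings are column names, expressions pass through.
lf.select("foo", Polars.col("bar") * 2).collect

# Keyword input: the result column is named after the keyword.
lf.select(total: Polars.col("foo") + Polars.col("bar")).collect
```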
@@ -967,7 +1255,7 @@ module Polars
     # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
     # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     # )
-    # df.
+    # df.rolling(index_column: "dt", period: "2d").agg(
     # [
     # Polars.sum("a").alias("sum_a"),
     # Polars.min("a").alias("min_a"),
@@ -988,7 +1276,7 @@ module Polars
     # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
     # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
     # # └─────────────────────┴───────┴───────┴───────┘
-    def
+    def rolling(
       index_column:,
       period:,
       offset: nil,
@@ -1005,12 +1293,13 @@ module Polars
       period = Utils._timedelta_to_pl_duration(period)
       offset = Utils._timedelta_to_pl_duration(offset)

-      lgb = _ldf.
+      lgb = _ldf.rolling(
         index_column, period, offset, closed, rbexprs_by, check_sorted
       )
       LazyGroupBy.new(lgb)
     end
-    alias_method :
+    alias_method :group_by_rolling, :rolling
+    alias_method :groupby_rolling, :rolling

     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
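The rolling group-by is now called `rolling`, with `group_by_rolling` and `groupby_rolling` retained as aliases. A hedged sketch adapted from the docstring example above:

```ruby
require "polars-df"

dates = ["2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-02 18:12:48", "2020-01-03 19:45:32"]
df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9]}).with_column(
  Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
)

# New name and retained alias build the same plan.
df.rolling(index_column: "dt", period: "2d").agg(Polars.sum("a").alias("sum_a")).collect
df.groupby_rolling(index_column: "dt", period: "2d").agg(Polars.sum("a").alias("sum_a")).collect
```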
@@ -1371,7 +1660,7 @@ module Polars
         raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
       end

-      if on.is_a?(String)
+      if on.is_a?(::String)
         left_on = on
         right_on = on
       end
@@ -1380,19 +1669,19 @@ module Polars
         raise ArgumentError, "You should pass the column to join on as an argument."
       end

-      if by_left.is_a?(String) || by_left.is_a?(Expr)
+      if by_left.is_a?(::String) || by_left.is_a?(Expr)
         by_left_ = [by_left]
       else
         by_left_ = by_left
       end

-      if by_right.is_a?(String) || by_right.is_a?(Expr)
+      if by_right.is_a?(::String) || by_right.is_a?(Expr)
         by_right_ = [by_right]
       else
         by_right_ = by_right
       end

-      if by.is_a?(String)
+      if by.is_a?(::String)
         by_left_ = [by]
         by_right_ = [by]
       elsif by.is_a?(::Array)
@@ -1402,7 +1691,7 @@ module Polars

       tolerance_str = nil
       tolerance_num = nil
-      if tolerance.is_a?(String)
+      if tolerance.is_a?(::String)
         tolerance_str = tolerance
       else
         tolerance_num = tolerance
@@ -1440,6 +1729,8 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param join_nulls [Boolean]
+    #   Join on null values. By default null values will never produce matches.
     # @param allow_parallel [Boolean]
     #   Allow the physical plan to optionally evaluate the computation of both
     #   DataFrames up to the join in parallel.
@@ -1478,17 +1769,17 @@ module Polars
     # @example
     # df.join(other_df, on: "ham", how: "outer").collect
     # # =>
-    # # shape: (4,
-    # #
-    # # │ foo ┆ bar ┆ ham
-    # # │ --- ┆ --- ┆ --- ┆ ---
-    # # │ i64 ┆ f64 ┆ str ┆ str
-    # #
-    # # │ 1 ┆ 6.0 ┆ a
-    # # │ 2 ┆ 7.0 ┆ b
-    # # │ null ┆ null ┆
-    # # │ 3 ┆ 8.0 ┆ c
-    # #
+    # # shape: (4, 5)
+    # # ┌──────┬──────┬──────┬───────┬───────────┐
+    # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
+    # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+    # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
+    # # ╞══════╪══════╪══════╪═══════╪═══════════╡
+    # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
+    # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
+    # # │ null ┆ null ┆ null ┆ z ┆ d │
+    # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
+    # # └──────┴──────┴──────┴───────┴───────────┘
     #
     # @example
     # df.join(other_df, on: "ham", how: "left").collect
@@ -1535,6 +1826,7 @@ module Polars
       on: nil,
       how: "inner",
       suffix: "_right",
+      join_nulls: false,
       allow_parallel: true,
       force_parallel: false
     )
@@ -1568,6 +1860,7 @@ module Polars
         rbexprs_right,
         allow_parallel,
         force_parallel,
+        join_nulls,
         how,
         suffix,
       )
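`join` gains a `join_nulls:` flag (default `false`), passed through to the underlying join; as documented above, null keys only match each other when it is enabled. A hedged sketch:

```ruby
require "polars-df"

left = Polars::DataFrame.new({"id" => [1, nil, 3], "l" => ["a", "b", "c"]}).lazy
right = Polars::DataFrame.new({"id" => [1, nil, 4], "r" => ["x", "y", "z"]}).lazy

left.join(right, on: "id").collect                    # default: null ids never match
left.join(right, on: "id", join_nulls: true).collect  # null ids now join to null ids
```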
@@ -1608,27 +1901,9 @@ module Polars
     # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
     # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
     # # └─────┴──────┴───────┴──────┴──────┴───────┘
-    def with_columns(exprs)
-
-
-          []
-        elsif exprs.is_a?(Expr)
-          [exprs]
-        else
-          exprs.to_a
-        end
-
-      rbexprs = []
-      exprs.each do |e|
-        case e
-        when Expr
-          rbexprs << e._rbexpr
-        when Series
-          rbexprs << Utils.lit(e)._rbexpr
-        else
-          raise ArgumentError, "Expected an expression, got #{e}"
-        end
-      end
+    def with_columns(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+      rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)

       _from_rbldf(_ldf.with_columns(rbexprs))
     end
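`with_columns` drops the hand-rolled Expr/Series array handling and, like `select`, now accepts positional expressions plus keyword arguments via `Utils.parse_as_list_of_expressions`. A hedged sketch:

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"a" => [1, 2], "b" => [0.5, 4.0]}).lazy

lf.with_columns(
  (Polars.col("a") * 2).alias("a_doubled"),
  b_plus_one: Polars.col("b") + 1
).collect
```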
@@ -1722,10 +1997,10 @@ module Polars
     #
     # @return [LazyFrame]
     def drop(columns)
-      if columns.is_a?(String)
+      if columns.is_a?(::String)
         columns = [columns]
       end
-      _from_rbldf(_ldf.
+      _from_rbldf(_ldf.drop(columns))
     end

     # Rename column names.
@@ -1955,7 +2230,7 @@ module Polars
     # "b" => [2, 4, 6]
     # }
     # ).lazy
-    # df.
+    # df.with_row_index.collect
     # # =>
     # # shape: (3, 3)
     # # ┌────────┬─────┬─────┐
@@ -1967,9 +2242,10 @@ module Polars
     # # │ 1 ┆ 3 ┆ 4 │
     # # │ 2 ┆ 5 ┆ 6 │
     # # └────────┴─────┴─────┘
-    def
-      _from_rbldf(_ldf.
+    def with_row_index(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_index(name, offset))
     end
+    alias_method :with_row_count, :with_row_index

     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
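`with_row_count` is renamed to `with_row_index` (old name kept as an alias); the default column name stays `"row_nr"` with a configurable starting offset. A hedged sketch:

```ruby
require "polars-df"

lf = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy

lf.with_row_index.collect                          # adds "row_nr" starting at 0
lf.with_row_index(name: "idx", offset: 1).collect  # custom name and offset
lf.with_row_count.collect                          # alias for backwards compatibility
```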
@@ -2363,10 +2639,10 @@ module Polars
     # # │ z ┆ c ┆ 6 │
     # # └─────┴──────────┴───────┘
     def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
-      if value_vars.is_a?(String)
+      if value_vars.is_a?(::String)
         value_vars = [value_vars]
       end
-      if id_vars.is_a?(String)
+      if id_vars.is_a?(::String)
         id_vars = [id_vars]
       end
       if value_vars.nil?
@@ -2464,15 +2740,53 @@ module Polars
     # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
     # # └────────┴─────┴─────┴──────┴───────────┴───────┘
     def unnest(names)
-      if names.is_a?(String)
+      if names.is_a?(::String)
         names = [names]
       end
       _from_rbldf(_ldf.unnest(names))
     end

-    #
-    #
-    #
+    # Take two sorted DataFrames and merge them by the sorted key.
+    #
+    # The output of this operation will also be sorted.
+    # It is the callers responsibility that the frames are sorted
+    # by that key otherwise the output will not make sense.
+    #
+    # The schemas of both LazyFrames must be equal.
+    #
+    # @param other [DataFrame]
+    #   Other DataFrame that must be merged
+    # @param key [String]
+    #   Key that is sorted.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df0 = Polars::LazyFrame.new(
+    #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+    #   ).sort("age")
+    #   df1 = Polars::LazyFrame.new(
+    #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+    #   ).sort("age")
+    #   df0.merge_sorted(df1, "age").collect
+    #   # =>
+    #   # shape: (7, 2)
+    #   # ┌────────┬─────┐
+    #   # │ name ┆ age │
+    #   # │ --- ┆ --- │
+    #   # │ str ┆ i64 │
+    #   # ╞════════╪═════╡
+    #   # │ bob ┆ 18 │
+    #   # │ thomas ┆ 20 │
+    #   # │ anna ┆ 21 │
+    #   # │ megan ┆ 33 │
+    #   # │ steve ┆ 42 │
+    #   # │ steve ┆ 42 │
+    #   # │ elise ┆ 44 │
+    #   # └────────┴─────┘
+    def merge_sorted(other, key)
+      _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+    end

     # Indicate that one or multiple columns are sorted.
     #
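`merge_sorted` is new in this release: it merges two frames that share a schema and are already sorted on the same key, yielding a sorted result without a full re-sort. A hedged sketch stressing the pre-sort requirement:

```ruby
require "polars-df"

# Both inputs must have identical schemas and be sorted on the merge key.
a = Polars::LazyFrame.new({"ts" => [1, 4, 9], "v" => ["a", "b", "c"]}).sort("ts")
b = Polars::LazyFrame.new({"ts" => [2, 3, 10], "v" => ["x", "y", "z"]}).sort("ts")

a.merge_sorted(b, "ts").collect  # rows interleaved in ascending "ts" order
```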