polars-df 0.8.0-x86_64-darwin → 0.10.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -1
- data/Cargo.lock +159 -66
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +3112 -1613
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +453 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/batched_csv_reader.rb +4 -2
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +306 -96
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +41 -18
- data/lib/polars/date_time_name_space.rb +9 -3
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +898 -215
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +36 -31
- data/lib/polars/lazy_frame.rb +405 -88
- data/lib/polars/list_expr.rb +158 -8
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +282 -41
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +413 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +106 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +16 -4
- metadata +34 -6
- data/lib/polars/lazy_functions.rb +0 -1181
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/lazy_frame.rb
CHANGED
@@ -49,7 +49,8 @@ module Polars
|
|
49
49
|
row_count_name: nil,
|
50
50
|
row_count_offset: 0,
|
51
51
|
parse_dates: false,
|
52
|
-
eol_char: "\n"
|
52
|
+
eol_char: "\n",
|
53
|
+
truncate_ragged_lines: true
|
53
54
|
)
|
54
55
|
dtype_list = nil
|
55
56
|
if !dtypes.nil?
|
@@ -81,7 +82,8 @@ module Polars
|
|
81
82
|
encoding,
|
82
83
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
83
84
|
parse_dates,
|
84
|
-
eol_char
|
85
|
+
eol_char,
|
86
|
+
truncate_ragged_lines
|
85
87
|
)
|
86
88
|
)
|
87
89
|
end
|
@@ -103,6 +105,7 @@ module Polars
|
|
103
105
|
_from_rbldf(
|
104
106
|
RbLazyFrame.new_from_parquet(
|
105
107
|
file,
|
108
|
+
[],
|
106
109
|
n_rows,
|
107
110
|
cache,
|
108
111
|
parallel,
|
@@ -110,7 +113,8 @@ module Polars
|
|
110
113
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
111
114
|
low_memory,
|
112
115
|
use_statistics,
|
113
|
-
hive_partitioning
|
116
|
+
hive_partitioning,
|
117
|
+
nil
|
114
118
|
)
|
115
119
|
)
|
116
120
|
end
|
@@ -308,7 +312,7 @@ module Polars
|
|
308
312
|
# end
|
309
313
|
#
|
310
314
|
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
|
311
|
-
# df.pipe(cast_str_to_int, col_name: "b").collect
|
315
|
+
# df.pipe(cast_str_to_int, col_name: "b").collect
|
312
316
|
# # =>
|
313
317
|
# # shape: (4, 2)
|
314
318
|
# # ┌─────┬─────┐
|
@@ -342,6 +346,7 @@ module Polars
|
|
342
346
|
simplify_expression: true,
|
343
347
|
slice_pushdown: true,
|
344
348
|
common_subplan_elimination: true,
|
349
|
+
comm_subexpr_elim: true,
|
345
350
|
allow_streaming: false
|
346
351
|
)
|
347
352
|
ldf = _ldf.optimization_toggle(
|
@@ -351,6 +356,7 @@ module Polars
|
|
351
356
|
simplify_expression,
|
352
357
|
slice_pushdown,
|
353
358
|
common_subplan_elimination,
|
359
|
+
comm_subexpr_elim,
|
354
360
|
allow_streaming,
|
355
361
|
false
|
356
362
|
)
|
@@ -398,16 +404,16 @@ module Polars
|
|
398
404
|
# # │ 2 ┆ 7.0 ┆ b │
|
399
405
|
# # │ 1 ┆ 6.0 ┆ a │
|
400
406
|
# # └─────┴─────┴─────┘
|
401
|
-
def sort(by, reverse: false, nulls_last: false, maintain_order: false)
|
407
|
+
def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
|
402
408
|
if by.is_a?(::String)
|
403
|
-
return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
|
409
|
+
return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
|
404
410
|
end
|
405
411
|
if Utils.bool?(reverse)
|
406
412
|
reverse = [reverse]
|
407
413
|
end
|
408
414
|
|
409
415
|
by = Utils.selection_to_rbexpr_list(by)
|
410
|
-
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
|
416
|
+
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
|
411
417
|
end
|
412
418
|
|
413
419
|
# def profile
|
@@ -469,6 +475,7 @@ module Polars
|
|
469
475
|
no_optimization: false,
|
470
476
|
slice_pushdown: true,
|
471
477
|
common_subplan_elimination: true,
|
478
|
+
comm_subexpr_elim: true,
|
472
479
|
allow_streaming: false,
|
473
480
|
_eager: false
|
474
481
|
)
|
@@ -477,6 +484,7 @@ module Polars
|
|
477
484
|
projection_pushdown = false
|
478
485
|
slice_pushdown = false
|
479
486
|
common_subplan_elimination = false
|
487
|
+
comm_subexpr_elim = false
|
480
488
|
end
|
481
489
|
|
482
490
|
if allow_streaming
|
@@ -490,6 +498,7 @@ module Polars
|
|
490
498
|
simplify_expression,
|
491
499
|
slice_pushdown,
|
492
500
|
common_subplan_elimination,
|
501
|
+
comm_subexpr_elim,
|
493
502
|
allow_streaming,
|
494
503
|
_eager
|
495
504
|
)
|
@@ -559,6 +568,268 @@ module Polars
|
|
559
568
|
simplify_expression: true,
|
560
569
|
no_optimization: false,
|
561
570
|
slice_pushdown: true
|
571
|
+
)
|
572
|
+
lf = _set_sink_optimizations(
|
573
|
+
type_coercion: type_coercion,
|
574
|
+
predicate_pushdown: predicate_pushdown,
|
575
|
+
projection_pushdown: projection_pushdown,
|
576
|
+
simplify_expression: simplify_expression,
|
577
|
+
slice_pushdown: slice_pushdown,
|
578
|
+
no_optimization: no_optimization
|
579
|
+
)
|
580
|
+
|
581
|
+
lf.sink_parquet(
|
582
|
+
path,
|
583
|
+
compression,
|
584
|
+
compression_level,
|
585
|
+
statistics,
|
586
|
+
row_group_size,
|
587
|
+
data_pagesize_limit,
|
588
|
+
maintain_order
|
589
|
+
)
|
590
|
+
end
|
591
|
+
|
592
|
+
# Evaluate the query in streaming mode and write to an IPC file.
|
593
|
+
#
|
594
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
595
|
+
#
|
596
|
+
# @param path [String]
|
597
|
+
# File path to which the file should be written.
|
598
|
+
# @param compression ["lz4", "zstd"]
|
599
|
+
# Choose "zstd" for good compression performance.
|
600
|
+
# Choose "lz4" for fast compression/decompression.
|
601
|
+
# @param maintain_order [Boolean]
|
602
|
+
# Maintain the order in which data is processed.
|
603
|
+
# Setting this to `false` will be slightly faster.
|
604
|
+
# @param type_coercion [Boolean]
|
605
|
+
# Do type coercion optimization.
|
606
|
+
# @param predicate_pushdown [Boolean]
|
607
|
+
# Do predicate pushdown optimization.
|
608
|
+
# @param projection_pushdown [Boolean]
|
609
|
+
# Do projection pushdown optimization.
|
610
|
+
# @param simplify_expression [Boolean]
|
611
|
+
# Run simplify expressions optimization.
|
612
|
+
# @param slice_pushdown [Boolean]
|
613
|
+
# Slice pushdown optimization.
|
614
|
+
# @param no_optimization [Boolean]
|
615
|
+
# Turn off (certain) optimizations.
|
616
|
+
#
|
617
|
+
# @return [DataFrame]
|
618
|
+
#
|
619
|
+
# @example
|
620
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
621
|
+
# lf.sink_ipc("out.arrow")
|
622
|
+
def sink_ipc(
|
623
|
+
path,
|
624
|
+
compression: "zstd",
|
625
|
+
maintain_order: true,
|
626
|
+
type_coercion: true,
|
627
|
+
predicate_pushdown: true,
|
628
|
+
projection_pushdown: true,
|
629
|
+
simplify_expression: true,
|
630
|
+
slice_pushdown: true,
|
631
|
+
no_optimization: false
|
632
|
+
)
|
633
|
+
lf = _set_sink_optimizations(
|
634
|
+
type_coercion: type_coercion,
|
635
|
+
predicate_pushdown: predicate_pushdown,
|
636
|
+
projection_pushdown: projection_pushdown,
|
637
|
+
simplify_expression: simplify_expression,
|
638
|
+
slice_pushdown: slice_pushdown,
|
639
|
+
no_optimization: no_optimization
|
640
|
+
)
|
641
|
+
|
642
|
+
lf.sink_ipc(
|
643
|
+
path,
|
644
|
+
compression,
|
645
|
+
maintain_order
|
646
|
+
)
|
647
|
+
end
|
648
|
+
|
649
|
+
# Evaluate the query in streaming mode and write to a CSV file.
|
650
|
+
#
|
651
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
652
|
+
#
|
653
|
+
# @param path [String]
|
654
|
+
# File path to which the file should be written.
|
655
|
+
# @param include_bom [Boolean]
|
656
|
+
# Whether to include UTF-8 BOM in the CSV output.
|
657
|
+
# @param include_header [Boolean]
|
658
|
+
# Whether to include header in the CSV output.
|
659
|
+
# @param separator [String]
|
660
|
+
# Separate CSV fields with this symbol.
|
661
|
+
# @param line_terminator [String]
|
662
|
+
# String used to end each row.
|
663
|
+
# @param quote_char [String]
|
664
|
+
# Byte to use as quoting character.
|
665
|
+
# @param batch_size [Integer]
|
666
|
+
# Number of rows that will be processed per thread.
|
667
|
+
# @param datetime_format [String]
|
668
|
+
# A format string, with the specifiers defined by the
|
669
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
670
|
+
# Rust crate. If no format specified, the default fractional-second
|
671
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
672
|
+
# Datetime cols (if any).
|
673
|
+
# @param date_format [String]
|
674
|
+
# A format string, with the specifiers defined by the
|
675
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
676
|
+
# Rust crate.
|
677
|
+
# @param time_format [String]
|
678
|
+
# A format string, with the specifiers defined by the
|
679
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
680
|
+
# Rust crate.
|
681
|
+
# @param float_precision [Integer]
|
682
|
+
# Number of decimal places to write, applied to both `Float32` and
|
683
|
+
# `Float64` datatypes.
|
684
|
+
# @param null_value [String]
|
685
|
+
# A string representing null values (defaulting to the empty string).
|
686
|
+
# @param quote_style ["necessary", "always", "non_numeric", "never"]
|
687
|
+
# Determines the quoting strategy used.
|
688
|
+
#
|
689
|
+
# - necessary (default): This puts quotes around fields only when necessary.
|
690
|
+
# They are necessary when fields contain a quote,
|
691
|
+
# delimiter or record terminator.
|
692
|
+
# Quotes are also necessary when writing an empty record
|
693
|
+
# (which is indistinguishable from a record with one empty field).
|
694
|
+
# This is the default.
|
695
|
+
# - always: This puts quotes around every field. Always.
|
696
|
+
# - never: This never puts quotes around fields, even if that results in
|
697
|
+
# invalid CSV data (e.g.: by not quoting strings containing the
|
698
|
+
# separator).
|
699
|
+
# - non_numeric: This puts quotes around all fields that are non-numeric.
|
700
|
+
# Namely, when writing a field that does not parse as a valid float
|
701
|
+
# or integer, then quotes will be used even if they aren't strictly
|
702
|
+
# necessary.
|
703
|
+
# @param maintain_order [Boolean]
|
704
|
+
# Maintain the order in which data is processed.
|
705
|
+
# Setting this to `false` will be slightly faster.
|
706
|
+
# @param type_coercion [Boolean]
|
707
|
+
# Do type coercion optimization.
|
708
|
+
# @param predicate_pushdown [Boolean]
|
709
|
+
# Do predicate pushdown optimization.
|
710
|
+
# @param projection_pushdown [Boolean]
|
711
|
+
# Do projection pushdown optimization.
|
712
|
+
# @param simplify_expression [Boolean]
|
713
|
+
# Run simplify expressions optimization.
|
714
|
+
# @param slice_pushdown [Boolean]
|
715
|
+
# Slice pushdown optimization.
|
716
|
+
# @param no_optimization [Boolean]
|
717
|
+
# Turn off (certain) optimizations.
|
718
|
+
#
|
719
|
+
# @return [DataFrame]
|
720
|
+
#
|
721
|
+
# @example
|
722
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
723
|
+
# lf.sink_csv("out.csv")
|
724
|
+
def sink_csv(
|
725
|
+
path,
|
726
|
+
include_bom: false,
|
727
|
+
include_header: true,
|
728
|
+
separator: ",",
|
729
|
+
line_terminator: "\n",
|
730
|
+
quote_char: '"',
|
731
|
+
batch_size: 1024,
|
732
|
+
datetime_format: nil,
|
733
|
+
date_format: nil,
|
734
|
+
time_format: nil,
|
735
|
+
float_precision: nil,
|
736
|
+
null_value: nil,
|
737
|
+
quote_style: nil,
|
738
|
+
maintain_order: true,
|
739
|
+
type_coercion: true,
|
740
|
+
predicate_pushdown: true,
|
741
|
+
projection_pushdown: true,
|
742
|
+
simplify_expression: true,
|
743
|
+
slice_pushdown: true,
|
744
|
+
no_optimization: false
|
745
|
+
)
|
746
|
+
Utils._check_arg_is_1byte("separator", separator, false)
|
747
|
+
Utils._check_arg_is_1byte("quote_char", quote_char, false)
|
748
|
+
|
749
|
+
lf = _set_sink_optimizations(
|
750
|
+
type_coercion: type_coercion,
|
751
|
+
predicate_pushdown: predicate_pushdown,
|
752
|
+
projection_pushdown: projection_pushdown,
|
753
|
+
simplify_expression: simplify_expression,
|
754
|
+
slice_pushdown: slice_pushdown,
|
755
|
+
no_optimization: no_optimization
|
756
|
+
)
|
757
|
+
|
758
|
+
lf.sink_csv(
|
759
|
+
path,
|
760
|
+
include_bom,
|
761
|
+
include_header,
|
762
|
+
separator.ord,
|
763
|
+
line_terminator,
|
764
|
+
quote_char.ord,
|
765
|
+
batch_size,
|
766
|
+
datetime_format,
|
767
|
+
date_format,
|
768
|
+
time_format,
|
769
|
+
float_precision,
|
770
|
+
null_value,
|
771
|
+
quote_style,
|
772
|
+
maintain_order
|
773
|
+
)
|
774
|
+
end
|
775
|
+
|
776
|
+
# Evaluate the query in streaming mode and write to an NDJSON file.
|
777
|
+
#
|
778
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
779
|
+
#
|
780
|
+
# @param path [String]
|
781
|
+
# File path to which the file should be written.
|
782
|
+
# @param maintain_order [Boolean]
|
783
|
+
# Maintain the order in which data is processed.
|
784
|
+
# Setting this to `false` will be slightly faster.
|
785
|
+
# @param type_coercion [Boolean]
|
786
|
+
# Do type coercion optimization.
|
787
|
+
# @param predicate_pushdown [Boolean]
|
788
|
+
# Do predicate pushdown optimization.
|
789
|
+
# @param projection_pushdown [Boolean]
|
790
|
+
# Do projection pushdown optimization.
|
791
|
+
# @param simplify_expression [Boolean]
|
792
|
+
# Run simplify expressions optimization.
|
793
|
+
# @param slice_pushdown [Boolean]
|
794
|
+
# Slice pushdown optimization.
|
795
|
+
# @param no_optimization [Boolean]
|
796
|
+
# Turn off (certain) optimizations.
|
797
|
+
#
|
798
|
+
# @return [DataFrame]
|
799
|
+
#
|
800
|
+
# @example
|
801
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
802
|
+
# lf.sink_ndjson("out.ndjson")
|
803
|
+
def sink_ndjson(
|
804
|
+
path,
|
805
|
+
maintain_order: true,
|
806
|
+
type_coercion: true,
|
807
|
+
predicate_pushdown: true,
|
808
|
+
projection_pushdown: true,
|
809
|
+
simplify_expression: true,
|
810
|
+
slice_pushdown: true,
|
811
|
+
no_optimization: false
|
812
|
+
)
|
813
|
+
lf = _set_sink_optimizations(
|
814
|
+
type_coercion: type_coercion,
|
815
|
+
predicate_pushdown: predicate_pushdown,
|
816
|
+
projection_pushdown: projection_pushdown,
|
817
|
+
simplify_expression: simplify_expression,
|
818
|
+
slice_pushdown: slice_pushdown,
|
819
|
+
no_optimization: no_optimization
|
820
|
+
)
|
821
|
+
|
822
|
+
lf.sink_json(path, maintain_order)
|
823
|
+
end
|
824
|
+
|
825
|
+
# @private
|
826
|
+
def _set_sink_optimizations(
|
827
|
+
type_coercion: true,
|
828
|
+
predicate_pushdown: true,
|
829
|
+
projection_pushdown: true,
|
830
|
+
simplify_expression: true,
|
831
|
+
slice_pushdown: true,
|
832
|
+
no_optimization: false
|
562
833
|
)
|
563
834
|
if no_optimization
|
564
835
|
predicate_pushdown = false
|
@@ -566,25 +837,17 @@ module Polars
|
|
566
837
|
slice_pushdown = false
|
567
838
|
end
|
568
839
|
|
569
|
-
|
840
|
+
_ldf.optimization_toggle(
|
570
841
|
type_coercion,
|
571
842
|
predicate_pushdown,
|
572
843
|
projection_pushdown,
|
573
844
|
simplify_expression,
|
574
845
|
slice_pushdown,
|
575
846
|
false,
|
847
|
+
false,
|
576
848
|
true,
|
577
849
|
false
|
578
850
|
)
|
579
|
-
lf.sink_parquet(
|
580
|
-
path,
|
581
|
-
compression,
|
582
|
-
compression_level,
|
583
|
-
statistics,
|
584
|
-
row_group_size,
|
585
|
-
data_pagesize_limit,
|
586
|
-
maintain_order
|
587
|
-
)
|
588
851
|
end
|
589
852
|
|
590
853
|
# Collect a small number of rows for debugging purposes.
|
@@ -650,6 +913,7 @@ module Polars
|
|
650
913
|
no_optimization: false,
|
651
914
|
slice_pushdown: true,
|
652
915
|
common_subplan_elimination: true,
|
916
|
+
comm_subexpr_elim: true,
|
653
917
|
allow_streaming: false
|
654
918
|
)
|
655
919
|
if no_optimization
|
@@ -666,6 +930,7 @@ module Polars
|
|
666
930
|
simplify_expression,
|
667
931
|
slice_pushdown,
|
668
932
|
common_subplan_elimination,
|
933
|
+
comm_subexpr_elim,
|
669
934
|
allow_streaming,
|
670
935
|
false
|
671
936
|
)
|
@@ -699,6 +964,10 @@ module Polars
|
|
699
964
|
_from_rbldf(_ldf.cache)
|
700
965
|
end
|
701
966
|
|
967
|
+
# TODO
|
968
|
+
# def cast
|
969
|
+
# end
|
970
|
+
|
702
971
|
# Create an empty copy of the current LazyFrame.
|
703
972
|
#
|
704
973
|
# The copy has an identical schema but no data.
|
@@ -706,14 +975,14 @@ module Polars
|
|
706
975
|
# @return [LazyFrame]
|
707
976
|
#
|
708
977
|
# @example
|
709
|
-
#
|
978
|
+
# lf = Polars::LazyFrame.new(
|
710
979
|
# {
|
711
980
|
# "a" => [nil, 2, 3, 4],
|
712
981
|
# "b" => [0.5, nil, 2.5, 13],
|
713
982
|
# "c" => [true, true, false, nil],
|
714
983
|
# }
|
715
984
|
# ).lazy
|
716
|
-
#
|
985
|
+
# lf.clear.fetch
|
717
986
|
# # =>
|
718
987
|
# # shape: (0, 3)
|
719
988
|
# # ┌─────┬─────┬──────┐
|
@@ -722,9 +991,23 @@ module Polars
|
|
722
991
|
# # │ i64 ┆ f64 ┆ bool │
|
723
992
|
# # ╞═════╪═════╪══════╡
|
724
993
|
# # └─────┴─────┴──────┘
|
725
|
-
|
726
|
-
|
727
|
-
|
994
|
+
#
|
995
|
+
# @example
|
996
|
+
# lf.clear(2).fetch
|
997
|
+
# # =>
|
998
|
+
# # shape: (2, 3)
|
999
|
+
# # ┌──────┬──────┬──────┐
|
1000
|
+
# # │ a ┆ b ┆ c │
|
1001
|
+
# # │ --- ┆ --- ┆ --- │
|
1002
|
+
# # │ i64 ┆ f64 ┆ bool │
|
1003
|
+
# # ╞══════╪══════╪══════╡
|
1004
|
+
# # │ null ┆ null ┆ null │
|
1005
|
+
# # │ null ┆ null ┆ null │
|
1006
|
+
# # └──────┴──────┴──────┘
|
1007
|
+
def clear(n = 0)
|
1008
|
+
DataFrame.new(columns: schema).clear(n).lazy
|
1009
|
+
end
|
1010
|
+
alias_method :cleared, :clear
|
728
1011
|
|
729
1012
|
# Filter the rows in the DataFrame based on a predicate expression.
|
730
1013
|
#
|
@@ -774,8 +1057,13 @@ module Polars
|
|
774
1057
|
|
775
1058
|
# Select columns from this DataFrame.
|
776
1059
|
#
|
777
|
-
# @param exprs [
|
778
|
-
# Column
|
1060
|
+
# @param exprs [Array]
|
1061
|
+
# Column(s) to select, specified as positional arguments.
|
1062
|
+
# Accepts expression input. Strings are parsed as column names,
|
1063
|
+
# other non-expression inputs are parsed as literals.
|
1064
|
+
# @param named_exprs [Hash]
|
1065
|
+
# Additional columns to select, specified as keyword arguments.
|
1066
|
+
# The columns will be renamed to the keyword used.
|
779
1067
|
#
|
780
1068
|
# @return [LazyFrame]
|
781
1069
|
#
|
@@ -855,9 +1143,13 @@ module Polars
|
|
855
1143
|
# # │ 0 │
|
856
1144
|
# # │ 10 │
|
857
1145
|
# # └─────────┘
|
858
|
-
def select(exprs)
|
859
|
-
|
860
|
-
|
1146
|
+
def select(*exprs, **named_exprs)
|
1147
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1148
|
+
|
1149
|
+
rbexprs = Utils.parse_as_list_of_expressions(
|
1150
|
+
*exprs, **named_exprs, __structify: structify
|
1151
|
+
)
|
1152
|
+
_from_rbldf(_ldf.select(rbexprs))
|
861
1153
|
end
|
862
1154
|
|
863
1155
|
# Start a group by operation.
|
@@ -967,7 +1259,7 @@ module Polars
|
|
967
1259
|
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
968
1260
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
969
1261
|
# )
|
970
|
-
# df.
|
1262
|
+
# df.rolling(index_column: "dt", period: "2d").agg(
|
971
1263
|
# [
|
972
1264
|
# Polars.sum("a").alias("sum_a"),
|
973
1265
|
# Polars.min("a").alias("min_a"),
|
@@ -988,7 +1280,7 @@ module Polars
|
|
988
1280
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
989
1281
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
990
1282
|
# # └─────────────────────┴───────┴───────┴───────┘
|
991
|
-
def
|
1283
|
+
def rolling(
|
992
1284
|
index_column:,
|
993
1285
|
period:,
|
994
1286
|
offset: nil,
|
@@ -1005,12 +1297,13 @@ module Polars
|
|
1005
1297
|
period = Utils._timedelta_to_pl_duration(period)
|
1006
1298
|
offset = Utils._timedelta_to_pl_duration(offset)
|
1007
1299
|
|
1008
|
-
lgb = _ldf.
|
1300
|
+
lgb = _ldf.rolling(
|
1009
1301
|
index_column, period, offset, closed, rbexprs_by, check_sorted
|
1010
1302
|
)
|
1011
1303
|
LazyGroupBy.new(lgb)
|
1012
1304
|
end
|
1013
|
-
alias_method :
|
1305
|
+
alias_method :group_by_rolling, :rolling
|
1306
|
+
alias_method :groupby_rolling, :rolling
|
1014
1307
|
|
1015
1308
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1016
1309
|
#
|
@@ -1234,12 +1527,13 @@ module Polars
|
|
1234
1527
|
# closed: "right"
|
1235
1528
|
# ).agg(Polars.col("A").alias("A_agg_list"))
|
1236
1529
|
# # =>
|
1237
|
-
# # shape: (
|
1530
|
+
# # shape: (4, 4)
|
1238
1531
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
1239
1532
|
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
1240
1533
|
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1241
1534
|
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
1242
1535
|
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
1536
|
+
# # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
|
1243
1537
|
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
1244
1538
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1245
1539
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
@@ -1440,6 +1734,8 @@ module Polars
|
|
1440
1734
|
# Join strategy.
|
1441
1735
|
# @param suffix [String]
|
1442
1736
|
# Suffix to append to columns with a duplicate name.
|
1737
|
+
# @param join_nulls [Boolean]
|
1738
|
+
# Join on null values. By default null values will never produce matches.
|
1443
1739
|
# @param allow_parallel [Boolean]
|
1444
1740
|
# Allow the physical plan to optionally evaluate the computation of both
|
1445
1741
|
# DataFrames up to the join in parallel.
|
@@ -1535,6 +1831,7 @@ module Polars
|
|
1535
1831
|
on: nil,
|
1536
1832
|
how: "inner",
|
1537
1833
|
suffix: "_right",
|
1834
|
+
join_nulls: false,
|
1538
1835
|
allow_parallel: true,
|
1539
1836
|
force_parallel: false
|
1540
1837
|
)
|
@@ -1545,7 +1842,7 @@ module Polars
|
|
1545
1842
|
if how == "cross"
|
1546
1843
|
return _from_rbldf(
|
1547
1844
|
_ldf.join(
|
1548
|
-
other._ldf, [], [], allow_parallel, force_parallel, how, suffix
|
1845
|
+
other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
|
1549
1846
|
)
|
1550
1847
|
)
|
1551
1848
|
end
|
@@ -1568,6 +1865,7 @@ module Polars
|
|
1568
1865
|
rbexprs_right,
|
1569
1866
|
allow_parallel,
|
1570
1867
|
force_parallel,
|
1868
|
+
join_nulls,
|
1571
1869
|
how,
|
1572
1870
|
suffix,
|
1573
1871
|
)
|
@@ -1598,37 +1896,19 @@ module Polars
|
|
1598
1896
|
# ).collect
|
1599
1897
|
# # =>
|
1600
1898
|
# # shape: (4, 6)
|
1601
|
-
# #
|
1602
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
1603
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
1604
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
1605
|
-
# #
|
1606
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
1607
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
1608
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
1609
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
1610
|
-
# #
|
1611
|
-
def with_columns(exprs)
|
1612
|
-
|
1613
|
-
|
1614
|
-
[]
|
1615
|
-
elsif exprs.is_a?(Expr)
|
1616
|
-
[exprs]
|
1617
|
-
else
|
1618
|
-
exprs.to_a
|
1619
|
-
end
|
1620
|
-
|
1621
|
-
rbexprs = []
|
1622
|
-
exprs.each do |e|
|
1623
|
-
case e
|
1624
|
-
when Expr
|
1625
|
-
rbexprs << e._rbexpr
|
1626
|
-
when Series
|
1627
|
-
rbexprs << Utils.lit(e)._rbexpr
|
1628
|
-
else
|
1629
|
-
raise ArgumentError, "Expected an expression, got #{e}"
|
1630
|
-
end
|
1631
|
-
end
|
1899
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
1900
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
1901
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1902
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
1903
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
1904
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
1905
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
1906
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
1907
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
1908
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
1909
|
+
def with_columns(*exprs, **named_exprs)
|
1910
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1911
|
+
rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
|
1632
1912
|
|
1633
1913
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
1634
1914
|
end
|
@@ -1690,26 +1970,26 @@ module Polars
|
|
1690
1970
|
# # ┌─────┬─────┬───────────┐
|
1691
1971
|
# # │ a ┆ b ┆ b_squared │
|
1692
1972
|
# # │ --- ┆ --- ┆ --- │
|
1693
|
-
# # │ i64 ┆ i64 ┆
|
1973
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
1694
1974
|
# # ╞═════╪═════╪═══════════╡
|
1695
|
-
# # │ 1 ┆ 2 ┆ 4
|
1696
|
-
# # │ 3 ┆ 4 ┆ 16
|
1697
|
-
# # │ 5 ┆ 6 ┆ 36
|
1975
|
+
# # │ 1 ┆ 2 ┆ 4 │
|
1976
|
+
# # │ 3 ┆ 4 ┆ 16 │
|
1977
|
+
# # │ 5 ┆ 6 ┆ 36 │
|
1698
1978
|
# # └─────┴─────┴───────────┘
|
1699
1979
|
#
|
1700
1980
|
# @example
|
1701
1981
|
# df.with_column(Polars.col("a") ** 2).collect
|
1702
1982
|
# # =>
|
1703
1983
|
# # shape: (3, 2)
|
1704
|
-
# #
|
1705
|
-
# # │ a
|
1706
|
-
# # │ ---
|
1707
|
-
# # │
|
1708
|
-
# #
|
1709
|
-
# # │ 1
|
1710
|
-
# # │ 9
|
1711
|
-
# # │ 25
|
1712
|
-
# #
|
1984
|
+
# # ┌─────┬─────┐
|
1985
|
+
# # │ a ┆ b │
|
1986
|
+
# # │ --- ┆ --- │
|
1987
|
+
# # │ i64 ┆ i64 │
|
1988
|
+
# # ╞═════╪═════╡
|
1989
|
+
# # │ 1 ┆ 2 │
|
1990
|
+
# # │ 9 ┆ 4 │
|
1991
|
+
# # │ 25 ┆ 6 │
|
1992
|
+
# # └─────┴─────┘
|
1713
1993
|
def with_column(column)
|
1714
1994
|
with_columns([column])
|
1715
1995
|
end
|
@@ -1721,11 +2001,9 @@ module Polars
|
|
1721
2001
|
# - List of column names.
|
1722
2002
|
#
|
1723
2003
|
# @return [LazyFrame]
|
1724
|
-
def drop(columns)
|
1725
|
-
|
1726
|
-
|
1727
|
-
end
|
1728
|
-
_from_rbldf(_ldf.drop_columns(columns))
|
2004
|
+
def drop(*columns)
|
2005
|
+
drop_cols = Utils._expand_selectors(self, *columns)
|
2006
|
+
_from_rbldf(_ldf.drop(drop_cols))
|
1729
2007
|
end
|
1730
2008
|
|
1731
2009
|
# Rename column names.
|
@@ -1955,7 +2233,7 @@ module Polars
|
|
1955
2233
|
# "b" => [2, 4, 6]
|
1956
2234
|
# }
|
1957
2235
|
# ).lazy
|
1958
|
-
# df.
|
2236
|
+
# df.with_row_index.collect
|
1959
2237
|
# # =>
|
1960
2238
|
# # shape: (3, 3)
|
1961
2239
|
# # ┌────────┬─────┬─────┐
|
@@ -1967,9 +2245,10 @@ module Polars
|
|
1967
2245
|
# # │ 1 ┆ 3 ┆ 4 │
|
1968
2246
|
# # │ 2 ┆ 5 ┆ 6 │
|
1969
2247
|
# # └────────┴─────┴─────┘
|
1970
|
-
def
|
1971
|
-
_from_rbldf(_ldf.
|
2248
|
+
def with_row_index(name: "row_nr", offset: 0)
|
2249
|
+
_from_rbldf(_ldf.with_row_index(name, offset))
|
1972
2250
|
end
|
2251
|
+
alias_method :with_row_count, :with_row_index
|
1973
2252
|
|
1974
2253
|
# Take every nth row in the LazyFrame and return as a new LazyFrame.
|
1975
2254
|
#
|
@@ -2470,9 +2749,47 @@ module Polars
|
|
2470
2749
|
_from_rbldf(_ldf.unnest(names))
|
2471
2750
|
end
|
2472
2751
|
|
2473
|
-
#
|
2474
|
-
#
|
2475
|
-
#
|
2752
|
+
# Take two sorted LazyFrames and merge them by the sorted key.
|
2753
|
+
#
|
2754
|
+
# The output of this operation will also be sorted.
|
2755
|
+
# It is the caller's responsibility that the frames are sorted
|
2756
|
+
# by that key; otherwise the output will not make sense.
|
2757
|
+
#
|
2758
|
+
# The schemas of both LazyFrames must be equal.
|
2759
|
+
#
|
2760
|
+
# @param other [LazyFrame]
|
2761
|
+
# Other LazyFrame that must be merged.
|
2762
|
+
# @param key [String]
|
2763
|
+
# Key that is sorted.
|
2764
|
+
#
|
2765
|
+
# @return [LazyFrame]
|
2766
|
+
#
|
2767
|
+
# @example
|
2768
|
+
# df0 = Polars::LazyFrame.new(
|
2769
|
+
# {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
|
2770
|
+
# ).sort("age")
|
2771
|
+
# df1 = Polars::LazyFrame.new(
|
2772
|
+
# {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
|
2773
|
+
# ).sort("age")
|
2774
|
+
# df0.merge_sorted(df1, "age").collect
|
2775
|
+
# # =>
|
2776
|
+
# # shape: (7, 2)
|
2777
|
+
# # ┌────────┬─────┐
|
2778
|
+
# # │ name ┆ age │
|
2779
|
+
# # │ --- ┆ --- │
|
2780
|
+
# # │ str ┆ i64 │
|
2781
|
+
# # ╞════════╪═════╡
|
2782
|
+
# # │ bob ┆ 18 │
|
2783
|
+
# # │ thomas ┆ 20 │
|
2784
|
+
# # │ anna ┆ 21 │
|
2785
|
+
# # │ megan ┆ 33 │
|
2786
|
+
# # │ steve ┆ 42 │
|
2787
|
+
# # │ steve ┆ 42 │
|
2788
|
+
# # │ elise ┆ 44 │
|
2789
|
+
# # └────────┴─────┘
|
2790
|
+
def merge_sorted(other, key)
|
2791
|
+
_from_rbldf(_ldf.merge_sorted(other._ldf, key))
|
2792
|
+
end
|
2476
2793
|
|
2477
2794
|
# Indicate that one or multiple columns are sorted.
|
2478
2795
|
#
|