polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -1
- data/Cargo.lock +159 -66
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +3112 -1613
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +453 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/batched_csv_reader.rb +4 -2
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +306 -96
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +41 -18
- data/lib/polars/date_time_name_space.rb +9 -3
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +898 -215
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +36 -31
- data/lib/polars/lazy_frame.rb +405 -88
- data/lib/polars/list_expr.rb +158 -8
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +282 -41
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +413 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +106 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +16 -4
- metadata +34 -6
- data/lib/polars/lazy_functions.rb +0 -1181
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/lazy_frame.rb
CHANGED
```diff
@@ -49,7 +49,8 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       parse_dates: false,
-      eol_char: "\n"
+      eol_char: "\n",
+      truncate_ragged_lines: true
     )
       dtype_list = nil
       if !dtypes.nil?
@@ -81,7 +82,8 @@ module Polars
           encoding,
           Utils._prepare_row_count_args(row_count_name, row_count_offset),
           parse_dates,
-          eol_char
+          eol_char,
+          truncate_ragged_lines
         )
       )
     end
@@ -103,6 +105,7 @@ module Polars
       _from_rbldf(
         RbLazyFrame.new_from_parquet(
           file,
+          [],
           n_rows,
           cache,
           parallel,
@@ -110,7 +113,8 @@ module Polars
           Utils._prepare_row_count_args(row_count_name, row_count_offset),
           low_memory,
           use_statistics,
-          hive_partitioning
+          hive_partitioning,
+          nil
         )
       )
     end
```
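The new `truncate_ragged_lines` flag is threaded from the CSV scan entry point down to the native reader. A minimal sketch of how it might be used, assuming the keyword is also surfaced through `Polars.scan_csv` (io.rb changed in the same release); the file name is hypothetical:

```ruby
require "polars-df"

# Rows with more fields than the header declares are truncated instead of
# raising a parse error.
lf = Polars.scan_csv("ragged.csv", truncate_ragged_lines: true)
lf.collect
```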
```diff
@@ -308,7 +312,7 @@ module Polars
     # end
     #
     # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
-    # df.pipe(cast_str_to_int, col_name: "b").collect
+    # df.pipe(cast_str_to_int, col_name: "b").collect
     # # =>
     # # shape: (4, 2)
     # # ┌─────┬─────┐
@@ -342,6 +346,7 @@ module Polars
       simplify_expression: true,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       ldf = _ldf.optimization_toggle(
@@ -351,6 +356,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
```
```diff
@@ -398,16 +404,16 @@ module Polars
     # # │ 2 ┆ 7.0 ┆ b │
     # # │ 1 ┆ 6.0 ┆ a │
     # # └─────┴─────┴─────┘
-    def sort(by, reverse: false, nulls_last: false, maintain_order: false)
+    def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
       if by.is_a?(::String)
-        return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
+        return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
       end
       if Utils.bool?(reverse)
         reverse = [reverse]
       end

       by = Utils.selection_to_rbexpr_list(by)
-      _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
+      _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
     end

     # def profile
```
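`sort` now forwards a `multithreaded` flag to the engine, so a query can opt into a single-threaded sort when determinism or low CPU contention matters more than speed. A sketch using the new keyword (sample data is made up):

```ruby
lf = Polars::LazyFrame.new({"a" => [3, 1, 2], "b" => ["x", "y", "z"]})

# Same result as the default, but the sort runs on a single thread.
lf.sort("a", reverse: true, nulls_last: true, multithreaded: false).collect
```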
```diff
@@ -469,6 +475,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false,
       _eager: false
     )
@@ -477,6 +484,7 @@ module Polars
         projection_pushdown = false
         slice_pushdown = false
         common_subplan_elimination = false
+        comm_subexpr_elim = false
       end

       if allow_streaming
@@ -490,6 +498,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         _eager
       )
```
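Common subexpression elimination (`comm_subexpr_elim`) joins the existing optimization toggles here and on the plan/fetch variants in the surrounding hunks. A sketch of disabling it for a single query:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2, 3]})

# All other optimizations keep their defaults; only the new toggle is turned off.
lf.select((Polars.col("a") * 2).alias("double")).collect(comm_subexpr_elim: false)
```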
```diff
@@ -559,6 +568,268 @@ module Polars
       simplify_expression: true,
       no_optimization: false,
       slice_pushdown: true
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an IPC file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ipc("out.arrow")
+    def sink_ipc(
+      path,
+      compression: "zstd",
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_ipc(
+        path,
+        compression,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to a CSV file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param include_bom [Boolean]
+    #   Whether to include UTF-8 BOM in the CSV output.
+    # @param include_header [Boolean]
+    #   Whether to include header in the CSV output.
+    # @param separator [String]
+    #   Separate CSV fields with this symbol.
+    # @param line_terminator [String]
+    #   String used to end each row.
+    # @param quote_char [String]
+    #   Byte to use as quoting character.
+    # @param batch_size [Integer]
+    #   Number of rows that will be processed per thread.
+    # @param datetime_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate. If no format specified, the default fractional-second
+    #   precision is inferred from the maximum timeunit found in the frame's
+    #   Datetime cols (if any).
+    # @param date_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param time_format [String]
+    #   A format string, with the specifiers defined by the
+    #   `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+    #   Rust crate.
+    # @param float_precision [Integer]
+    #   Number of decimal places to write, applied to both `Float32` and
+    #   `Float64` datatypes.
+    # @param null_value [String]
+    #   A string representing null values (defaulting to the empty string).
+    # @param quote_style ["necessary", "always", "non_numeric", "never"]
+    #   Determines the quoting strategy used.
+    #
+    #   - necessary (default): This puts quotes around fields only when necessary.
+    #     They are necessary when fields contain a quote,
+    #     delimiter or record terminator.
+    #     Quotes are also necessary when writing an empty record
+    #     (which is indistinguishable from a record with one empty field).
+    #     This is the default.
+    #   - always: This puts quotes around every field. Always.
+    #   - never: This never puts quotes around fields, even if that results in
+    #     invalid CSV data (e.g.: by not quoting strings containing the
+    #     separator).
+    #   - non_numeric: This puts quotes around all fields that are non-numeric.
+    #     Namely, when writing a field that does not parse as a valid float
+    #     or integer, then quotes will be used even if they aren't strictly
+    #     necessary.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_csv("out.csv")
+    def sink_csv(
+      path,
+      include_bom: false,
+      include_header: true,
+      separator: ",",
+      line_terminator: "\n",
+      quote_char: '"',
+      batch_size: 1024,
+      datetime_format: nil,
+      date_format: nil,
+      time_format: nil,
+      float_precision: nil,
+      null_value: nil,
+      quote_style: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      Utils._check_arg_is_1byte("separator", separator, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, false)
+
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_csv(
+        path,
+        include_bom,
+        include_header,
+        separator.ord,
+        line_terminator,
+        quote_char.ord,
+        batch_size,
+        datetime_format,
+        date_format,
+        time_format,
+        float_precision,
+        null_value,
+        quote_style,
+        maintain_order
+      )
+    end
+
+    # Evaluate the query in streaming mode and write to an NDJSON file.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_ndjson("out.ndjson")
+    def sink_ndjson(
+      path,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
+    )
+      lf = _set_sink_optimizations(
+        type_coercion: type_coercion,
+        predicate_pushdown: predicate_pushdown,
+        projection_pushdown: projection_pushdown,
+        simplify_expression: simplify_expression,
+        slice_pushdown: slice_pushdown,
+        no_optimization: no_optimization
+      )
+
+      lf.sink_json(path, maintain_order)
+    end
+
+    # @private
+    def _set_sink_optimizations(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      no_optimization: false
     )
       if no_optimization
         predicate_pushdown = false
```
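The new streaming sinks (`sink_ipc`, `sink_csv`, `sink_ndjson`) mirror `sink_parquet` and share the `_set_sink_optimizations` helper; note that `sink_ndjson` currently delegates to the native `sink_json` binding. A hedged sketch with hypothetical paths, using only keywords from the signatures above:

```ruby
lf = Polars.scan_csv("events.csv")  # hypothetical larger-than-RAM input

# Stream results to disk instead of collecting them into memory.
lf.sink_csv("events_out.csv", separator: ";", maintain_order: false)
lf.sink_ipc("events_out.arrow", compression: "lz4")
lf.sink_ndjson("events_out.ndjson")
```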
```diff
@@ -566,25 +837,17 @@ module Polars
         slice_pushdown = false
       end

-      lf = _ldf.optimization_toggle(
+      _ldf.optimization_toggle(
         type_coercion,
         predicate_pushdown,
         projection_pushdown,
         simplify_expression,
         slice_pushdown,
         false,
+        false,
         true,
         false
       )
-      lf.sink_parquet(
-        path,
-        compression,
-        compression_level,
-        statistics,
-        row_group_size,
-        data_pagesize_limit,
-        maintain_order
-      )
     end

     # Collect a small number of rows for debugging purposes.
@@ -650,6 +913,7 @@ module Polars
       no_optimization: false,
       slice_pushdown: true,
       common_subplan_elimination: true,
+      comm_subexpr_elim: true,
       allow_streaming: false
     )
       if no_optimization
@@ -666,6 +930,7 @@ module Polars
         simplify_expression,
         slice_pushdown,
         common_subplan_elimination,
+        comm_subexpr_elim,
         allow_streaming,
         false
       )
@@ -699,6 +964,10 @@ module Polars
       _from_rbldf(_ldf.cache)
     end

+    # TODO
+    # def cast
+    # end
+
     # Create an empty copy of the current LazyFrame.
     #
     # The copy has an identical schema but no data.
@@ -706,14 +975,14 @@ module Polars
     # @return [LazyFrame]
     #
     # @example
-    #
+    #   lf = Polars::LazyFrame.new(
     #     {
     #       "a" => [nil, 2, 3, 4],
     #       "b" => [0.5, nil, 2.5, 13],
     #       "c" => [true, true, false, nil],
     #     }
     #   ).lazy
-    #
+    #   lf.clear.fetch
     # # =>
     # # shape: (0, 3)
     # # ┌─────┬─────┬──────┐
@@ -722,9 +991,23 @@ module Polars
     # # │ i64 ┆ f64 ┆ bool │
     # # ╞═════╪═════╪══════╡
     # # └─────┴─────┴──────┘
-    def cleared
-      DataFrame.new(columns: schema).lazy
-    end
+    #
+    # @example
+    #   lf.clear(2).fetch
+    # # =>
+    # # shape: (2, 3)
+    # # ┌──────┬──────┬──────┐
+    # # │ a ┆ b ┆ c │
+    # # │ --- ┆ --- ┆ --- │
+    # # │ i64 ┆ f64 ┆ bool │
+    # # ╞══════╪══════╪══════╡
+    # # │ null ┆ null ┆ null │
+    # # │ null ┆ null ┆ null │
+    # # └──────┴──────┴──────┘
+    def clear(n = 0)
+      DataFrame.new(columns: schema).clear(n).lazy
+    end
+    alias_method :cleared, :clear

     # Filter the rows in the DataFrame based on a predicate expression.
     #
```
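`cleared` is superseded by `clear(n = 0)`, which can also produce an n-row, all-null frame with the same schema; the old name is kept as an alias. Sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [true, false, true]})

lf.clear.collect     # 0 rows, same schema
lf.clear(2).collect  # 2 rows, every value null
lf.cleared.collect   # alias retained for backwards compatibility
```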
```diff
@@ -774,8 +1057,13 @@ module Polars

     # Select columns from this DataFrame.
     #
-    # @param exprs [
-    #   Column
+    # @param exprs [Array]
+    #   Column(s) to select, specified as positional arguments.
+    #   Accepts expression input. Strings are parsed as column names,
+    #   other non-expression inputs are parsed as literals.
+    # @param named_exprs [Hash]
+    #   Additional columns to select, specified as keyword arguments.
+    #   The columns will be renamed to the keyword used.
     #
     # @return [LazyFrame]
     #
@@ -855,9 +1143,13 @@ module Polars
     # # │ 0 │
     # # │ 10 │
     # # └─────────┘
-    def select(exprs)
-      exprs = Utils.selection_to_rbexpr_list(exprs)
-      _from_rbldf(_ldf.select(exprs))
+    def select(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+
+      rbexprs = Utils.parse_as_list_of_expressions(
+        *exprs, **named_exprs, __structify: structify
+      )
+      _from_rbldf(_ldf.select(rbexprs))
     end

     # Start a group by operation.
```
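`select` now accepts positional expressions plus keyword arguments whose keys become the output column names, mirroring the Python API. Sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [0.5, 4.0, 10.0]})

lf.select(
  "a",                          # strings are parsed as column names
  (Polars.col("b") * 2).alias("b2"),
  b_half: Polars.col("b") / 2   # keyword arguments are renamed to the key
).collect
```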
```diff
@@ -967,7 +1259,7 @@ module Polars
     # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
     #   Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
     # )
-    # df.
+    # df.rolling(index_column: "dt", period: "2d").agg(
     #   [
     #     Polars.sum("a").alias("sum_a"),
     #     Polars.min("a").alias("min_a"),
@@ -988,7 +1280,7 @@ module Polars
     # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
     # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
     # # └─────────────────────┴───────┴───────┴───────┘
-    def
+    def rolling(
       index_column:,
       period:,
       offset: nil,
@@ -1005,12 +1297,13 @@ module Polars
       period = Utils._timedelta_to_pl_duration(period)
       offset = Utils._timedelta_to_pl_duration(offset)

-      lgb = _ldf.
+      lgb = _ldf.rolling(
         index_column, period, offset, closed, rbexprs_by, check_sorted
       )
       LazyGroupBy.new(lgb)
     end
-    alias_method :
+    alias_method :group_by_rolling, :rolling
+    alias_method :groupby_rolling, :rolling

     # Group based on a time value (or index value of type `:i32`, `:i64`).
     #
```
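The rolling (temporal) group-by is now exposed as `rolling`, with `group_by_rolling` and `groupby_rolling` kept as aliases. A sketch adapted from the documentation example in the hunk above; the timestamps are sample data:

```ruby
dates = ["2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-02 18:12:48"]
lf = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5]}).with_column(
  Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
)

lf.rolling(index_column: "dt", period: "2d").agg(
  [Polars.sum("a").alias("sum_a"), Polars.min("a").alias("min_a")]
).collect
```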
```diff
@@ -1234,12 +1527,13 @@ module Polars
     #   closed: "right"
     # ).agg(Polars.col("A").alias("A_agg_list"))
     # # =>
-    # # shape: (
+    # # shape: (4, 4)
     # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
     # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
     # # │ --- ┆ --- ┆ --- ┆ --- │
     # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
     # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+    # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
     # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
     # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
     # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -1440,6 +1734,8 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param join_nulls [Boolean]
+    #   Join on null values. By default null values will never produce matches.
     # @param allow_parallel [Boolean]
     #   Allow the physical plan to optionally evaluate the computation of both
     #   DataFrames up to the join in parallel.
@@ -1535,6 +1831,7 @@ module Polars
       on: nil,
       how: "inner",
       suffix: "_right",
+      join_nulls: false,
       allow_parallel: true,
       force_parallel: false
     )
@@ -1545,7 +1842,7 @@ module Polars
       if how == "cross"
         return _from_rbldf(
           _ldf.join(
-            other._ldf, [], [], allow_parallel, force_parallel, how, suffix
+            other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
           )
         )
       end
@@ -1568,6 +1865,7 @@ module Polars
           rbexprs_right,
           allow_parallel,
           force_parallel,
+          join_nulls,
           how,
           suffix,
         )
```
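`join` gains a `join_nulls` flag so that null keys can be matched explicitly; by default they still never match. Sketch:

```ruby
left = Polars::LazyFrame.new({"id" => [1, nil, 3], "l" => ["a", "b", "c"]})
right = Polars::LazyFrame.new({"id" => [1, nil], "r" => ["x", "y"]})

left.join(right, on: "id", how: "inner", join_nulls: true).collect
```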
```diff
@@ -1598,37 +1896,19 @@ module Polars
     # ).collect
     # # =>
     # # shape: (4, 6)
-    # #
-    # # │ a ┆ b ┆ c ┆ a^2
-    # # │ --- ┆ --- ┆ --- ┆ ---
-    # # │ i64 ┆ f64 ┆ bool ┆
-    # #
-    # # │ 1 ┆ 0.5 ┆ true ┆ 1
-    # # │ 2 ┆ 4.0 ┆ true ┆ 4
-    # # │ 3 ┆ 10.0 ┆ false ┆ 9
-    # # │ 4 ┆ 13.0 ┆ true ┆ 16
-    # #
-    def with_columns(exprs)
-      exprs =
-        if exprs.nil?
-          []
-        elsif exprs.is_a?(Expr)
-          [exprs]
-        else
-          exprs.to_a
-        end
-
-      rbexprs = []
-      exprs.each do |e|
-        case e
-        when Expr
-          rbexprs << e._rbexpr
-        when Series
-          rbexprs << Utils.lit(e)._rbexpr
-        else
-          raise ArgumentError, "Expected an expression, got #{e}"
-        end
-      end
+    # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
+    # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
+    # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+    # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
+    # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
+    # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
+    # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
+    # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
+    # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
+    # # └─────┴──────┴───────┴─────┴──────┴───────┘
+    def with_columns(*exprs, **named_exprs)
+      structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
+      rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)

       _from_rbldf(_ldf.with_columns(rbexprs))
     end
@@ -1690,26 +1970,26 @@ module Polars
     # # ┌─────┬─────┬───────────┐
     # # │ a ┆ b ┆ b_squared │
     # # │ --- ┆ --- ┆ --- │
-    # # │ i64 ┆ i64 ┆
+    # # │ i64 ┆ i64 ┆ i64 │
     # # ╞═════╪═════╪═══════════╡
-    # # │ 1 ┆ 2 ┆ 4
-    # # │ 3 ┆ 4 ┆ 16
-    # # │ 5 ┆ 6 ┆ 36
+    # # │ 1 ┆ 2 ┆ 4 │
+    # # │ 3 ┆ 4 ┆ 16 │
+    # # │ 5 ┆ 6 ┆ 36 │
     # # └─────┴─────┴───────────┘
     #
     # @example
     # df.with_column(Polars.col("a") ** 2).collect
     # # =>
     # # shape: (3, 2)
-    # #
-    # # │ a
-    # # │ ---
-    # # │
-    # #
-    # # │ 1
-    # # │ 9
-    # # │ 25
-    # #
+    # # ┌─────┬─────┐
+    # # │ a ┆ b │
+    # # │ --- ┆ --- │
+    # # │ i64 ┆ i64 │
+    # # ╞═════╪═════╡
+    # # │ 1 ┆ 2 │
+    # # │ 9 ┆ 4 │
+    # # │ 25 ┆ 6 │
+    # # └─────┴─────┘
     def with_column(column)
       with_columns([column])
     end
```
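`with_columns` follows the same positional-plus-keyword pattern as `select`, replacing the old manual Expr/Series handling with `Utils.parse_as_list_of_expressions`. Sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [0.5, 4.0, 10.0]})

lf.with_columns(
  (Polars.col("a") ** 2).alias("a^2"),
  b_half: Polars.col("b") / 2
).collect
```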
```diff
@@ -1721,11 +2001,9 @@ module Polars
     #   - List of column names.
     #
     # @return [LazyFrame]
-    def drop(columns)
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-      _from_rbldf(_ldf.drop_columns(columns))
+    def drop(*columns)
+      drop_cols = Utils._expand_selectors(self, *columns)
+      _from_rbldf(_ldf.drop(drop_cols))
     end

     # Rename column names.
```
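`drop` now takes a splat and routes it through `Utils._expand_selectors`, so several columns can be dropped in one call. Sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1], "b" => [2], "c" => [3]})

lf.drop("a", "c").collect
```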
```diff
@@ -1955,7 +2233,7 @@ module Polars
     # "b" => [2, 4, 6]
     # }
     # ).lazy
-    # df.
+    # df.with_row_index.collect
     # # =>
     # # shape: (3, 3)
     # # ┌────────┬─────┬─────┐
@@ -1967,9 +2245,10 @@ module Polars
     # # │ 1 ┆ 3 ┆ 4 │
     # # │ 2 ┆ 5 ┆ 6 │
     # # └────────┴─────┴─────┘
-    def
-      _from_rbldf(_ldf.
+    def with_row_index(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_index(name, offset))
     end
+    alias_method :with_row_count, :with_row_index

     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
```
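`with_row_count` is renamed to `with_row_index` (the old name remains as an alias), with a configurable column name and offset. Sketch:

```ruby
lf = Polars::LazyFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]})

lf.with_row_index(name: "idx", offset: 1).collect
lf.with_row_count.collect  # alias still works
```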
```diff
@@ -2470,9 +2749,47 @@ module Polars
       _from_rbldf(_ldf.unnest(names))
     end

-    #
-    #
-    #
+    # Take two sorted DataFrames and merge them by the sorted key.
+    #
+    # The output of this operation will also be sorted.
+    # It is the callers responsibility that the frames are sorted
+    # by that key otherwise the output will not make sense.
+    #
+    # The schemas of both LazyFrames must be equal.
+    #
+    # @param other [DataFrame]
+    #   Other DataFrame that must be merged
+    # @param key [String]
+    #   Key that is sorted.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df0 = Polars::LazyFrame.new(
+    #     {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
+    #   ).sort("age")
+    #   df1 = Polars::LazyFrame.new(
+    #     {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
+    #   ).sort("age")
+    #   df0.merge_sorted(df1, "age").collect
+    # # =>
+    # # shape: (7, 2)
+    # # ┌────────┬─────┐
+    # # │ name ┆ age │
+    # # │ --- ┆ --- │
+    # # │ str ┆ i64 │
+    # # ╞════════╪═════╡
+    # # │ bob ┆ 18 │
+    # # │ thomas ┆ 20 │
+    # # │ anna ┆ 21 │
+    # # │ megan ┆ 33 │
+    # # │ steve ┆ 42 │
+    # # │ steve ┆ 42 │
+    # # │ elise ┆ 44 │
+    # # └────────┴─────┘
+    def merge_sorted(other, key)
+      _from_rbldf(_ldf.merge_sorted(other._ldf, key))
+    end

     # Indicate that one or multiple columns are sorted.
     #
```
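`merge_sorted` is an ordered merge rather than a join: both frames must already be sorted on `key` and share a schema, as the explicit `.sort("age")` calls in the documentation example show. A minimal sketch with made-up numeric keys:

```ruby
a = Polars::LazyFrame.new({"t" => [1, 3, 5], "v" => ["a", "c", "e"]}).sort("t")
b = Polars::LazyFrame.new({"t" => [2, 4, 6], "v" => ["b", "d", "f"]}).sort("t")

# Rows come out ordered by "t": 1, 2, 3, 4, 5, 6.
a.merge_sorted(b, "t").collect
```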