polars-df 0.8.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +3 -2
  7. data/ext/polars/Cargo.toml +18 -8
  8. data/ext/polars/src/batched_csv.rs +7 -5
  9. data/ext/polars/src/conversion/anyvalue.rs +186 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +273 -342
  12. data/ext/polars/src/dataframe.rs +108 -66
  13. data/ext/polars/src/expr/array.rs +78 -0
  14. data/ext/polars/src/expr/datetime.rs +29 -58
  15. data/ext/polars/src/expr/general.rs +83 -36
  16. data/ext/polars/src/expr/list.rs +58 -6
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +62 -11
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/file.rs +158 -11
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +120 -50
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/string_cache.rs +14 -0
  26. data/ext/polars/src/functions/whenthen.rs +47 -17
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +195 -40
  28. data/ext/polars/src/lib.rs +246 -179
  29. data/ext/polars/src/map/dataframe.rs +17 -9
  30. data/ext/polars/src/series/aggregation.rs +20 -0
  31. data/ext/polars/src/series/mod.rs +35 -4
  32. data/lib/polars/array_expr.rb +453 -0
  33. data/lib/polars/array_name_space.rb +346 -0
  34. data/lib/polars/batched_csv_reader.rb +4 -2
  35. data/lib/polars/cat_expr.rb +24 -0
  36. data/lib/polars/cat_name_space.rb +75 -0
  37. data/lib/polars/config.rb +2 -2
  38. data/lib/polars/data_frame.rb +306 -96
  39. data/lib/polars/data_types.rb +191 -28
  40. data/lib/polars/date_time_expr.rb +41 -18
  41. data/lib/polars/date_time_name_space.rb +9 -3
  42. data/lib/polars/exceptions.rb +12 -1
  43. data/lib/polars/expr.rb +898 -215
  44. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  45. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  46. data/lib/polars/functions/as_datatype.rb +248 -0
  47. data/lib/polars/functions/col.rb +47 -0
  48. data/lib/polars/functions/eager.rb +182 -0
  49. data/lib/polars/functions/lazy.rb +1280 -0
  50. data/lib/polars/functions/len.rb +49 -0
  51. data/lib/polars/functions/lit.rb +35 -0
  52. data/lib/polars/functions/random.rb +16 -0
  53. data/lib/polars/functions/range/date_range.rb +103 -0
  54. data/lib/polars/functions/range/int_range.rb +51 -0
  55. data/lib/polars/functions/repeat.rb +144 -0
  56. data/lib/polars/functions/whenthen.rb +96 -0
  57. data/lib/polars/functions.rb +29 -416
  58. data/lib/polars/group_by.rb +2 -2
  59. data/lib/polars/io.rb +36 -31
  60. data/lib/polars/lazy_frame.rb +405 -88
  61. data/lib/polars/list_expr.rb +158 -8
  62. data/lib/polars/list_name_space.rb +102 -0
  63. data/lib/polars/meta_expr.rb +175 -7
  64. data/lib/polars/series.rb +282 -41
  65. data/lib/polars/string_cache.rb +75 -0
  66. data/lib/polars/string_expr.rb +413 -96
  67. data/lib/polars/string_name_space.rb +4 -4
  68. data/lib/polars/testing.rb +507 -0
  69. data/lib/polars/utils.rb +106 -8
  70. data/lib/polars/version.rb +1 -1
  71. data/lib/polars/whenthen.rb +83 -0
  72. data/lib/polars.rb +16 -4
  73. metadata +37 -8
  74. data/lib/polars/lazy_functions.rb +0 -1181
  75. data/lib/polars/when.rb +0 -16
  76. data/lib/polars/when_then.rb +0 -19
@@ -49,7 +49,8 @@ module Polars
49
49
  row_count_name: nil,
50
50
  row_count_offset: 0,
51
51
  parse_dates: false,
52
- eol_char: "\n"
52
+ eol_char: "\n",
53
+ truncate_ragged_lines: true
53
54
  )
54
55
  dtype_list = nil
55
56
  if !dtypes.nil?
@@ -81,7 +82,8 @@ module Polars
81
82
  encoding,
82
83
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
83
84
  parse_dates,
84
- eol_char
85
+ eol_char,
86
+ truncate_ragged_lines
85
87
  )
86
88
  )
87
89
  end
@@ -103,6 +105,7 @@ module Polars
103
105
  _from_rbldf(
104
106
  RbLazyFrame.new_from_parquet(
105
107
  file,
108
+ [],
106
109
  n_rows,
107
110
  cache,
108
111
  parallel,
@@ -110,7 +113,8 @@ module Polars
110
113
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
111
114
  low_memory,
112
115
  use_statistics,
113
- hive_partitioning
116
+ hive_partitioning,
117
+ nil
114
118
  )
115
119
  )
116
120
  end
@@ -308,7 +312,7 @@ module Polars
308
312
  # end
309
313
  #
310
314
  # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
311
- # df.pipe(cast_str_to_int, col_name: "b").collect()
315
+ # df.pipe(cast_str_to_int, col_name: "b").collect
312
316
  # # =>
313
317
  # # shape: (4, 2)
314
318
  # # ┌─────┬─────┐
@@ -342,6 +346,7 @@ module Polars
342
346
  simplify_expression: true,
343
347
  slice_pushdown: true,
344
348
  common_subplan_elimination: true,
349
+ comm_subexpr_elim: true,
345
350
  allow_streaming: false
346
351
  )
347
352
  ldf = _ldf.optimization_toggle(
@@ -351,6 +356,7 @@ module Polars
351
356
  simplify_expression,
352
357
  slice_pushdown,
353
358
  common_subplan_elimination,
359
+ comm_subexpr_elim,
354
360
  allow_streaming,
355
361
  false
356
362
  )
@@ -398,16 +404,16 @@ module Polars
398
404
  # # │ 2 ┆ 7.0 ┆ b │
399
405
  # # │ 1 ┆ 6.0 ┆ a │
400
406
  # # └─────┴─────┴─────┘
401
- def sort(by, reverse: false, nulls_last: false, maintain_order: false)
407
+ def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
402
408
  if by.is_a?(::String)
403
- return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
409
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
404
410
  end
405
411
  if Utils.bool?(reverse)
406
412
  reverse = [reverse]
407
413
  end
408
414
 
409
415
  by = Utils.selection_to_rbexpr_list(by)
410
- _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
416
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
411
417
  end
412
418
 
413
419
  # def profile
@@ -469,6 +475,7 @@ module Polars
469
475
  no_optimization: false,
470
476
  slice_pushdown: true,
471
477
  common_subplan_elimination: true,
478
+ comm_subexpr_elim: true,
472
479
  allow_streaming: false,
473
480
  _eager: false
474
481
  )
@@ -477,6 +484,7 @@ module Polars
477
484
  projection_pushdown = false
478
485
  slice_pushdown = false
479
486
  common_subplan_elimination = false
487
+ comm_subexpr_elim = false
480
488
  end
481
489
 
482
490
  if allow_streaming
@@ -490,6 +498,7 @@ module Polars
490
498
  simplify_expression,
491
499
  slice_pushdown,
492
500
  common_subplan_elimination,
501
+ comm_subexpr_elim,
493
502
  allow_streaming,
494
503
  _eager
495
504
  )
@@ -559,6 +568,268 @@ module Polars
559
568
  simplify_expression: true,
560
569
  no_optimization: false,
561
570
  slice_pushdown: true
571
+ )
572
+ lf = _set_sink_optimizations(
573
+ type_coercion: type_coercion,
574
+ predicate_pushdown: predicate_pushdown,
575
+ projection_pushdown: projection_pushdown,
576
+ simplify_expression: simplify_expression,
577
+ slice_pushdown: slice_pushdown,
578
+ no_optimization: no_optimization
579
+ )
580
+
581
+ lf.sink_parquet(
582
+ path,
583
+ compression,
584
+ compression_level,
585
+ statistics,
586
+ row_group_size,
587
+ data_pagesize_limit,
588
+ maintain_order
589
+ )
590
+ end
591
+
592
+ # Evaluate the query in streaming mode and write to an IPC file.
593
+ #
594
+ # This allows streaming results that are larger than RAM to be written to disk.
595
+ #
596
+ # @param path [String]
597
+ # File path to which the file should be written.
598
+ # @param compression ["lz4", "zstd"]
599
+ # Choose "zstd" for good compression performance.
600
+ # Choose "lz4" for fast compression/decompression.
601
+ # @param maintain_order [Boolean]
602
+ # Maintain the order in which data is processed.
603
+ # Setting this to `false` will be slightly faster.
604
+ # @param type_coercion [Boolean]
605
+ # Do type coercion optimization.
606
+ # @param predicate_pushdown [Boolean]
607
+ # Do predicate pushdown optimization.
608
+ # @param projection_pushdown [Boolean]
609
+ # Do projection pushdown optimization.
610
+ # @param simplify_expression [Boolean]
611
+ # Run simplify expressions optimization.
612
+ # @param slice_pushdown [Boolean]
613
+ # Slice pushdown optimization.
614
+ # @param no_optimization [Boolean]
615
+ # Turn off (certain) optimizations.
616
+ #
617
+ # @return [DataFrame]
618
+ #
619
+ # @example
620
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
621
+ # lf.sink_ipc("out.arrow")
622
+ def sink_ipc(
623
+ path,
624
+ compression: "zstd",
625
+ maintain_order: true,
626
+ type_coercion: true,
627
+ predicate_pushdown: true,
628
+ projection_pushdown: true,
629
+ simplify_expression: true,
630
+ slice_pushdown: true,
631
+ no_optimization: false
632
+ )
633
+ lf = _set_sink_optimizations(
634
+ type_coercion: type_coercion,
635
+ predicate_pushdown: predicate_pushdown,
636
+ projection_pushdown: projection_pushdown,
637
+ simplify_expression: simplify_expression,
638
+ slice_pushdown: slice_pushdown,
639
+ no_optimization: no_optimization
640
+ )
641
+
642
+ lf.sink_ipc(
643
+ path,
644
+ compression,
645
+ maintain_order
646
+ )
647
+ end
648
+
649
+ # Evaluate the query in streaming mode and write to a CSV file.
650
+ #
651
+ # This allows streaming results that are larger than RAM to be written to disk.
652
+ #
653
+ # @param path [String]
654
+ # File path to which the file should be written.
655
+ # @param include_bom [Boolean]
656
+ # Whether to include UTF-8 BOM in the CSV output.
657
+ # @param include_header [Boolean]
658
+ # Whether to include header in the CSV output.
659
+ # @param separator [String]
660
+ # Separate CSV fields with this symbol.
661
+ # @param line_terminator [String]
662
+ # String used to end each row.
663
+ # @param quote_char [String]
664
+ # Byte to use as quoting character.
665
+ # @param batch_size [Integer]
666
+ # Number of rows that will be processed per thread.
667
+ # @param datetime_format [String]
668
+ # A format string, with the specifiers defined by the
669
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
670
+ # Rust crate. If no format specified, the default fractional-second
671
+ # precision is inferred from the maximum timeunit found in the frame's
672
+ # Datetime cols (if any).
673
+ # @param date_format [String]
674
+ # A format string, with the specifiers defined by the
675
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
676
+ # Rust crate.
677
+ # @param time_format [String]
678
+ # A format string, with the specifiers defined by the
679
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
680
+ # Rust crate.
681
+ # @param float_precision [Integer]
682
+ # Number of decimal places to write, applied to both `Float32` and
683
+ # `Float64` datatypes.
684
+ # @param null_value [String]
685
+ # A string representing null values (defaulting to the empty string).
686
+ # @param quote_style ["necessary", "always", "non_numeric", "never"]
687
+ # Determines the quoting strategy used.
688
+ #
689
+ # - necessary (default): This puts quotes around fields only when necessary.
690
+ # They are necessary when fields contain a quote,
691
+ # delimiter or record terminator.
692
+ # Quotes are also necessary when writing an empty record
693
+ # (which is indistinguishable from a record with one empty field).
694
+ # This is the default.
695
+ # - always: This puts quotes around every field. Always.
696
+ # - never: This never puts quotes around fields, even if that results in
697
+ # invalid CSV data (e.g.: by not quoting strings containing the
698
+ # separator).
699
+ # - non_numeric: This puts quotes around all fields that are non-numeric.
700
+ # Namely, when writing a field that does not parse as a valid float
701
+ # or integer, then quotes will be used even if they aren't strictly
702
+ # necessary.
703
+ # @param maintain_order [Boolean]
704
+ # Maintain the order in which data is processed.
705
+ # Setting this to `false` will be slightly faster.
706
+ # @param type_coercion [Boolean]
707
+ # Do type coercion optimization.
708
+ # @param predicate_pushdown [Boolean]
709
+ # Do predicate pushdown optimization.
710
+ # @param projection_pushdown [Boolean]
711
+ # Do projection pushdown optimization.
712
+ # @param simplify_expression [Boolean]
713
+ # Run simplify expressions optimization.
714
+ # @param slice_pushdown [Boolean]
715
+ # Slice pushdown optimization.
716
+ # @param no_optimization [Boolean]
717
+ # Turn off (certain) optimizations.
718
+ #
719
+ # @return [DataFrame]
720
+ #
721
+ # @example
722
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
723
+ # lf.sink_csv("out.csv")
724
+ def sink_csv(
725
+ path,
726
+ include_bom: false,
727
+ include_header: true,
728
+ separator: ",",
729
+ line_terminator: "\n",
730
+ quote_char: '"',
731
+ batch_size: 1024,
732
+ datetime_format: nil,
733
+ date_format: nil,
734
+ time_format: nil,
735
+ float_precision: nil,
736
+ null_value: nil,
737
+ quote_style: nil,
738
+ maintain_order: true,
739
+ type_coercion: true,
740
+ predicate_pushdown: true,
741
+ projection_pushdown: true,
742
+ simplify_expression: true,
743
+ slice_pushdown: true,
744
+ no_optimization: false
745
+ )
746
+ Utils._check_arg_is_1byte("separator", separator, false)
747
+ Utils._check_arg_is_1byte("quote_char", quote_char, false)
748
+
749
+ lf = _set_sink_optimizations(
750
+ type_coercion: type_coercion,
751
+ predicate_pushdown: predicate_pushdown,
752
+ projection_pushdown: projection_pushdown,
753
+ simplify_expression: simplify_expression,
754
+ slice_pushdown: slice_pushdown,
755
+ no_optimization: no_optimization
756
+ )
757
+
758
+ lf.sink_csv(
759
+ path,
760
+ include_bom,
761
+ include_header,
762
+ separator.ord,
763
+ line_terminator,
764
+ quote_char.ord,
765
+ batch_size,
766
+ datetime_format,
767
+ date_format,
768
+ time_format,
769
+ float_precision,
770
+ null_value,
771
+ quote_style,
772
+ maintain_order
773
+ )
774
+ end
775
+
776
+ # Evaluate the query in streaming mode and write to an NDJSON file.
777
+ #
778
+ # This allows streaming results that are larger than RAM to be written to disk.
779
+ #
780
+ # @param path [String]
781
+ # File path to which the file should be written.
782
+ # @param maintain_order [Boolean]
783
+ # Maintain the order in which data is processed.
784
+ # Setting this to `false` will be slightly faster.
785
+ # @param type_coercion [Boolean]
786
+ # Do type coercion optimization.
787
+ # @param predicate_pushdown [Boolean]
788
+ # Do predicate pushdown optimization.
789
+ # @param projection_pushdown [Boolean]
790
+ # Do projection pushdown optimization.
791
+ # @param simplify_expression [Boolean]
792
+ # Run simplify expressions optimization.
793
+ # @param slice_pushdown [Boolean]
794
+ # Slice pushdown optimization.
795
+ # @param no_optimization [Boolean]
796
+ # Turn off (certain) optimizations.
797
+ #
798
+ # @return [DataFrame]
799
+ #
800
+ # @example
801
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
802
+ # lf.sink_ndjson("out.ndjson")
803
+ def sink_ndjson(
804
+ path,
805
+ maintain_order: true,
806
+ type_coercion: true,
807
+ predicate_pushdown: true,
808
+ projection_pushdown: true,
809
+ simplify_expression: true,
810
+ slice_pushdown: true,
811
+ no_optimization: false
812
+ )
813
+ lf = _set_sink_optimizations(
814
+ type_coercion: type_coercion,
815
+ predicate_pushdown: predicate_pushdown,
816
+ projection_pushdown: projection_pushdown,
817
+ simplify_expression: simplify_expression,
818
+ slice_pushdown: slice_pushdown,
819
+ no_optimization: no_optimization
820
+ )
821
+
822
+ lf.sink_json(path, maintain_order)
823
+ end
824
+
825
+ # @private
826
+ def _set_sink_optimizations(
827
+ type_coercion: true,
828
+ predicate_pushdown: true,
829
+ projection_pushdown: true,
830
+ simplify_expression: true,
831
+ slice_pushdown: true,
832
+ no_optimization: false
562
833
  )
563
834
  if no_optimization
564
835
  predicate_pushdown = false
@@ -566,25 +837,17 @@ module Polars
566
837
  slice_pushdown = false
567
838
  end
568
839
 
569
- lf = _ldf.optimization_toggle(
840
+ _ldf.optimization_toggle(
570
841
  type_coercion,
571
842
  predicate_pushdown,
572
843
  projection_pushdown,
573
844
  simplify_expression,
574
845
  slice_pushdown,
575
846
  false,
847
+ false,
576
848
  true,
577
849
  false
578
850
  )
579
- lf.sink_parquet(
580
- path,
581
- compression,
582
- compression_level,
583
- statistics,
584
- row_group_size,
585
- data_pagesize_limit,
586
- maintain_order
587
- )
588
851
  end
589
852
 
590
853
  # Collect a small number of rows for debugging purposes.
@@ -650,6 +913,7 @@ module Polars
650
913
  no_optimization: false,
651
914
  slice_pushdown: true,
652
915
  common_subplan_elimination: true,
916
+ comm_subexpr_elim: true,
653
917
  allow_streaming: false
654
918
  )
655
919
  if no_optimization
@@ -666,6 +930,7 @@ module Polars
666
930
  simplify_expression,
667
931
  slice_pushdown,
668
932
  common_subplan_elimination,
933
+ comm_subexpr_elim,
669
934
  allow_streaming,
670
935
  false
671
936
  )
@@ -699,6 +964,10 @@ module Polars
699
964
  _from_rbldf(_ldf.cache)
700
965
  end
701
966
 
967
+ # TODO
968
+ # def cast
969
+ # end
970
+
702
971
  # Create an empty copy of the current LazyFrame.
703
972
  #
704
973
  # The copy has an identical schema but no data.
@@ -706,14 +975,14 @@ module Polars
706
975
  # @return [LazyFrame]
707
976
  #
708
977
  # @example
709
- # df = Polars::DataFrame.new(
978
+ # lf = Polars::LazyFrame.new(
710
979
  # {
711
980
  # "a" => [nil, 2, 3, 4],
712
981
  # "b" => [0.5, nil, 2.5, 13],
713
982
  # "c" => [true, true, false, nil],
714
983
  # }
715
984
  # ).lazy
716
- # df.cleared.fetch
985
+ # lf.clear.fetch
717
986
  # # =>
718
987
  # # shape: (0, 3)
719
988
  # # ┌─────┬─────┬──────┐
@@ -722,9 +991,23 @@ module Polars
722
991
  # # │ i64 ┆ f64 ┆ bool │
723
992
  # # ╞═════╪═════╪══════╡
724
993
  # # └─────┴─────┴──────┘
725
- def cleared
726
- DataFrame.new(columns: schema).lazy
727
- end
994
+ #
995
+ # @example
996
+ # lf.clear(2).fetch
997
+ # # =>
998
+ # # shape: (2, 3)
999
+ # # ┌──────┬──────┬──────┐
1000
+ # # │ a ┆ b ┆ c │
1001
+ # # │ --- ┆ --- ┆ --- │
1002
+ # # │ i64 ┆ f64 ┆ bool │
1003
+ # # ╞══════╪══════╪══════╡
1004
+ # # │ null ┆ null ┆ null │
1005
+ # # │ null ┆ null ┆ null │
1006
+ # # └──────┴──────┴──────┘
1007
+ def clear(n = 0)
1008
+ DataFrame.new(columns: schema).clear(n).lazy
1009
+ end
1010
+ alias_method :cleared, :clear
728
1011
 
729
1012
  # Filter the rows in the DataFrame based on a predicate expression.
730
1013
  #
@@ -774,8 +1057,13 @@ module Polars
774
1057
 
775
1058
  # Select columns from this DataFrame.
776
1059
  #
777
- # @param exprs [Object]
778
- # Column or columns to select.
1060
+ # @param exprs [Array]
1061
+ # Column(s) to select, specified as positional arguments.
1062
+ # Accepts expression input. Strings are parsed as column names,
1063
+ # other non-expression inputs are parsed as literals.
1064
+ # @param named_exprs [Hash]
1065
+ # Additional columns to select, specified as keyword arguments.
1066
+ # The columns will be renamed to the keyword used.
779
1067
  #
780
1068
  # @return [LazyFrame]
781
1069
  #
@@ -855,9 +1143,13 @@ module Polars
855
1143
  # # │ 0 │
856
1144
  # # │ 10 │
857
1145
  # # └─────────┘
858
- def select(exprs)
859
- exprs = Utils.selection_to_rbexpr_list(exprs)
860
- _from_rbldf(_ldf.select(exprs))
1146
+ def select(*exprs, **named_exprs)
1147
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1148
+
1149
+ rbexprs = Utils.parse_as_list_of_expressions(
1150
+ *exprs, **named_exprs, __structify: structify
1151
+ )
1152
+ _from_rbldf(_ldf.select(rbexprs))
861
1153
  end
862
1154
 
863
1155
  # Start a group by operation.
@@ -967,7 +1259,7 @@ module Polars
967
1259
  # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
968
1260
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
969
1261
  # )
970
- # df.group_by_rolling(index_column: "dt", period: "2d").agg(
1262
+ # df.rolling(index_column: "dt", period: "2d").agg(
971
1263
  # [
972
1264
  # Polars.sum("a").alias("sum_a"),
973
1265
  # Polars.min("a").alias("min_a"),
@@ -988,7 +1280,7 @@ module Polars
988
1280
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
989
1281
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
990
1282
  # # └─────────────────────┴───────┴───────┴───────┘
991
- def group_by_rolling(
1283
+ def rolling(
992
1284
  index_column:,
993
1285
  period:,
994
1286
  offset: nil,
@@ -1005,12 +1297,13 @@ module Polars
1005
1297
  period = Utils._timedelta_to_pl_duration(period)
1006
1298
  offset = Utils._timedelta_to_pl_duration(offset)
1007
1299
 
1008
- lgb = _ldf.group_by_rolling(
1300
+ lgb = _ldf.rolling(
1009
1301
  index_column, period, offset, closed, rbexprs_by, check_sorted
1010
1302
  )
1011
1303
  LazyGroupBy.new(lgb)
1012
1304
  end
1013
- alias_method :groupby_rolling, :group_by_rolling
1305
+ alias_method :group_by_rolling, :rolling
1306
+ alias_method :groupby_rolling, :rolling
1014
1307
 
1015
1308
  # Group based on a time value (or index value of type `:i32`, `:i64`).
1016
1309
  #
@@ -1234,12 +1527,13 @@ module Polars
1234
1527
  # closed: "right"
1235
1528
  # ).agg(Polars.col("A").alias("A_agg_list"))
1236
1529
  # # =>
1237
- # # shape: (3, 4)
1530
+ # # shape: (4, 4)
1238
1531
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1239
1532
  # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1240
1533
  # # │ --- ┆ --- ┆ --- ┆ --- │
1241
1534
  # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1242
1535
  # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1536
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
1243
1537
  # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1244
1538
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1245
1539
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -1440,6 +1734,8 @@ module Polars
1440
1734
  # Join strategy.
1441
1735
  # @param suffix [String]
1442
1736
  # Suffix to append to columns with a duplicate name.
1737
+ # @param join_nulls [Boolean]
1738
+ # Join on null values. By default null values will never produce matches.
1443
1739
  # @param allow_parallel [Boolean]
1444
1740
  # Allow the physical plan to optionally evaluate the computation of both
1445
1741
  # DataFrames up to the join in parallel.
@@ -1535,6 +1831,7 @@ module Polars
1535
1831
  on: nil,
1536
1832
  how: "inner",
1537
1833
  suffix: "_right",
1834
+ join_nulls: false,
1538
1835
  allow_parallel: true,
1539
1836
  force_parallel: false
1540
1837
  )
@@ -1545,7 +1842,7 @@ module Polars
1545
1842
  if how == "cross"
1546
1843
  return _from_rbldf(
1547
1844
  _ldf.join(
1548
- other._ldf, [], [], allow_parallel, force_parallel, how, suffix
1845
+ other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
1549
1846
  )
1550
1847
  )
1551
1848
  end
@@ -1568,6 +1865,7 @@ module Polars
1568
1865
  rbexprs_right,
1569
1866
  allow_parallel,
1570
1867
  force_parallel,
1868
+ join_nulls,
1571
1869
  how,
1572
1870
  suffix,
1573
1871
  )
@@ -1598,37 +1896,19 @@ module Polars
1598
1896
  # ).collect
1599
1897
  # # =>
1600
1898
  # # shape: (4, 6)
1601
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
1602
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1603
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1604
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
1605
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
1606
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
1607
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
1608
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
1609
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
1610
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
1611
- def with_columns(exprs)
1612
- exprs =
1613
- if exprs.nil?
1614
- []
1615
- elsif exprs.is_a?(Expr)
1616
- [exprs]
1617
- else
1618
- exprs.to_a
1619
- end
1620
-
1621
- rbexprs = []
1622
- exprs.each do |e|
1623
- case e
1624
- when Expr
1625
- rbexprs << e._rbexpr
1626
- when Series
1627
- rbexprs << Utils.lit(e)._rbexpr
1628
- else
1629
- raise ArgumentError, "Expected an expression, got #{e}"
1630
- end
1631
- end
1899
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
1900
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1901
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1902
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
1903
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
1904
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
1905
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
1906
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
1907
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
1908
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
1909
+ def with_columns(*exprs, **named_exprs)
1910
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1911
+ rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1632
1912
 
1633
1913
  _from_rbldf(_ldf.with_columns(rbexprs))
1634
1914
  end
@@ -1690,26 +1970,26 @@ module Polars
1690
1970
  # # ┌─────┬─────┬───────────┐
1691
1971
  # # │ a ┆ b ┆ b_squared │
1692
1972
  # # │ --- ┆ --- ┆ --- │
1693
- # # │ i64 ┆ i64 ┆ f64
1973
+ # # │ i64 ┆ i64 ┆ i64
1694
1974
  # # ╞═════╪═════╪═══════════╡
1695
- # # │ 1 ┆ 2 ┆ 4.0
1696
- # # │ 3 ┆ 4 ┆ 16.0
1697
- # # │ 5 ┆ 6 ┆ 36.0
1975
+ # # │ 1 ┆ 2 ┆ 4
1976
+ # # │ 3 ┆ 4 ┆ 16
1977
+ # # │ 5 ┆ 6 ┆ 36
1698
1978
  # # └─────┴─────┴───────────┘
1699
1979
  #
1700
1980
  # @example
1701
1981
  # df.with_column(Polars.col("a") ** 2).collect
1702
1982
  # # =>
1703
1983
  # # shape: (3, 2)
1704
- # # ┌──────┬─────┐
1705
- # # │ a ┆ b │
1706
- # # │ --- ┆ --- │
1707
- # # │ f64 ┆ i64 │
1708
- # # ╞══════╪═════╡
1709
- # # │ 1.0 ┆ 2 │
1710
- # # │ 9.0 ┆ 4 │
1711
- # # │ 25.0 ┆ 6 │
1712
- # # └──────┴─────┘
1984
+ # # ┌─────┬─────┐
1985
+ # # │ a ┆ b │
1986
+ # # │ --- ┆ --- │
1987
+ # # │ i64 ┆ i64 │
1988
+ # # ╞═════╪═════╡
1989
+ # # │ 1 ┆ 2 │
1990
+ # # │ 9 ┆ 4 │
1991
+ # # │ 25 ┆ 6 │
1992
+ # # └─────┴─────┘
1713
1993
  def with_column(column)
1714
1994
  with_columns([column])
1715
1995
  end
@@ -1721,11 +2001,9 @@ module Polars
1721
2001
  # - List of column names.
1722
2002
  #
1723
2003
  # @return [LazyFrame]
1724
- def drop(columns)
1725
- if columns.is_a?(::String)
1726
- columns = [columns]
1727
- end
1728
- _from_rbldf(_ldf.drop_columns(columns))
2004
+ def drop(*columns)
2005
+ drop_cols = Utils._expand_selectors(self, *columns)
2006
+ _from_rbldf(_ldf.drop(drop_cols))
1729
2007
  end
1730
2008
 
1731
2009
  # Rename column names.
@@ -1955,7 +2233,7 @@ module Polars
1955
2233
  # "b" => [2, 4, 6]
1956
2234
  # }
1957
2235
  # ).lazy
1958
- # df.with_row_count.collect
2236
+ # df.with_row_index.collect
1959
2237
  # # =>
1960
2238
  # # shape: (3, 3)
1961
2239
  # # ┌────────┬─────┬─────┐
@@ -1967,9 +2245,10 @@ module Polars
1967
2245
  # # │ 1 ┆ 3 ┆ 4 │
1968
2246
  # # │ 2 ┆ 5 ┆ 6 │
1969
2247
  # # └────────┴─────┴─────┘
1970
- def with_row_count(name: "row_nr", offset: 0)
1971
- _from_rbldf(_ldf.with_row_count(name, offset))
2248
+ def with_row_index(name: "row_nr", offset: 0)
2249
+ _from_rbldf(_ldf.with_row_index(name, offset))
1972
2250
  end
2251
+ alias_method :with_row_count, :with_row_index
1973
2252
 
1974
2253
  # Take every nth row in the LazyFrame and return as a new LazyFrame.
1975
2254
  #
@@ -2470,9 +2749,47 @@ module Polars
2470
2749
  _from_rbldf(_ldf.unnest(names))
2471
2750
  end
2472
2751
 
2473
- # TODO
2474
- # def merge_sorted
2475
- # end
2752
+ # Take two sorted LazyFrames and merge them by the sorted key.
2753
+ #
2754
+ # The output of this operation will also be sorted.
2755
+ # It is the caller's responsibility that the frames are sorted
2756
+ # by that key; otherwise the output will not make sense.
2757
+ #
2758
+ # The schemas of both LazyFrames must be equal.
2759
+ #
2760
+ # @param other [LazyFrame]
2761
+ # Other LazyFrame that must be merged.
2762
+ # @param key [String]
2763
+ # Key that is sorted.
2764
+ #
2765
+ # @return [LazyFrame]
2766
+ #
2767
+ # @example
2768
+ # df0 = Polars::LazyFrame.new(
2769
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
2770
+ # ).sort("age")
2771
+ # df1 = Polars::LazyFrame.new(
2772
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
2773
+ # ).sort("age")
2774
+ # df0.merge_sorted(df1, "age").collect
2775
+ # # =>
2776
+ # # shape: (7, 2)
2777
+ # # ┌────────┬─────┐
2778
+ # # │ name ┆ age │
2779
+ # # │ --- ┆ --- │
2780
+ # # │ str ┆ i64 │
2781
+ # # ╞════════╪═════╡
2782
+ # # │ bob ┆ 18 │
2783
+ # # │ thomas ┆ 20 │
2784
+ # # │ anna ┆ 21 │
2785
+ # # │ megan ┆ 33 │
2786
+ # # │ steve ┆ 42 │
2787
+ # # │ steve ┆ 42 │
2788
+ # # │ elise ┆ 44 │
2789
+ # # └────────┴─────┘
2790
+ def merge_sorted(other, key)
2791
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
2792
+ end
2476
2793
 
2477
2794
  # Indicate that one or multiple columns are sorted.
2478
2795
  #