polars-df 0.8.0-aarch64-linux → 0.10.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/3.3/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -49,7 +49,8 @@ module Polars
49
49
  row_count_name: nil,
50
50
  row_count_offset: 0,
51
51
  parse_dates: false,
52
- eol_char: "\n"
52
+ eol_char: "\n",
53
+ truncate_ragged_lines: true
53
54
  )
54
55
  dtype_list = nil
55
56
  if !dtypes.nil?
@@ -81,7 +82,8 @@ module Polars
81
82
  encoding,
82
83
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
83
84
  parse_dates,
84
- eol_char
85
+ eol_char,
86
+ truncate_ragged_lines
85
87
  )
86
88
  )
87
89
  end
@@ -103,6 +105,7 @@ module Polars
103
105
  _from_rbldf(
104
106
  RbLazyFrame.new_from_parquet(
105
107
  file,
108
+ [],
106
109
  n_rows,
107
110
  cache,
108
111
  parallel,
@@ -110,7 +113,8 @@ module Polars
110
113
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
111
114
  low_memory,
112
115
  use_statistics,
113
- hive_partitioning
116
+ hive_partitioning,
117
+ nil
114
118
  )
115
119
  )
116
120
  end
@@ -308,7 +312,7 @@ module Polars
308
312
  # end
309
313
  #
310
314
  # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
311
- # df.pipe(cast_str_to_int, col_name: "b").collect()
315
+ # df.pipe(cast_str_to_int, col_name: "b").collect
312
316
  # # =>
313
317
  # # shape: (4, 2)
314
318
  # # ┌─────┬─────┐
@@ -342,6 +346,7 @@ module Polars
342
346
  simplify_expression: true,
343
347
  slice_pushdown: true,
344
348
  common_subplan_elimination: true,
349
+ comm_subexpr_elim: true,
345
350
  allow_streaming: false
346
351
  )
347
352
  ldf = _ldf.optimization_toggle(
@@ -351,6 +356,7 @@ module Polars
351
356
  simplify_expression,
352
357
  slice_pushdown,
353
358
  common_subplan_elimination,
359
+ comm_subexpr_elim,
354
360
  allow_streaming,
355
361
  false
356
362
  )
@@ -398,16 +404,16 @@ module Polars
398
404
  # # │ 2 ┆ 7.0 ┆ b │
399
405
  # # │ 1 ┆ 6.0 ┆ a │
400
406
  # # └─────┴─────┴─────┘
401
- def sort(by, reverse: false, nulls_last: false, maintain_order: false)
407
+ def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
402
408
  if by.is_a?(::String)
403
- return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
409
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
404
410
  end
405
411
  if Utils.bool?(reverse)
406
412
  reverse = [reverse]
407
413
  end
408
414
 
409
415
  by = Utils.selection_to_rbexpr_list(by)
410
- _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
416
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
411
417
  end
412
418
 
413
419
  # def profile
@@ -469,6 +475,7 @@ module Polars
469
475
  no_optimization: false,
470
476
  slice_pushdown: true,
471
477
  common_subplan_elimination: true,
478
+ comm_subexpr_elim: true,
472
479
  allow_streaming: false,
473
480
  _eager: false
474
481
  )
@@ -477,6 +484,7 @@ module Polars
477
484
  projection_pushdown = false
478
485
  slice_pushdown = false
479
486
  common_subplan_elimination = false
487
+ comm_subexpr_elim = false
480
488
  end
481
489
 
482
490
  if allow_streaming
@@ -490,6 +498,7 @@ module Polars
490
498
  simplify_expression,
491
499
  slice_pushdown,
492
500
  common_subplan_elimination,
501
+ comm_subexpr_elim,
493
502
  allow_streaming,
494
503
  _eager
495
504
  )
@@ -559,6 +568,268 @@ module Polars
559
568
  simplify_expression: true,
560
569
  no_optimization: false,
561
570
  slice_pushdown: true
571
+ )
572
+ lf = _set_sink_optimizations(
573
+ type_coercion: type_coercion,
574
+ predicate_pushdown: predicate_pushdown,
575
+ projection_pushdown: projection_pushdown,
576
+ simplify_expression: simplify_expression,
577
+ slice_pushdown: slice_pushdown,
578
+ no_optimization: no_optimization
579
+ )
580
+
581
+ lf.sink_parquet(
582
+ path,
583
+ compression,
584
+ compression_level,
585
+ statistics,
586
+ row_group_size,
587
+ data_pagesize_limit,
588
+ maintain_order
589
+ )
590
+ end
591
+
592
+ # Evaluate the query in streaming mode and write to an IPC file.
593
+ #
594
+ # This allows streaming results that are larger than RAM to be written to disk.
595
+ #
596
+ # @param path [String]
597
+ # File path to which the file should be written.
598
+ # @param compression ["lz4", "zstd"]
599
+ # Choose "zstd" for good compression performance.
600
+ # Choose "lz4" for fast compression/decompression.
601
+ # @param maintain_order [Boolean]
602
+ # Maintain the order in which data is processed.
603
+ # Setting this to `false` will be slightly faster.
604
+ # @param type_coercion [Boolean]
605
+ # Do type coercion optimization.
606
+ # @param predicate_pushdown [Boolean]
607
+ # Do predicate pushdown optimization.
608
+ # @param projection_pushdown [Boolean]
609
+ # Do projection pushdown optimization.
610
+ # @param simplify_expression [Boolean]
611
+ # Run simplify expressions optimization.
612
+ # @param slice_pushdown [Boolean]
613
+ # Slice pushdown optimization.
614
+ # @param no_optimization [Boolean]
615
+ # Turn off (certain) optimizations.
616
+ #
617
+ # @return [DataFrame]
618
+ #
619
+ # @example
620
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
621
+ # lf.sink_ipc("out.arrow")
622
+ def sink_ipc(
623
+ path,
624
+ compression: "zstd",
625
+ maintain_order: true,
626
+ type_coercion: true,
627
+ predicate_pushdown: true,
628
+ projection_pushdown: true,
629
+ simplify_expression: true,
630
+ slice_pushdown: true,
631
+ no_optimization: false
632
+ )
633
+ lf = _set_sink_optimizations(
634
+ type_coercion: type_coercion,
635
+ predicate_pushdown: predicate_pushdown,
636
+ projection_pushdown: projection_pushdown,
637
+ simplify_expression: simplify_expression,
638
+ slice_pushdown: slice_pushdown,
639
+ no_optimization: no_optimization
640
+ )
641
+
642
+ lf.sink_ipc(
643
+ path,
644
+ compression,
645
+ maintain_order
646
+ )
647
+ end
648
+
649
+ # Evaluate the query in streaming mode and write to a CSV file.
650
+ #
651
+ # This allows streaming results that are larger than RAM to be written to disk.
652
+ #
653
+ # @param path [String]
654
+ # File path to which the file should be written.
655
+ # @param include_bom [Boolean]
656
+ # Whether to include UTF-8 BOM in the CSV output.
657
+ # @param include_header [Boolean]
658
+ # Whether to include header in the CSV output.
659
+ # @param separator [String]
660
+ # Separate CSV fields with this symbol.
661
+ # @param line_terminator [String]
662
+ # String used to end each row.
663
+ # @param quote_char [String]
664
+ # Byte to use as quoting character.
665
+ # @param batch_size [Integer]
666
+ # Number of rows that will be processed per thread.
667
+ # @param datetime_format [String]
668
+ # A format string, with the specifiers defined by the
669
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
670
+ # Rust crate. If no format specified, the default fractional-second
671
+ # precision is inferred from the maximum timeunit found in the frame's
672
+ # Datetime cols (if any).
673
+ # @param date_format [String]
674
+ # A format string, with the specifiers defined by the
675
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
676
+ # Rust crate.
677
+ # @param time_format [String]
678
+ # A format string, with the specifiers defined by the
679
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
680
+ # Rust crate.
681
+ # @param float_precision [Integer]
682
+ # Number of decimal places to write, applied to both `Float32` and
683
+ # `Float64` datatypes.
684
+ # @param null_value [String]
685
+ # A string representing null values (defaulting to the empty string).
686
+ # @param quote_style ["necessary", "always", "non_numeric", "never"]
687
+ # Determines the quoting strategy used.
688
+ #
689
+ # - necessary (default): This puts quotes around fields only when necessary.
690
+ # They are necessary when fields contain a quote,
691
+ # delimiter or record terminator.
692
+ # Quotes are also necessary when writing an empty record
693
+ # (which is indistinguishable from a record with one empty field).
694
+ # This is the default.
695
+ # - always: This puts quotes around every field. Always.
696
+ # - never: This never puts quotes around fields, even if that results in
697
+ # invalid CSV data (e.g.: by not quoting strings containing the
698
+ # separator).
699
+ # - non_numeric: This puts quotes around all fields that are non-numeric.
700
+ # Namely, when writing a field that does not parse as a valid float
701
+ # or integer, then quotes will be used even if they aren't strictly
702
+ # necessary.
703
+ # @param maintain_order [Boolean]
704
+ # Maintain the order in which data is processed.
705
+ # Setting this to `false` will be slightly faster.
706
+ # @param type_coercion [Boolean]
707
+ # Do type coercion optimization.
708
+ # @param predicate_pushdown [Boolean]
709
+ # Do predicate pushdown optimization.
710
+ # @param projection_pushdown [Boolean]
711
+ # Do projection pushdown optimization.
712
+ # @param simplify_expression [Boolean]
713
+ # Run simplify expressions optimization.
714
+ # @param slice_pushdown [Boolean]
715
+ # Slice pushdown optimization.
716
+ # @param no_optimization [Boolean]
717
+ # Turn off (certain) optimizations.
718
+ #
719
+ # @return [DataFrame]
720
+ #
721
+ # @example
722
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
723
+ # lf.sink_csv("out.csv")
724
+ def sink_csv(
725
+ path,
726
+ include_bom: false,
727
+ include_header: true,
728
+ separator: ",",
729
+ line_terminator: "\n",
730
+ quote_char: '"',
731
+ batch_size: 1024,
732
+ datetime_format: nil,
733
+ date_format: nil,
734
+ time_format: nil,
735
+ float_precision: nil,
736
+ null_value: nil,
737
+ quote_style: nil,
738
+ maintain_order: true,
739
+ type_coercion: true,
740
+ predicate_pushdown: true,
741
+ projection_pushdown: true,
742
+ simplify_expression: true,
743
+ slice_pushdown: true,
744
+ no_optimization: false
745
+ )
746
+ Utils._check_arg_is_1byte("separator", separator, false)
747
+ Utils._check_arg_is_1byte("quote_char", quote_char, false)
748
+
749
+ lf = _set_sink_optimizations(
750
+ type_coercion: type_coercion,
751
+ predicate_pushdown: predicate_pushdown,
752
+ projection_pushdown: projection_pushdown,
753
+ simplify_expression: simplify_expression,
754
+ slice_pushdown: slice_pushdown,
755
+ no_optimization: no_optimization
756
+ )
757
+
758
+ lf.sink_csv(
759
+ path,
760
+ include_bom,
761
+ include_header,
762
+ separator.ord,
763
+ line_terminator,
764
+ quote_char.ord,
765
+ batch_size,
766
+ datetime_format,
767
+ date_format,
768
+ time_format,
769
+ float_precision,
770
+ null_value,
771
+ quote_style,
772
+ maintain_order
773
+ )
774
+ end
775
+
776
+ # Evaluate the query in streaming mode and write to an NDJSON file.
777
+ #
778
+ # This allows streaming results that are larger than RAM to be written to disk.
779
+ #
780
+ # @param path [String]
781
+ # File path to which the file should be written.
782
+ # @param maintain_order [Boolean]
783
+ # Maintain the order in which data is processed.
784
+ # Setting this to `false` will be slightly faster.
785
+ # @param type_coercion [Boolean]
786
+ # Do type coercion optimization.
787
+ # @param predicate_pushdown [Boolean]
788
+ # Do predicate pushdown optimization.
789
+ # @param projection_pushdown [Boolean]
790
+ # Do projection pushdown optimization.
791
+ # @param simplify_expression [Boolean]
792
+ # Run simplify expressions optimization.
793
+ # @param slice_pushdown [Boolean]
794
+ # Slice pushdown optimization.
795
+ # @param no_optimization [Boolean]
796
+ # Turn off (certain) optimizations.
797
+ #
798
+ # @return [DataFrame]
799
+ #
800
+ # @example
801
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
802
+ # lf.sink_ndjson("out.ndjson")
803
+ def sink_ndjson(
804
+ path,
805
+ maintain_order: true,
806
+ type_coercion: true,
807
+ predicate_pushdown: true,
808
+ projection_pushdown: true,
809
+ simplify_expression: true,
810
+ slice_pushdown: true,
811
+ no_optimization: false
812
+ )
813
+ lf = _set_sink_optimizations(
814
+ type_coercion: type_coercion,
815
+ predicate_pushdown: predicate_pushdown,
816
+ projection_pushdown: projection_pushdown,
817
+ simplify_expression: simplify_expression,
818
+ slice_pushdown: slice_pushdown,
819
+ no_optimization: no_optimization
820
+ )
821
+
822
+ lf.sink_json(path, maintain_order)
823
+ end
824
+
825
+ # @private
826
+ def _set_sink_optimizations(
827
+ type_coercion: true,
828
+ predicate_pushdown: true,
829
+ projection_pushdown: true,
830
+ simplify_expression: true,
831
+ slice_pushdown: true,
832
+ no_optimization: false
562
833
  )
563
834
  if no_optimization
564
835
  predicate_pushdown = false
@@ -566,25 +837,17 @@ module Polars
566
837
  slice_pushdown = false
567
838
  end
568
839
 
569
- lf = _ldf.optimization_toggle(
840
+ _ldf.optimization_toggle(
570
841
  type_coercion,
571
842
  predicate_pushdown,
572
843
  projection_pushdown,
573
844
  simplify_expression,
574
845
  slice_pushdown,
575
846
  false,
847
+ false,
576
848
  true,
577
849
  false
578
850
  )
579
- lf.sink_parquet(
580
- path,
581
- compression,
582
- compression_level,
583
- statistics,
584
- row_group_size,
585
- data_pagesize_limit,
586
- maintain_order
587
- )
588
851
  end
589
852
 
590
853
  # Collect a small number of rows for debugging purposes.
@@ -650,6 +913,7 @@ module Polars
650
913
  no_optimization: false,
651
914
  slice_pushdown: true,
652
915
  common_subplan_elimination: true,
916
+ comm_subexpr_elim: true,
653
917
  allow_streaming: false
654
918
  )
655
919
  if no_optimization
@@ -666,6 +930,7 @@ module Polars
666
930
  simplify_expression,
667
931
  slice_pushdown,
668
932
  common_subplan_elimination,
933
+ comm_subexpr_elim,
669
934
  allow_streaming,
670
935
  false
671
936
  )
@@ -699,6 +964,10 @@ module Polars
699
964
  _from_rbldf(_ldf.cache)
700
965
  end
701
966
 
967
+ # TODO
968
+ # def cast
969
+ # end
970
+
702
971
  # Create an empty copy of the current LazyFrame.
703
972
  #
704
973
  # The copy has an identical schema but no data.
@@ -706,14 +975,14 @@ module Polars
706
975
  # @return [LazyFrame]
707
976
  #
708
977
  # @example
709
- # df = Polars::DataFrame.new(
978
+ # lf = Polars::LazyFrame.new(
710
979
  # {
711
980
  # "a" => [nil, 2, 3, 4],
712
981
  # "b" => [0.5, nil, 2.5, 13],
713
982
  # "c" => [true, true, false, nil],
714
983
  # }
715
984
  # ).lazy
716
- # df.cleared.fetch
985
+ # lf.clear.fetch
717
986
  # # =>
718
987
  # # shape: (0, 3)
719
988
  # # ┌─────┬─────┬──────┐
@@ -722,9 +991,23 @@ module Polars
722
991
  # # │ i64 ┆ f64 ┆ bool │
723
992
  # # ╞═════╪═════╪══════╡
724
993
  # # └─────┴─────┴──────┘
725
- def cleared
726
- DataFrame.new(columns: schema).lazy
727
- end
994
+ #
995
+ # @example
996
+ # lf.clear(2).fetch
997
+ # # =>
998
+ # # shape: (2, 3)
999
+ # # ┌──────┬──────┬──────┐
1000
+ # # │ a ┆ b ┆ c │
1001
+ # # │ --- ┆ --- ┆ --- │
1002
+ # # │ i64 ┆ f64 ┆ bool │
1003
+ # # ╞══════╪══════╪══════╡
1004
+ # # │ null ┆ null ┆ null │
1005
+ # # │ null ┆ null ┆ null │
1006
+ # # └──────┴──────┴──────┘
1007
+ def clear(n = 0)
1008
+ DataFrame.new(columns: schema).clear(n).lazy
1009
+ end
1010
+ alias_method :cleared, :clear
728
1011
 
729
1012
  # Filter the rows in the DataFrame based on a predicate expression.
730
1013
  #
@@ -774,8 +1057,13 @@ module Polars
774
1057
 
775
1058
  # Select columns from this DataFrame.
776
1059
  #
777
- # @param exprs [Object]
778
- # Column or columns to select.
1060
+ # @param exprs [Array]
1061
+ # Column(s) to select, specified as positional arguments.
1062
+ # Accepts expression input. Strings are parsed as column names,
1063
+ # other non-expression inputs are parsed as literals.
1064
+ # @param named_exprs [Hash]
1065
+ # Additional columns to select, specified as keyword arguments.
1066
+ # The columns will be renamed to the keyword used.
779
1067
  #
780
1068
  # @return [LazyFrame]
781
1069
  #
@@ -855,9 +1143,13 @@ module Polars
855
1143
  # # │ 0 │
856
1144
  # # │ 10 │
857
1145
  # # └─────────┘
858
- def select(exprs)
859
- exprs = Utils.selection_to_rbexpr_list(exprs)
860
- _from_rbldf(_ldf.select(exprs))
1146
+ def select(*exprs, **named_exprs)
1147
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1148
+
1149
+ rbexprs = Utils.parse_as_list_of_expressions(
1150
+ *exprs, **named_exprs, __structify: structify
1151
+ )
1152
+ _from_rbldf(_ldf.select(rbexprs))
861
1153
  end
862
1154
 
863
1155
  # Start a group by operation.
@@ -967,7 +1259,7 @@ module Polars
967
1259
  # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
968
1260
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
969
1261
  # )
970
- # df.group_by_rolling(index_column: "dt", period: "2d").agg(
1262
+ # df.rolling(index_column: "dt", period: "2d").agg(
971
1263
  # [
972
1264
  # Polars.sum("a").alias("sum_a"),
973
1265
  # Polars.min("a").alias("min_a"),
@@ -988,7 +1280,7 @@ module Polars
988
1280
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
989
1281
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
990
1282
  # # └─────────────────────┴───────┴───────┴───────┘
991
- def group_by_rolling(
1283
+ def rolling(
992
1284
  index_column:,
993
1285
  period:,
994
1286
  offset: nil,
@@ -1005,12 +1297,13 @@ module Polars
1005
1297
  period = Utils._timedelta_to_pl_duration(period)
1006
1298
  offset = Utils._timedelta_to_pl_duration(offset)
1007
1299
 
1008
- lgb = _ldf.group_by_rolling(
1300
+ lgb = _ldf.rolling(
1009
1301
  index_column, period, offset, closed, rbexprs_by, check_sorted
1010
1302
  )
1011
1303
  LazyGroupBy.new(lgb)
1012
1304
  end
1013
- alias_method :groupby_rolling, :group_by_rolling
1305
+ alias_method :group_by_rolling, :rolling
1306
+ alias_method :groupby_rolling, :rolling
1014
1307
 
1015
1308
  # Group based on a time value (or index value of type `:i32`, `:i64`).
1016
1309
  #
@@ -1234,12 +1527,13 @@ module Polars
1234
1527
  # closed: "right"
1235
1528
  # ).agg(Polars.col("A").alias("A_agg_list"))
1236
1529
  # # =>
1237
- # # shape: (3, 4)
1530
+ # # shape: (4, 4)
1238
1531
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1239
1532
  # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1240
1533
  # # │ --- ┆ --- ┆ --- ┆ --- │
1241
1534
  # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1242
1535
  # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1536
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
1243
1537
  # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1244
1538
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1245
1539
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -1440,6 +1734,8 @@ module Polars
1440
1734
  # Join strategy.
1441
1735
  # @param suffix [String]
1442
1736
  # Suffix to append to columns with a duplicate name.
1737
+ # @param join_nulls [Boolean]
1738
+ # Join on null values. By default null values will never produce matches.
1443
1739
  # @param allow_parallel [Boolean]
1444
1740
  # Allow the physical plan to optionally evaluate the computation of both
1445
1741
  # DataFrames up to the join in parallel.
@@ -1535,6 +1831,7 @@ module Polars
1535
1831
  on: nil,
1536
1832
  how: "inner",
1537
1833
  suffix: "_right",
1834
+ join_nulls: false,
1538
1835
  allow_parallel: true,
1539
1836
  force_parallel: false
1540
1837
  )
@@ -1545,7 +1842,7 @@ module Polars
1545
1842
  if how == "cross"
1546
1843
  return _from_rbldf(
1547
1844
  _ldf.join(
1548
- other._ldf, [], [], allow_parallel, force_parallel, how, suffix
1845
+ other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
1549
1846
  )
1550
1847
  )
1551
1848
  end
@@ -1568,6 +1865,7 @@ module Polars
1568
1865
  rbexprs_right,
1569
1866
  allow_parallel,
1570
1867
  force_parallel,
1868
+ join_nulls,
1571
1869
  how,
1572
1870
  suffix,
1573
1871
  )
@@ -1598,37 +1896,19 @@ module Polars
1598
1896
  # ).collect
1599
1897
  # # =>
1600
1898
  # # shape: (4, 6)
1601
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
1602
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1603
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1604
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
1605
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
1606
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
1607
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
1608
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
1609
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
1610
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
1611
- def with_columns(exprs)
1612
- exprs =
1613
- if exprs.nil?
1614
- []
1615
- elsif exprs.is_a?(Expr)
1616
- [exprs]
1617
- else
1618
- exprs.to_a
1619
- end
1620
-
1621
- rbexprs = []
1622
- exprs.each do |e|
1623
- case e
1624
- when Expr
1625
- rbexprs << e._rbexpr
1626
- when Series
1627
- rbexprs << Utils.lit(e)._rbexpr
1628
- else
1629
- raise ArgumentError, "Expected an expression, got #{e}"
1630
- end
1631
- end
1899
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
1900
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
1901
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1902
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
1903
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
1904
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
1905
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
1906
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
1907
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
1908
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
1909
+ def with_columns(*exprs, **named_exprs)
1910
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1911
+ rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1632
1912
 
1633
1913
  _from_rbldf(_ldf.with_columns(rbexprs))
1634
1914
  end
@@ -1690,26 +1970,26 @@ module Polars
1690
1970
  # # ┌─────┬─────┬───────────┐
1691
1971
  # # │ a ┆ b ┆ b_squared │
1692
1972
  # # │ --- ┆ --- ┆ --- │
1693
- # # │ i64 ┆ i64 ┆ f64
1973
+ # # │ i64 ┆ i64 ┆ i64
1694
1974
  # # ╞═════╪═════╪═══════════╡
1695
- # # │ 1 ┆ 2 ┆ 4.0
1696
- # # │ 3 ┆ 4 ┆ 16.0
1697
- # # │ 5 ┆ 6 ┆ 36.0
1975
+ # # │ 1 ┆ 2 ┆ 4
1976
+ # # │ 3 ┆ 4 ┆ 16
1977
+ # # │ 5 ┆ 6 ┆ 36
1698
1978
  # # └─────┴─────┴───────────┘
1699
1979
  #
1700
1980
  # @example
1701
1981
  # df.with_column(Polars.col("a") ** 2).collect
1702
1982
  # # =>
1703
1983
  # # shape: (3, 2)
1704
- # # ┌──────┬─────┐
1705
- # # │ a ┆ b │
1706
- # # │ --- ┆ --- │
1707
- # # │ f64 ┆ i64 │
1708
- # # ╞══════╪═════╡
1709
- # # │ 1.0 ┆ 2 │
1710
- # # │ 9.0 ┆ 4 │
1711
- # # │ 25.0 ┆ 6 │
1712
- # # └──────┴─────┘
1984
+ # # ┌─────┬─────┐
1985
+ # # │ a ┆ b │
1986
+ # # │ --- ┆ --- │
1987
+ # # │ i64 ┆ i64 │
1988
+ # # ╞═════╪═════╡
1989
+ # # │ 1 ┆ 2 │
1990
+ # # │ 9 ┆ 4 │
1991
+ # # │ 25 ┆ 6 │
1992
+ # # └─────┴─────┘
1713
1993
  def with_column(column)
1714
1994
  with_columns([column])
1715
1995
  end
@@ -1721,11 +2001,9 @@ module Polars
1721
2001
  # - List of column names.
1722
2002
  #
1723
2003
  # @return [LazyFrame]
1724
- def drop(columns)
1725
- if columns.is_a?(::String)
1726
- columns = [columns]
1727
- end
1728
- _from_rbldf(_ldf.drop_columns(columns))
2004
+ def drop(*columns)
2005
+ drop_cols = Utils._expand_selectors(self, *columns)
2006
+ _from_rbldf(_ldf.drop(drop_cols))
1729
2007
  end
1730
2008
 
1731
2009
  # Rename column names.
@@ -1955,7 +2233,7 @@ module Polars
1955
2233
  # "b" => [2, 4, 6]
1956
2234
  # }
1957
2235
  # ).lazy
1958
- # df.with_row_count.collect
2236
+ # df.with_row_index.collect
1959
2237
  # # =>
1960
2238
  # # shape: (3, 3)
1961
2239
  # # ┌────────┬─────┬─────┐
@@ -1967,9 +2245,10 @@ module Polars
1967
2245
  # # │ 1 ┆ 3 ┆ 4 │
1968
2246
  # # │ 2 ┆ 5 ┆ 6 │
1969
2247
  # # └────────┴─────┴─────┘
1970
- def with_row_count(name: "row_nr", offset: 0)
1971
- _from_rbldf(_ldf.with_row_count(name, offset))
2248
+ def with_row_index(name: "row_nr", offset: 0)
2249
+ _from_rbldf(_ldf.with_row_index(name, offset))
1972
2250
  end
2251
+ alias_method :with_row_count, :with_row_index
1973
2252
 
1974
2253
  # Take every nth row in the LazyFrame and return as a new LazyFrame.
1975
2254
  #
@@ -2470,9 +2749,47 @@ module Polars
2470
2749
  _from_rbldf(_ldf.unnest(names))
2471
2750
  end
2472
2751
 
2473
- # TODO
2474
- # def merge_sorted
2475
- # end
2752
+ # Take two sorted DataFrames and merge them by the sorted key.
2753
+ #
2754
+ # The output of this operation will also be sorted.
2755
+ # It is the callers responsibility that the frames are sorted
2756
+ # by that key otherwise the output will not make sense.
2757
+ #
2758
+ # The schemas of both LazyFrames must be equal.
2759
+ #
2760
+ # @param other [LazyFrame]
2761
+ # Other LazyFrame that must be merged.
2762
+ # @param key [String]
2763
+ # Key that is sorted.
2764
+ #
2765
+ # @return [LazyFrame]
2766
+ #
2767
+ # @example
2768
+ # df0 = Polars::LazyFrame.new(
2769
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
2770
+ # ).sort("age")
2771
+ # df1 = Polars::LazyFrame.new(
2772
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
2773
+ # ).sort("age")
2774
+ # df0.merge_sorted(df1, "age").collect
2775
+ # # =>
2776
+ # # shape: (7, 2)
2777
+ # # ┌────────┬─────┐
2778
+ # # │ name ┆ age │
2779
+ # # │ --- ┆ --- │
2780
+ # # │ str ┆ i64 │
2781
+ # # ╞════════╪═════╡
2782
+ # # │ bob ┆ 18 │
2783
+ # # │ thomas ┆ 20 │
2784
+ # # │ anna ┆ 21 │
2785
+ # # │ megan ┆ 33 │
2786
+ # # │ steve ┆ 42 │
2787
+ # # │ steve ┆ 42 │
2788
+ # # │ elise ┆ 44 │
2789
+ # # └────────┴─────┘
2790
+ def merge_sorted(other, key)
2791
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
2792
+ end
2476
2793
 
2477
2794
  # Indicate that one or multiple columns are sorted.
2478
2795
  #