polars-df 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +15 -7
  8. data/ext/polars/src/batched_csv.rs +4 -4
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
  12. data/ext/polars/src/dataframe.rs +69 -53
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/datetime.rs +22 -56
  15. data/ext/polars/src/expr/general.rs +61 -33
  16. data/ext/polars/src/expr/list.rs +52 -4
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +59 -8
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/functions/aggregation.rs +6 -0
  22. data/ext/polars/src/functions/lazy.rs +103 -48
  23. data/ext/polars/src/functions/meta.rs +45 -1
  24. data/ext/polars/src/functions/string_cache.rs +14 -0
  25. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
  26. data/ext/polars/src/lib.rs +226 -168
  27. data/ext/polars/src/series/aggregation.rs +20 -0
  28. data/ext/polars/src/series/mod.rs +25 -4
  29. data/lib/polars/array_expr.rb +449 -0
  30. data/lib/polars/array_name_space.rb +346 -0
  31. data/lib/polars/cat_expr.rb +24 -0
  32. data/lib/polars/cat_name_space.rb +75 -0
  33. data/lib/polars/config.rb +2 -2
  34. data/lib/polars/data_frame.rb +179 -43
  35. data/lib/polars/data_types.rb +191 -28
  36. data/lib/polars/date_time_expr.rb +31 -14
  37. data/lib/polars/exceptions.rb +12 -1
  38. data/lib/polars/expr.rb +866 -186
  39. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  40. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  41. data/lib/polars/functions/as_datatype.rb +248 -0
  42. data/lib/polars/functions/col.rb +47 -0
  43. data/lib/polars/functions/eager.rb +182 -0
  44. data/lib/polars/functions/lazy.rb +1280 -0
  45. data/lib/polars/functions/len.rb +49 -0
  46. data/lib/polars/functions/lit.rb +35 -0
  47. data/lib/polars/functions/random.rb +16 -0
  48. data/lib/polars/functions/range/date_range.rb +103 -0
  49. data/lib/polars/functions/range/int_range.rb +51 -0
  50. data/lib/polars/functions/repeat.rb +144 -0
  51. data/lib/polars/functions/whenthen.rb +27 -0
  52. data/lib/polars/functions.rb +29 -416
  53. data/lib/polars/group_by.rb +2 -2
  54. data/lib/polars/io.rb +18 -25
  55. data/lib/polars/lazy_frame.rb +367 -53
  56. data/lib/polars/list_expr.rb +152 -6
  57. data/lib/polars/list_name_space.rb +102 -0
  58. data/lib/polars/meta_expr.rb +175 -7
  59. data/lib/polars/series.rb +273 -34
  60. data/lib/polars/string_cache.rb +75 -0
  61. data/lib/polars/string_expr.rb +412 -96
  62. data/lib/polars/string_name_space.rb +4 -4
  63. data/lib/polars/testing.rb +507 -0
  64. data/lib/polars/utils.rb +52 -8
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars.rb +15 -2
  67. metadata +35 -5
  68. data/lib/polars/lazy_functions.rb +0 -1181
@@ -308,7 +308,7 @@ module Polars
308
308
  # end
309
309
  #
310
310
  # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
311
- # df.pipe(cast_str_to_int, col_name: "b").collect()
311
+ # df.pipe(cast_str_to_int, col_name: "b").collect
312
312
  # # =>
313
313
  # # shape: (4, 2)
314
314
  # # ┌─────┬─────┐
@@ -342,6 +342,7 @@ module Polars
342
342
  simplify_expression: true,
343
343
  slice_pushdown: true,
344
344
  common_subplan_elimination: true,
345
+ comm_subexpr_elim: true,
345
346
  allow_streaming: false
346
347
  )
347
348
  ldf = _ldf.optimization_toggle(
@@ -351,6 +352,7 @@ module Polars
351
352
  simplify_expression,
352
353
  slice_pushdown,
353
354
  common_subplan_elimination,
355
+ comm_subexpr_elim,
354
356
  allow_streaming,
355
357
  false
356
358
  )
@@ -469,6 +471,7 @@ module Polars
469
471
  no_optimization: false,
470
472
  slice_pushdown: true,
471
473
  common_subplan_elimination: true,
474
+ comm_subexpr_elim: true,
472
475
  allow_streaming: false,
473
476
  _eager: false
474
477
  )
@@ -477,6 +480,7 @@ module Polars
477
480
  projection_pushdown = false
478
481
  slice_pushdown = false
479
482
  common_subplan_elimination = false
483
+ comm_subexpr_elim = false
480
484
  end
481
485
 
482
486
  if allow_streaming
@@ -490,6 +494,7 @@ module Polars
490
494
  simplify_expression,
491
495
  slice_pushdown,
492
496
  common_subplan_elimination,
497
+ comm_subexpr_elim,
493
498
  allow_streaming,
494
499
  _eager
495
500
  )
@@ -559,6 +564,268 @@ module Polars
559
564
  simplify_expression: true,
560
565
  no_optimization: false,
561
566
  slice_pushdown: true
567
+ )
568
+ lf = _set_sink_optimizations(
569
+ type_coercion: type_coercion,
570
+ predicate_pushdown: predicate_pushdown,
571
+ projection_pushdown: projection_pushdown,
572
+ simplify_expression: simplify_expression,
573
+ slice_pushdown: slice_pushdown,
574
+ no_optimization: no_optimization
575
+ )
576
+
577
+ lf.sink_parquet(
578
+ path,
579
+ compression,
580
+ compression_level,
581
+ statistics,
582
+ row_group_size,
583
+ data_pagesize_limit,
584
+ maintain_order
585
+ )
586
+ end
587
+
588
+ # Evaluate the query in streaming mode and write to an IPC file.
589
+ #
590
+ # This allows streaming results that are larger than RAM to be written to disk.
591
+ #
592
+ # @param path [String]
593
+ # File path to which the file should be written.
594
+ # @param compression ["lz4", "zstd"]
595
+ # Choose "zstd" for good compression performance.
596
+ # Choose "lz4" for fast compression/decompression.
597
+ # @param maintain_order [Boolean]
598
+ # Maintain the order in which data is processed.
599
+ # Setting this to `false` will be slightly faster.
600
+ # @param type_coercion [Boolean]
601
+ # Do type coercion optimization.
602
+ # @param predicate_pushdown [Boolean]
603
+ # Do predicate pushdown optimization.
604
+ # @param projection_pushdown [Boolean]
605
+ # Do projection pushdown optimization.
606
+ # @param simplify_expression [Boolean]
607
+ # Run simplify expressions optimization.
608
+ # @param slice_pushdown [Boolean]
609
+ # Slice pushdown optimization.
610
+ # @param no_optimization [Boolean]
611
+ # Turn off (certain) optimizations.
612
+ #
613
+ # @return [DataFrame]
614
+ #
615
+ # @example
616
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
617
+ # lf.sink_ipc("out.arrow")
618
+ def sink_ipc(
619
+ path,
620
+ compression: "zstd",
621
+ maintain_order: true,
622
+ type_coercion: true,
623
+ predicate_pushdown: true,
624
+ projection_pushdown: true,
625
+ simplify_expression: true,
626
+ slice_pushdown: true,
627
+ no_optimization: false
628
+ )
629
+ lf = _set_sink_optimizations(
630
+ type_coercion: type_coercion,
631
+ predicate_pushdown: predicate_pushdown,
632
+ projection_pushdown: projection_pushdown,
633
+ simplify_expression: simplify_expression,
634
+ slice_pushdown: slice_pushdown,
635
+ no_optimization: no_optimization
636
+ )
637
+
638
+ lf.sink_ipc(
639
+ path,
640
+ compression,
641
+ maintain_order
642
+ )
643
+ end
644
+
645
+ # Evaluate the query in streaming mode and write to a CSV file.
646
+ #
647
+ # This allows streaming results that are larger than RAM to be written to disk.
648
+ #
649
+ # @param path [String]
650
+ # File path to which the file should be written.
651
+ # @param include_bom [Boolean]
652
+ # Whether to include UTF-8 BOM in the CSV output.
653
+ # @param include_header [Boolean]
654
+ # Whether to include header in the CSV output.
655
+ # @param separator [String]
656
+ # Separate CSV fields with this symbol.
657
+ # @param line_terminator [String]
658
+ # String used to end each row.
659
+ # @param quote_char [String]
660
+ # Byte to use as quoting character.
661
+ # @param batch_size [Integer]
662
+ # Number of rows that will be processed per thread.
663
+ # @param datetime_format [String]
664
+ # A format string, with the specifiers defined by the
665
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
666
+ # Rust crate. If no format specified, the default fractional-second
667
+ # precision is inferred from the maximum timeunit found in the frame's
668
+ # Datetime cols (if any).
669
+ # @param date_format [String]
670
+ # A format string, with the specifiers defined by the
671
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
672
+ # Rust crate.
673
+ # @param time_format [String]
674
+ # A format string, with the specifiers defined by the
675
+ # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
676
+ # Rust crate.
677
+ # @param float_precision [Integer]
678
+ # Number of decimal places to write, applied to both `Float32` and
679
+ # `Float64` datatypes.
680
+ # @param null_value [String]
681
+ # A string representing null values (defaulting to the empty string).
682
+ # @param quote_style ["necessary", "always", "non_numeric", "never"]
683
+ # Determines the quoting strategy used.
684
+ #
685
+ # - necessary (default): This puts quotes around fields only when necessary.
686
+ # They are necessary when fields contain a quote,
687
+ # delimiter or record terminator.
688
+ # Quotes are also necessary when writing an empty record
689
+ # (which is indistinguishable from a record with one empty field).
690
+ # This is the default.
691
+ # - always: This puts quotes around every field. Always.
692
+ # - never: This never puts quotes around fields, even if that results in
693
+ # invalid CSV data (e.g.: by not quoting strings containing the
694
+ # separator).
695
+ # - non_numeric: This puts quotes around all fields that are non-numeric.
696
+ # Namely, when writing a field that does not parse as a valid float
697
+ # or integer, then quotes will be used even if they aren't strictly
698
+ # necessary.
699
+ # @param maintain_order [Boolean]
700
+ # Maintain the order in which data is processed.
701
+ # Setting this to `false` will be slightly faster.
702
+ # @param type_coercion [Boolean]
703
+ # Do type coercion optimization.
704
+ # @param predicate_pushdown [Boolean]
705
+ # Do predicate pushdown optimization.
706
+ # @param projection_pushdown [Boolean]
707
+ # Do projection pushdown optimization.
708
+ # @param simplify_expression [Boolean]
709
+ # Run simplify expressions optimization.
710
+ # @param slice_pushdown [Boolean]
711
+ # Slice pushdown optimization.
712
+ # @param no_optimization [Boolean]
713
+ # Turn off (certain) optimizations.
714
+ #
715
+ # @return [DataFrame]
716
+ #
717
+ # @example
718
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
719
+ # lf.sink_csv("out.csv")
720
+ def sink_csv(
721
+ path,
722
+ include_bom: false,
723
+ include_header: true,
724
+ separator: ",",
725
+ line_terminator: "\n",
726
+ quote_char: '"',
727
+ batch_size: 1024,
728
+ datetime_format: nil,
729
+ date_format: nil,
730
+ time_format: nil,
731
+ float_precision: nil,
732
+ null_value: nil,
733
+ quote_style: nil,
734
+ maintain_order: true,
735
+ type_coercion: true,
736
+ predicate_pushdown: true,
737
+ projection_pushdown: true,
738
+ simplify_expression: true,
739
+ slice_pushdown: true,
740
+ no_optimization: false
741
+ )
742
+ Utils._check_arg_is_1byte("separator", separator, false)
743
+ Utils._check_arg_is_1byte("quote_char", quote_char, false)
744
+
745
+ lf = _set_sink_optimizations(
746
+ type_coercion: type_coercion,
747
+ predicate_pushdown: predicate_pushdown,
748
+ projection_pushdown: projection_pushdown,
749
+ simplify_expression: simplify_expression,
750
+ slice_pushdown: slice_pushdown,
751
+ no_optimization: no_optimization
752
+ )
753
+
754
+ lf.sink_csv(
755
+ path,
756
+ include_bom,
757
+ include_header,
758
+ separator.ord,
759
+ line_terminator,
760
+ quote_char.ord,
761
+ batch_size,
762
+ datetime_format,
763
+ date_format,
764
+ time_format,
765
+ float_precision,
766
+ null_value,
767
+ quote_style,
768
+ maintain_order
769
+ )
770
+ end
771
+
772
+ # Evaluate the query in streaming mode and write to an NDJSON file.
773
+ #
774
+ # This allows streaming results that are larger than RAM to be written to disk.
775
+ #
776
+ # @param path [String]
777
+ # File path to which the file should be written.
778
+ # @param maintain_order [Boolean]
779
+ # Maintain the order in which data is processed.
780
+ # Setting this to `false` will be slightly faster.
781
+ # @param type_coercion [Boolean]
782
+ # Do type coercion optimization.
783
+ # @param predicate_pushdown [Boolean]
784
+ # Do predicate pushdown optimization.
785
+ # @param projection_pushdown [Boolean]
786
+ # Do projection pushdown optimization.
787
+ # @param simplify_expression [Boolean]
788
+ # Run simplify expressions optimization.
789
+ # @param slice_pushdown [Boolean]
790
+ # Slice pushdown optimization.
791
+ # @param no_optimization [Boolean]
792
+ # Turn off (certain) optimizations.
793
+ #
794
+ # @return [DataFrame]
795
+ #
796
+ # @example
797
+ # lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
798
+ # lf.sink_ndjson("out.ndjson")
799
+ def sink_ndjson(
800
+ path,
801
+ maintain_order: true,
802
+ type_coercion: true,
803
+ predicate_pushdown: true,
804
+ projection_pushdown: true,
805
+ simplify_expression: true,
806
+ slice_pushdown: true,
807
+ no_optimization: false
808
+ )
809
+ lf = _set_sink_optimizations(
810
+ type_coercion: type_coercion,
811
+ predicate_pushdown: predicate_pushdown,
812
+ projection_pushdown: projection_pushdown,
813
+ simplify_expression: simplify_expression,
814
+ slice_pushdown: slice_pushdown,
815
+ no_optimization: no_optimization
816
+ )
817
+
818
+ lf.sink_json(path, maintain_order)
819
+ end
820
+
821
+ # @private
822
+ def _set_sink_optimizations(
823
+ type_coercion: true,
824
+ predicate_pushdown: true,
825
+ projection_pushdown: true,
826
+ simplify_expression: true,
827
+ slice_pushdown: true,
828
+ no_optimization: false
562
829
  )
563
830
  if no_optimization
564
831
  predicate_pushdown = false
@@ -566,25 +833,17 @@ module Polars
566
833
  slice_pushdown = false
567
834
  end
568
835
 
569
- lf = _ldf.optimization_toggle(
836
+ _ldf.optimization_toggle(
570
837
  type_coercion,
571
838
  predicate_pushdown,
572
839
  projection_pushdown,
573
840
  simplify_expression,
574
841
  slice_pushdown,
575
842
  false,
843
+ false,
576
844
  true,
577
845
  false
578
846
  )
579
- lf.sink_parquet(
580
- path,
581
- compression,
582
- compression_level,
583
- statistics,
584
- row_group_size,
585
- data_pagesize_limit,
586
- maintain_order
587
- )
588
847
  end
589
848
 
590
849
  # Collect a small number of rows for debugging purposes.
@@ -650,6 +909,7 @@ module Polars
650
909
  no_optimization: false,
651
910
  slice_pushdown: true,
652
911
  common_subplan_elimination: true,
912
+ comm_subexpr_elim: true,
653
913
  allow_streaming: false
654
914
  )
655
915
  if no_optimization
@@ -666,6 +926,7 @@ module Polars
666
926
  simplify_expression,
667
927
  slice_pushdown,
668
928
  common_subplan_elimination,
929
+ comm_subexpr_elim,
669
930
  allow_streaming,
670
931
  false
671
932
  )
@@ -699,6 +960,10 @@ module Polars
699
960
  _from_rbldf(_ldf.cache)
700
961
  end
701
962
 
963
+ # TODO
964
+ # def cast
965
+ # end
966
+
702
967
  # Create an empty copy of the current LazyFrame.
703
968
  #
704
969
  # The copy has an identical schema but no data.
@@ -706,14 +971,14 @@ module Polars
706
971
  # @return [LazyFrame]
707
972
  #
708
973
  # @example
709
- # df = Polars::DataFrame.new(
974
+ # lf = Polars::LazyFrame.new(
710
975
  # {
711
976
  # "a" => [nil, 2, 3, 4],
712
977
  # "b" => [0.5, nil, 2.5, 13],
713
978
  # "c" => [true, true, false, nil],
714
979
  # }
715
980
  # ).lazy
716
- # df.cleared.fetch
981
+ # lf.clear.fetch
717
982
  # # =>
718
983
  # # shape: (0, 3)
719
984
  # # ┌─────┬─────┬──────┐
@@ -722,9 +987,23 @@ module Polars
722
987
  # # │ i64 ┆ f64 ┆ bool │
723
988
  # # ╞═════╪═════╪══════╡
724
989
  # # └─────┴─────┴──────┘
725
- def cleared
726
- DataFrame.new(columns: schema).lazy
727
- end
990
+ #
991
+ # @example
992
+ # lf.clear(2).fetch
993
+ # # =>
994
+ # # shape: (2, 3)
995
+ # # ┌──────┬──────┬──────┐
996
+ # # │ a ┆ b ┆ c │
997
+ # # │ --- ┆ --- ┆ --- │
998
+ # # │ i64 ┆ f64 ┆ bool │
999
+ # # ╞══════╪══════╪══════╡
1000
+ # # │ null ┆ null ┆ null │
1001
+ # # │ null ┆ null ┆ null │
1002
+ # # └──────┴──────┴──────┘
1003
+ def clear(n = 0)
1004
+ DataFrame.new(columns: schema).clear(n).lazy
1005
+ end
1006
+ alias_method :cleared, :clear
728
1007
 
729
1008
  # Filter the rows in the DataFrame based on a predicate expression.
730
1009
  #
@@ -774,8 +1053,13 @@ module Polars
774
1053
 
775
1054
  # Select columns from this DataFrame.
776
1055
  #
777
- # @param exprs [Object]
778
- # Column or columns to select.
1056
+ # @param exprs [Array]
1057
+ # Column(s) to select, specified as positional arguments.
1058
+ # Accepts expression input. Strings are parsed as column names,
1059
+ # other non-expression inputs are parsed as literals.
1060
+ # @param named_exprs [Hash]
1061
+ # Additional columns to select, specified as keyword arguments.
1062
+ # The columns will be renamed to the keyword used.
779
1063
  #
780
1064
  # @return [LazyFrame]
781
1065
  #
@@ -855,9 +1139,13 @@ module Polars
855
1139
  # # │ 0 │
856
1140
  # # │ 10 │
857
1141
  # # └─────────┘
858
- def select(exprs)
859
- exprs = Utils.selection_to_rbexpr_list(exprs)
860
- _from_rbldf(_ldf.select(exprs))
1142
+ def select(*exprs, **named_exprs)
1143
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1144
+
1145
+ rbexprs = Utils.parse_as_list_of_expressions(
1146
+ *exprs, **named_exprs, __structify: structify
1147
+ )
1148
+ _from_rbldf(_ldf.select(rbexprs))
861
1149
  end
862
1150
 
863
1151
  # Start a group by operation.
@@ -967,7 +1255,7 @@ module Polars
967
1255
  # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
968
1256
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
969
1257
  # )
970
- # df.group_by_rolling(index_column: "dt", period: "2d").agg(
1258
+ # df.rolling(index_column: "dt", period: "2d").agg(
971
1259
  # [
972
1260
  # Polars.sum("a").alias("sum_a"),
973
1261
  # Polars.min("a").alias("min_a"),
@@ -988,7 +1276,7 @@ module Polars
988
1276
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
989
1277
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
990
1278
  # # └─────────────────────┴───────┴───────┴───────┘
991
- def group_by_rolling(
1279
+ def rolling(
992
1280
  index_column:,
993
1281
  period:,
994
1282
  offset: nil,
@@ -1005,12 +1293,13 @@ module Polars
1005
1293
  period = Utils._timedelta_to_pl_duration(period)
1006
1294
  offset = Utils._timedelta_to_pl_duration(offset)
1007
1295
 
1008
- lgb = _ldf.group_by_rolling(
1296
+ lgb = _ldf.rolling(
1009
1297
  index_column, period, offset, closed, rbexprs_by, check_sorted
1010
1298
  )
1011
1299
  LazyGroupBy.new(lgb)
1012
1300
  end
1013
- alias_method :groupby_rolling, :group_by_rolling
1301
+ alias_method :group_by_rolling, :rolling
1302
+ alias_method :groupby_rolling, :rolling
1014
1303
 
1015
1304
  # Group based on a time value (or index value of type `:i32`, `:i64`).
1016
1305
  #
@@ -1440,6 +1729,8 @@ module Polars
1440
1729
  # Join strategy.
1441
1730
  # @param suffix [String]
1442
1731
  # Suffix to append to columns with a duplicate name.
1732
+ # @param join_nulls [Boolean]
1733
+ # Join on null values. By default null values will never produce matches.
1443
1734
  # @param allow_parallel [Boolean]
1444
1735
  # Allow the physical plan to optionally evaluate the computation of both
1445
1736
  # DataFrames up to the join in parallel.
@@ -1535,6 +1826,7 @@ module Polars
1535
1826
  on: nil,
1536
1827
  how: "inner",
1537
1828
  suffix: "_right",
1829
+ join_nulls: false,
1538
1830
  allow_parallel: true,
1539
1831
  force_parallel: false
1540
1832
  )
@@ -1568,6 +1860,7 @@ module Polars
1568
1860
  rbexprs_right,
1569
1861
  allow_parallel,
1570
1862
  force_parallel,
1863
+ join_nulls,
1571
1864
  how,
1572
1865
  suffix,
1573
1866
  )
@@ -1608,27 +1901,9 @@ module Polars
1608
1901
  # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
1609
1902
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
1610
1903
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
1611
- def with_columns(exprs)
1612
- exprs =
1613
- if exprs.nil?
1614
- []
1615
- elsif exprs.is_a?(Expr)
1616
- [exprs]
1617
- else
1618
- exprs.to_a
1619
- end
1620
-
1621
- rbexprs = []
1622
- exprs.each do |e|
1623
- case e
1624
- when Expr
1625
- rbexprs << e._rbexpr
1626
- when Series
1627
- rbexprs << Utils.lit(e)._rbexpr
1628
- else
1629
- raise ArgumentError, "Expected an expression, got #{e}"
1630
- end
1631
- end
1904
+ def with_columns(*exprs, **named_exprs)
1905
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1906
+ rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1632
1907
 
1633
1908
  _from_rbldf(_ldf.with_columns(rbexprs))
1634
1909
  end
@@ -1725,7 +2000,7 @@ module Polars
1725
2000
  if columns.is_a?(::String)
1726
2001
  columns = [columns]
1727
2002
  end
1728
- _from_rbldf(_ldf.drop_columns(columns))
2003
+ _from_rbldf(_ldf.drop(columns))
1729
2004
  end
1730
2005
 
1731
2006
  # Rename column names.
@@ -1955,7 +2230,7 @@ module Polars
1955
2230
  # "b" => [2, 4, 6]
1956
2231
  # }
1957
2232
  # ).lazy
1958
- # df.with_row_count.collect
2233
+ # df.with_row_index.collect
1959
2234
  # # =>
1960
2235
  # # shape: (3, 3)
1961
2236
  # # ┌────────┬─────┬─────┐
@@ -1967,9 +2242,10 @@ module Polars
1967
2242
  # # │ 1 ┆ 3 ┆ 4 │
1968
2243
  # # │ 2 ┆ 5 ┆ 6 │
1969
2244
  # # └────────┴─────┴─────┘
1970
- def with_row_count(name: "row_nr", offset: 0)
1971
- _from_rbldf(_ldf.with_row_count(name, offset))
2245
+ def with_row_index(name: "row_nr", offset: 0)
2246
+ _from_rbldf(_ldf.with_row_index(name, offset))
1972
2247
  end
2248
+ alias_method :with_row_count, :with_row_index
1973
2249
 
1974
2250
  # Take every nth row in the LazyFrame and return as a new LazyFrame.
1975
2251
  #
@@ -2470,9 +2746,47 @@ module Polars
2470
2746
  _from_rbldf(_ldf.unnest(names))
2471
2747
  end
2472
2748
 
2473
- # TODO
2474
- # def merge_sorted
2475
- # end
2749
+ # Take two sorted LazyFrames and merge them by the sorted key.
2750
+ #
2751
+ # The output of this operation will also be sorted.
2752
+ # It is the caller's responsibility that the frames are sorted
2753
+ # by that key otherwise the output will not make sense.
2754
+ #
2755
+ # The schemas of both LazyFrames must be equal.
2756
+ #
2757
+ # @param other [LazyFrame]
2758
+ # Other LazyFrame that must be merged
2759
+ # @param key [String]
2760
+ # Key that is sorted.
2761
+ #
2762
+ # @return [LazyFrame]
2763
+ #
2764
+ # @example
2765
+ # df0 = Polars::LazyFrame.new(
2766
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
2767
+ # ).sort("age")
2768
+ # df1 = Polars::LazyFrame.new(
2769
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
2770
+ # ).sort("age")
2771
+ # df0.merge_sorted(df1, "age").collect
2772
+ # # =>
2773
+ # # shape: (7, 2)
2774
+ # # ┌────────┬─────┐
2775
+ # # │ name ┆ age │
2776
+ # # │ --- ┆ --- │
2777
+ # # │ str ┆ i64 │
2778
+ # # ╞════════╪═════╡
2779
+ # # │ bob ┆ 18 │
2780
+ # # │ thomas ┆ 20 │
2781
+ # # │ anna ┆ 21 │
2782
+ # # │ megan ┆ 33 │
2783
+ # # │ steve ┆ 42 │
2784
+ # # │ steve ┆ 42 │
2785
+ # # │ elise ┆ 44 │
2786
+ # # └────────┴─────┘
2787
+ def merge_sorted(other, key)
2788
+ _from_rbldf(_ldf.merge_sorted(other._ldf, key))
2789
+ end
2476
2790
 
2477
2791
  # Indicate that one or multiple columns are sorted.
2478
2792
  #