polars-df 0.5.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +26 -0
  3. data/Cargo.lock +595 -709
  4. data/Cargo.toml +1 -0
  5. data/README.md +11 -9
  6. data/ext/polars/Cargo.toml +18 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +272 -136
  9. data/ext/polars/src/dataframe.rs +135 -94
  10. data/ext/polars/src/error.rs +8 -5
  11. data/ext/polars/src/expr/array.rs +15 -0
  12. data/ext/polars/src/expr/binary.rs +18 -6
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +78 -264
  15. data/ext/polars/src/expr/list.rs +41 -28
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +5 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +196 -0
  19. data/ext/polars/src/expr/string.rs +94 -66
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +119 -54
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +46 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +61 -44
  33. data/ext/polars/src/lib.rs +173 -84
  34. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  35. data/ext/polars/src/{apply → map}/mod.rs +10 -6
  36. data/ext/polars/src/{apply → map}/series.rs +12 -16
  37. data/ext/polars/src/object.rs +2 -2
  38. data/ext/polars/src/rb_modules.rs +25 -6
  39. data/ext/polars/src/series/construction.rs +32 -6
  40. data/ext/polars/src/series/export.rs +2 -2
  41. data/ext/polars/src/series/set_at_idx.rs +33 -17
  42. data/ext/polars/src/series.rs +62 -42
  43. data/ext/polars/src/sql.rs +46 -0
  44. data/lib/polars/array_expr.rb +84 -0
  45. data/lib/polars/array_name_space.rb +77 -0
  46. data/lib/polars/batched_csv_reader.rb +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +206 -131
  49. data/lib/polars/data_types.rb +163 -29
  50. data/lib/polars/date_time_expr.rb +13 -18
  51. data/lib/polars/date_time_name_space.rb +22 -28
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +241 -151
  54. data/lib/polars/functions.rb +29 -38
  55. data/lib/polars/group_by.rb +38 -76
  56. data/lib/polars/io.rb +37 -2
  57. data/lib/polars/lazy_frame.rb +174 -95
  58. data/lib/polars/lazy_functions.rb +87 -63
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +40 -36
  61. data/lib/polars/list_name_space.rb +15 -15
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +6 -4
  64. data/lib/polars/series.rb +95 -28
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +249 -69
  67. data/lib/polars/string_name_space.rb +155 -25
  68. data/lib/polars/utils.rb +119 -57
  69. data/lib/polars/version.rb +1 -1
  70. data/lib/polars.rb +6 -0
  71. metadata +21 -7
  72. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -4,6 +4,22 @@ module Polars
4
4
  # @private
5
5
  attr_accessor :_ldf
6
6
 
7
+ # Create a new LazyFrame.
8
+ def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
9
+ self._ldf = (
10
+ DataFrame.new(
11
+ data,
12
+ schema: schema,
13
+ schema_overrides: schema_overrides,
14
+ orient: orient,
15
+ infer_schema_length: infer_schema_length,
16
+ nan_to_null: nan_to_null
17
+ )
18
+ .lazy
19
+ ._ldf
20
+ )
21
+ end
22
+
7
23
  # @private
8
24
  def self._from_rbldf(rb_ldf)
9
25
  ldf = LazyFrame.allocate
@@ -81,7 +97,8 @@ module Polars
81
97
  row_count_offset: 0,
82
98
  storage_options: nil,
83
99
  low_memory: false,
84
- use_statistics: true
100
+ use_statistics: true,
101
+ hive_partitioning: true
85
102
  )
86
103
  _from_rbldf(
87
104
  RbLazyFrame.new_from_parquet(
@@ -92,7 +109,8 @@ module Polars
92
109
  rechunk,
93
110
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
94
111
  low_memory,
95
- use_statistics
112
+ use_statistics,
113
+ hive_partitioning
96
114
  )
97
115
  )
98
116
  end
@@ -334,6 +352,7 @@ module Polars
334
352
  slice_pushdown,
335
353
  common_subplan_elimination,
336
354
  allow_streaming,
355
+ false
337
356
  )
338
357
 
339
358
  ldf.describe_optimized_plan
@@ -379,16 +398,16 @@ module Polars
379
398
  # # │ 2 ┆ 7.0 ┆ b │
380
399
  # # │ 1 ┆ 6.0 ┆ a │
381
400
  # # └─────┴─────┴─────┘
382
- def sort(by, reverse: false, nulls_last: false)
401
+ def sort(by, reverse: false, nulls_last: false, maintain_order: false)
383
402
  if by.is_a?(String)
384
- _from_rbldf(_ldf.sort(by, reverse, nulls_last))
403
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
385
404
  end
386
405
  if Utils.bool?(reverse)
387
406
  reverse = [reverse]
388
407
  end
389
408
 
390
409
  by = Utils.selection_to_rbexpr_list(by)
391
- _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
410
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
392
411
  end
393
412
 
394
413
  # def profile
@@ -429,7 +448,7 @@ module Polars
429
448
  # "c" => [6, 5, 4, 3, 2, 1]
430
449
  # }
431
450
  # ).lazy
432
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
451
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
433
452
  # # =>
434
453
  # # shape: (3, 3)
435
454
  # # ┌─────┬─────┬─────┐
@@ -450,7 +469,8 @@ module Polars
450
469
  no_optimization: false,
451
470
  slice_pushdown: true,
452
471
  common_subplan_elimination: true,
453
- allow_streaming: false
472
+ allow_streaming: false,
473
+ _eager: false
454
474
  )
455
475
  if no_optimization
456
476
  predicate_pushdown = false
@@ -470,7 +490,8 @@ module Polars
470
490
  simplify_expression,
471
491
  slice_pushdown,
472
492
  common_subplan_elimination,
473
- allow_streaming
493
+ allow_streaming,
494
+ _eager
474
495
  )
475
496
  Utils.wrap_df(ldf.collect)
476
497
  end
@@ -552,7 +573,8 @@ module Polars
552
573
  simplify_expression,
553
574
  slice_pushdown,
554
575
  false,
555
- true
576
+ true,
577
+ false
556
578
  )
557
579
  lf.sink_parquet(
558
580
  path,
@@ -607,7 +629,7 @@ module Polars
607
629
  # "c" => [6, 5, 4, 3, 2, 1]
608
630
  # }
609
631
  # ).lazy
610
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
632
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
611
633
  # # =>
612
634
  # # shape: (2, 3)
613
635
  # # ┌─────┬─────┬─────┐
@@ -644,7 +666,8 @@ module Polars
644
666
  simplify_expression,
645
667
  slice_pushdown,
646
668
  common_subplan_elimination,
647
- allow_streaming
669
+ allow_streaming,
670
+ false
648
671
  )
649
672
  Utils.wrap_df(ldf.fetch(n_rows))
650
673
  end
@@ -837,13 +860,13 @@ module Polars
837
860
  _from_rbldf(_ldf.select(exprs))
838
861
  end
839
862
 
840
- # Start a groupby operation.
863
+ # Start a group by operation.
841
864
  #
842
865
  # @param by [Object]
843
866
  # Column(s) to group by.
844
867
  # @param maintain_order [Boolean]
845
868
  # Make sure that the order of the groups remains consistent. This is more
846
- # expensive than a default groupby.
869
+ # expensive than a default group by.
847
870
  #
848
871
  # @return [LazyGroupBy]
849
872
  #
@@ -855,7 +878,7 @@ module Polars
855
878
  # "c" => [6, 5, 4, 3, 2, 1]
856
879
  # }
857
880
  # ).lazy
858
- # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
881
+ # df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
859
882
  # # =>
860
883
  # # shape: (3, 2)
861
884
  # # ┌─────┬─────┐
@@ -867,19 +890,21 @@ module Polars
867
890
  # # │ b ┆ 11 │
868
891
  # # │ c ┆ 6 │
869
892
  # # └─────┴─────┘
870
- def groupby(by, maintain_order: false)
893
+ def group_by(by, maintain_order: false)
871
894
  rbexprs_by = Utils.selection_to_rbexpr_list(by)
872
- lgb = _ldf.groupby(rbexprs_by, maintain_order)
873
- LazyGroupBy.new(lgb, self.class)
895
+ lgb = _ldf.group_by(rbexprs_by, maintain_order)
896
+ LazyGroupBy.new(lgb)
874
897
  end
898
+ alias_method :groupby, :group_by
899
+ alias_method :group, :group_by
875
900
 
876
901
  # Create rolling groups based on a time column.
877
902
  #
878
903
  # Also works for index values of type `:i32` or `:i64`.
879
904
  #
880
- # Different from a `dynamic_groupby` the windows are now determined by the
905
+ # Different from a `group_by_dynamic` the windows are now determined by the
881
906
  # individual values and are not of constant intervals. For constant intervals
882
- # use *groupby_dynamic*.
907
+ # use *group_by_dynamic*.
883
908
  #
884
909
  # The `period` and `offset` arguments are created either from a timedelta, or
885
910
  # by using the following string language:
@@ -899,7 +924,7 @@ module Polars
899
924
  # Or combine them:
900
925
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
901
926
  #
902
- # In case of a groupby_rolling on an integer column, the windows are defined by:
927
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
903
928
  #
904
929
  # - "1i" # length 1
905
930
  # - "10i" # length 10
@@ -910,7 +935,7 @@ module Polars
910
935
  # This column must be sorted in ascending order. If not the output will not
911
936
  # make sense.
912
937
  #
913
- # In case of a rolling groupby on indices, dtype needs to be one of
938
+ # In case of a rolling group by on indices, dtype needs to be one of
914
939
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
915
940
  # performance matters use an `:i64` column.
916
941
  # @param period [Object]
@@ -921,6 +946,12 @@ module Polars
921
946
  # Define whether the temporal window interval is closed or not.
922
947
  # @param by [Object]
923
948
  # Also group by this column/these columns.
949
+ # @param check_sorted [Boolean]
950
+ # When the `by` argument is given, polars can not check sortedness
951
+ # by the metadata and has to do a full scan on the index column to
952
+ # verify data is sorted. This is expensive. If you are sure the
953
+ # data within the by groups is sorted, you can set this to `false`.
954
+ # Doing so incorrectly will lead to incorrect output.
924
955
  #
925
956
  # @return [LazyGroupBy]
926
957
  #
@@ -933,16 +964,16 @@ module Polars
933
964
  # "2020-01-03 19:45:32",
934
965
  # "2020-01-08 23:16:43"
935
966
  # ]
936
- # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
937
- # Polars.col("dt").str.strptime(Polars::Datetime)
967
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
968
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
938
969
  # )
939
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
970
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
940
971
  # [
941
972
  # Polars.sum("a").alias("sum_a"),
942
973
  # Polars.min("a").alias("min_a"),
943
974
  # Polars.max("a").alias("max_a")
944
975
  # ]
945
- # )
976
+ # ).collect
946
977
  # # =>
947
978
  # # shape: (6, 4)
948
979
  # # ┌─────────────────────┬───────┬───────┬───────┐
@@ -957,14 +988,15 @@ module Polars
957
988
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
958
989
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
959
990
  # # └─────────────────────┴───────┴───────┴───────┘
960
- def groupby_rolling(
991
+ def group_by_rolling(
961
992
  index_column:,
962
993
  period:,
963
994
  offset: nil,
964
995
  closed: "right",
965
- by: nil
996
+ by: nil,
997
+ check_sorted: true
966
998
  )
967
- index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
999
+ index_column = Utils.parse_as_expression(index_column)
968
1000
  if offset.nil?
969
1001
  offset = "-#{period}"
970
1002
  end
@@ -973,16 +1005,17 @@ module Polars
973
1005
  period = Utils._timedelta_to_pl_duration(period)
974
1006
  offset = Utils._timedelta_to_pl_duration(offset)
975
1007
 
976
- lgb = _ldf.groupby_rolling(
977
- index_column._rbexpr, period, offset, closed, rbexprs_by
1008
+ lgb = _ldf.group_by_rolling(
1009
+ index_column, period, offset, closed, rbexprs_by, check_sorted
978
1010
  )
979
- LazyGroupBy.new(lgb, self.class)
1011
+ LazyGroupBy.new(lgb)
980
1012
  end
1013
+ alias_method :groupby_rolling, :group_by_rolling
981
1014
 
982
1015
  # Group based on a time value (or index value of type `:i32`, `:i64`).
983
1016
  #
984
1017
  # Time windows are calculated and rows are assigned to windows. Different from a
985
- # normal groupby is that a row can be member of multiple groups. The time/index
1018
+ # normal group by is that a row can be member of multiple groups. The time/index
986
1019
  # window could be seen as a rolling window, with a window size determined by
987
1020
  # dates/times/values instead of slots in the DataFrame.
988
1021
  #
@@ -1010,37 +1043,43 @@ module Polars
1010
1043
  # Or combine them:
1011
1044
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1012
1045
  #
1013
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
1046
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
1014
1047
  #
1015
1048
  # - "1i" # length 1
1016
1049
  # - "10i" # length 10
1017
1050
  #
1018
- # @param index_column
1051
+ # @param index_column [Object]
1019
1052
  # Column used to group based on the time window.
1020
1053
  # Often to type Date/Datetime
1021
1054
  # This column must be sorted in ascending order. If not the output will not
1022
1055
  # make sense.
1023
1056
  #
1024
- # In case of a dynamic groupby on indices, dtype needs to be one of
1057
+ # In case of a dynamic group by on indices, dtype needs to be one of
1025
1058
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1026
1059
  # performance matters use an `:i64` column.
1027
- # @param every
1060
+ # @param every [Object]
1028
1061
  # Interval of the window.
1029
- # @param period
1062
+ # @param period [Object]
1030
1063
  # Length of the window. If nil, it is equal to `every`.
1031
- # @param offset
1064
+ # @param offset [Object]
1032
1065
  # Offset of the window. If nil and period is nil, it will be equal to negative
1033
1066
  # `every`.
1034
- # @param truncate
1067
+ # @param truncate [Boolean]
1035
1068
  # Truncate the time value to the window lower bound.
1036
- # @param include_boundaries
1069
+ # @param include_boundaries [Boolean]
1037
1070
  # Add the lower and upper bound of the window to the "_lower_bound" and
1038
1071
  # "_upper_bound" columns. This will impact performance because it's harder to
1039
1072
  # parallelize
1040
1073
  # @param closed ["right", "left", "both", "none"]
1041
1074
  # Define whether the temporal window interval is closed or not.
1042
- # @param by
1075
+ # @param by [Object]
1043
1076
  # Also group by this column/these columns
1077
+ # @param check_sorted [Boolean]
1078
+ # When the `by` argument is given, polars can not check sortedness
1079
+ # by the metadata and has to do a full scan on the index column to
1080
+ # verify data is sorted. This is expensive. If you are sure the
1081
+ # data within the by groups is sorted, you can set this to `false`.
1082
+ # Doing so incorrectly will lead to incorrect output.
1044
1083
  #
1045
1084
  # @return [LazyGroupBy]
1046
1085
  #
@@ -1072,7 +1111,7 @@ module Polars
1072
1111
  # # └─────────────────────┴─────┘
1073
1112
  #
1074
1113
  # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1075
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1114
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
1076
1115
  # [
1077
1116
  # Polars.col("time").min.alias("time_min"),
1078
1117
  # Polars.col("time").max.alias("time_max")
@@ -1092,7 +1131,7 @@ module Polars
1092
1131
  # # └─────────────────────┴─────────────────────┴─────────────────────┘
1093
1132
  #
1094
1133
  # @example The window boundaries can also be added to the aggregation result.
1095
- # df.groupby_dynamic(
1134
+ # df.group_by_dynamic(
1096
1135
  # "time", every: "1h", include_boundaries: true, closed: "right"
1097
1136
  # ).agg([Polars.col("time").count.alias("time_count")])
1098
1137
  # # =>
@@ -1109,27 +1148,27 @@ module Polars
1109
1148
  # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1110
1149
  #
1111
1150
  # @example When closed="left", should not include right end of interval.
1112
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1151
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
1113
1152
  # [
1114
1153
  # Polars.col("time").count.alias("time_count"),
1115
- # Polars.col("time").list.alias("time_agg_list")
1154
+ # Polars.col("time").alias("time_agg_list")
1116
1155
  # ]
1117
1156
  # )
1118
1157
  # # =>
1119
1158
  # # shape: (4, 3)
1120
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1121
- # # │ time ┆ time_count ┆ time_agg_list │
1122
- # # │ --- ┆ --- ┆ --- │
1123
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1124
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1125
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1126
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1127
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1128
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1129
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1159
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
1160
+ # # │ time ┆ time_count ┆ time_agg_list │
1161
+ # # │ --- ┆ --- ┆ --- │
1162
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1163
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
1164
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
1165
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
1166
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
1167
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1168
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
1130
1169
  #
1131
1170
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1132
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1171
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
1133
1172
  # [Polars.col("time").count.alias("time_count")]
1134
1173
  # )
1135
1174
  # # =>
@@ -1146,7 +1185,7 @@ module Polars
1146
1185
  # # │ 2021-12-16 03:00:00 ┆ 1 │
1147
1186
  # # └─────────────────────┴────────────┘
1148
1187
  #
1149
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
1188
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
1150
1189
  # df = Polars::DataFrame.new(
1151
1190
  # {
1152
1191
  # "time" => Polars.date_range(
@@ -1157,7 +1196,7 @@ module Polars
1157
1196
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1158
1197
  # }
1159
1198
  # )
1160
- # df.groupby_dynamic(
1199
+ # df.group_by_dynamic(
1161
1200
  # "time",
1162
1201
  # every: "1h",
1163
1202
  # closed: "both",
@@ -1180,20 +1219,20 @@ module Polars
1180
1219
  # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1181
1220
  # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1182
1221
  #
1183
- # @example Dynamic groupby on an index column.
1222
+ # @example Dynamic group by on an index column.
1184
1223
  # df = Polars::DataFrame.new(
1185
1224
  # {
1186
1225
  # "idx" => Polars.arange(0, 6, eager: true),
1187
1226
  # "A" => ["A", "A", "B", "B", "B", "C"]
1188
1227
  # }
1189
1228
  # )
1190
- # df.groupby_dynamic(
1229
+ # df.group_by_dynamic(
1191
1230
  # "idx",
1192
1231
  # every: "2i",
1193
1232
  # period: "3i",
1194
1233
  # include_boundaries: true,
1195
1234
  # closed: "right"
1196
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
1235
+ # ).agg(Polars.col("A").alias("A_agg_list"))
1197
1236
  # # =>
1198
1237
  # # shape: (3, 4)
1199
1238
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -1205,23 +1244,26 @@ module Polars
1205
1244
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1206
1245
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1207
1246
  # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1208
- def groupby_dynamic(
1247
+ def group_by_dynamic(
1209
1248
  index_column,
1210
1249
  every:,
1211
1250
  period: nil,
1212
1251
  offset: nil,
1213
- truncate: true,
1252
+ truncate: nil,
1214
1253
  include_boundaries: false,
1215
1254
  closed: "left",
1255
+ label: "left",
1216
1256
  by: nil,
1217
- start_by: "window"
1257
+ start_by: "window",
1258
+ check_sorted: true
1218
1259
  )
1260
+ if !truncate.nil?
1261
+ label = truncate ? "left" : "datapoint"
1262
+ end
1263
+
1264
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1219
1265
  if offset.nil?
1220
- if period.nil?
1221
- offset = "-#{every}"
1222
- else
1223
- offset = "0ns"
1224
- end
1266
+ offset = period.nil? ? "-#{every}" : "0ns"
1225
1267
  end
1226
1268
 
1227
1269
  if period.nil?
@@ -1233,19 +1275,21 @@ module Polars
1233
1275
  every = Utils._timedelta_to_pl_duration(every)
1234
1276
 
1235
1277
  rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1236
- lgb = _ldf.groupby_dynamic(
1237
- index_column,
1278
+ lgb = _ldf.group_by_dynamic(
1279
+ index_column._rbexpr,
1238
1280
  every,
1239
1281
  period,
1240
1282
  offset,
1241
- truncate,
1283
+ label,
1242
1284
  include_boundaries,
1243
1285
  closed,
1244
1286
  rbexprs_by,
1245
- start_by
1287
+ start_by,
1288
+ check_sorted
1246
1289
  )
1247
- LazyGroupBy.new(lgb, self.class)
1290
+ LazyGroupBy.new(lgb)
1248
1291
  end
1292
+ alias_method :groupby_dynamic, :group_by_dynamic
1249
1293
 
1250
1294
  # Perform an asof join.
1251
1295
  #
@@ -1351,7 +1395,7 @@ module Polars
1351
1395
  if by.is_a?(String)
1352
1396
  by_left_ = [by]
1353
1397
  by_right_ = [by]
1354
- elsif by.is_a?(Array)
1398
+ elsif by.is_a?(::Array)
1355
1399
  by_left_ = by
1356
1400
  by_right_ = by
1357
1401
  end
@@ -1619,7 +1663,7 @@ module Polars
1619
1663
  # # │ null │
1620
1664
  # # └──────┘
1621
1665
  def with_context(other)
1622
- if !other.is_a?(Array)
1666
+ if !other.is_a?(::Array)
1623
1667
  other = [other]
1624
1668
  end
1625
1669
 
@@ -1705,8 +1749,10 @@ module Polars
1705
1749
 
1706
1750
  # Shift the values by a given period.
1707
1751
  #
1708
- # @param periods [Integer]
1752
+ # @param n [Integer]
1709
1753
  # Number of places to shift (may be negative).
1754
+ # @param fill_value [Object]
1755
+ # Fill the resulting null values with this value.
1710
1756
  #
1711
1757
  # @return [LazyFrame]
1712
1758
  #
@@ -1743,8 +1789,12 @@ module Polars
1743
1789
  # # │ 5 ┆ 6 │
1744
1790
  # # │ null ┆ null │
1745
1791
  # # └──────┴──────┘
1746
- def shift(periods)
1747
- _from_rbldf(_ldf.shift(periods))
1792
+ def shift(n, fill_value: nil)
1793
+ if !fill_value.nil?
1794
+ fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
1795
+ end
1796
+ n = Utils.parse_as_expression(n)
1797
+ _from_rbldf(_ldf.shift(n, fill_value))
1748
1798
  end
1749
1799
 
1750
1800
  # Shift the values by a given period and fill the resulting null values.
@@ -1790,10 +1840,7 @@ module Polars
1790
1840
  # # │ 0 ┆ 0 │
1791
1841
  # # └─────┴─────┘
1792
1842
  def shift_and_fill(periods, fill_value)
1793
- if !fill_value.is_a?(Expr)
1794
- fill_value = Polars.lit(fill_value)
1795
- end
1796
- _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1843
+ shift(periods, fill_value: fill_value)
1797
1844
  end
1798
1845
 
1799
1846
  # Get a slice of this DataFrame.
@@ -2228,7 +2275,7 @@ module Polars
2228
2275
  #
2229
2276
  # @return [LazyFrame]
2230
2277
  def unique(maintain_order: true, subset: nil, keep: "first")
2231
- if !subset.nil? && !subset.is_a?(Array)
2278
+ if !subset.nil? && !subset.is_a?(::Array)
2232
2279
  subset = [subset]
2233
2280
  end
2234
2281
  _from_rbldf(_ldf.unique(maintain_order, subset, keep))
@@ -2261,7 +2308,7 @@ module Polars
2261
2308
  # # │ 3 ┆ 8 ┆ c │
2262
2309
  # # └─────┴─────┴─────┘
2263
2310
  def drop_nulls(subset: nil)
2264
- if !subset.nil? && !subset.is_a?(Array)
2311
+ if !subset.nil? && !subset.is_a?(::Array)
2265
2312
  subset = [subset]
2266
2313
  end
2267
2314
  _from_rbldf(_ldf.drop_nulls(subset))
@@ -2351,16 +2398,16 @@ module Polars
2351
2398
  # df.interpolate.collect
2352
2399
  # # =>
2353
2400
  # # shape: (4, 3)
2354
- # # ┌─────┬──────┬─────┐
2355
- # # │ foo ┆ bar ┆ baz │
2356
- # # │ --- ┆ --- ┆ --- │
2357
- # # │ i64 ┆ i64 ┆ i64 │
2358
- # # ╞═════╪══════╪═════╡
2359
- # # │ 1 ┆ 6 ┆ 1 │
2360
- # # │ 5 ┆ 7 ┆ 3 │
2361
- # # │ 9 ┆ 9 ┆ 6 │
2362
- # # │ 10 ┆ null ┆ 9 │
2363
- # # └─────┴──────┴─────┘
2401
+ # # ┌──────┬──────┬──────────┐
2402
+ # # │ foo ┆ bar ┆ baz │
2403
+ # # │ --- ┆ --- ┆ --- │
2404
+ # # │ f64 ┆ f64 ┆ f64 │
2405
+ # # ╞══════╪══════╪══════════╡
2406
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
2407
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
2408
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
2409
+ # # │ 10.0 ┆ null ┆ 9.0 │
2410
+ # # └──────┴──────┴──────────┘
2364
2411
  def interpolate
2365
2412
  select(Utils.col("*").interpolate)
2366
2413
  end
@@ -2423,6 +2470,38 @@ module Polars
2423
2470
  _from_rbldf(_ldf.unnest(names))
2424
2471
  end
2425
2472
 
2473
+ # TODO
2474
+ # def merge_sorted
2475
+ # end
2476
+
2477
+ # Indicate that one or multiple columns are sorted.
2478
+ #
2479
+ # @param column [Object]
2480
+ # Columns that are sorted
2481
+ # @param more_columns [Object]
2482
+ # Additional columns that are sorted, specified as positional arguments.
2483
+ # @param descending [Boolean]
2484
+ # Whether the columns are sorted in descending order.
2485
+ #
2486
+ # @return [LazyFrame]
2487
+ def set_sorted(
2488
+ column,
2489
+ *more_columns,
2490
+ descending: false
2491
+ )
2492
+ columns = Utils.selection_to_rbexpr_list(column)
2493
+ if more_columns.any?
2494
+ columns.concat(Utils.selection_to_rbexpr_list(more_columns))
2495
+ end
2496
+ with_columns(
2497
+ columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
2498
+ )
2499
+ end
2500
+
2501
+ # TODO
2502
+ # def update
2503
+ # end
2504
+
2426
2505
  private
2427
2506
 
2428
2507
  def initialize_copy(other)