polars-df 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -149,8 +149,19 @@ module Polars
149
149
  # def self.from_json
150
150
  # end
151
151
 
152
- # def self.read_json
153
- # end
152
+ # Read a logical plan from a JSON file to construct a LazyFrame.
153
+ #
154
+ # @param file [String]
155
+ # Path to a file or a file-like object.
156
+ #
157
+ # @return [LazyFrame]
158
+ def self.read_json(file)
159
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
160
+ file = Utils.format_path(file)
161
+ end
162
+
163
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
164
+ end
154
165
 
155
166
  # Get or set column names.
156
167
  #
@@ -187,7 +198,7 @@ module Polars
187
198
  # }
188
199
  # ).lazy
189
200
  # lf.dtypes
190
- # # => [:i64, :f64, :str]
201
+ # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
191
202
  def dtypes
192
203
  _ldf.dtypes
193
204
  end
@@ -205,7 +216,7 @@ module Polars
205
216
  # }
206
217
  # ).lazy
207
218
  # lf.schema
208
- # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
219
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
209
220
  def schema
210
221
  _ldf.schema
211
222
  end
@@ -245,11 +256,57 @@ module Polars
245
256
  EOS
246
257
  end
247
258
 
248
- # def write_json
249
- # end
259
+ # Write the logical plan of this LazyFrame to a file or string in JSON format.
260
+ #
261
+ # @param file [String]
262
+ # File path to which the result should be written.
263
+ #
264
+ # @return [nil]
265
+ def write_json(file)
266
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
267
+ file = Utils.format_path(file)
268
+ end
269
+ _ldf.write_json(file)
270
+ nil
271
+ end
250
272
 
251
- # def pipe
252
- # end
273
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
274
+ #
275
+ # @param func [Object]
276
+ # Callable; will receive the frame as the first parameter,
277
+ # followed by any given args/kwargs.
278
+ # @param args [Object]
279
+ # Arguments to pass to the UDF.
280
+ # @param kwargs [Object]
281
+ # Keyword arguments to pass to the UDF.
282
+ #
283
+ # @return [LazyFrame]
284
+ #
285
+ # @example
286
+ # cast_str_to_int = lambda do |data, col_name:|
287
+ # data.with_column(Polars.col(col_name).cast(:i64))
288
+ # end
289
+ #
290
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
291
+ # df.pipe(cast_str_to_int, col_name: "b").collect()
292
+ # # =>
293
+ # # shape: (4, 2)
294
+ # # ┌─────┬─────┐
295
+ # # │ a ┆ b │
296
+ # # │ --- ┆ --- │
297
+ # # │ i64 ┆ i64 │
298
+ # # ╞═════╪═════╡
299
+ # # │ 1 ┆ 10 │
300
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
301
+ # # │ 2 ┆ 20 │
302
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
303
+ # # │ 3 ┆ 30 │
304
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
305
+ # # │ 4 ┆ 40 │
306
+ # # └─────┴─────┘
307
+ def pipe(func, *args, **kwargs, &block)
308
+ func.call(self, *args, **kwargs, &block)
309
+ end
253
310
 
254
311
  # Create a string representation of the unoptimized query plan.
255
312
  #
@@ -261,8 +318,27 @@ module Polars
261
318
  # Create a string representation of the optimized query plan.
262
319
  #
263
320
  # @return [String]
264
- # def describe_optimized_plan
265
- # end
321
+ def describe_optimized_plan(
322
+ type_coercion: true,
323
+ predicate_pushdown: true,
324
+ projection_pushdown: true,
325
+ simplify_expression: true,
326
+ slice_pushdown: true,
327
+ common_subplan_elimination: true,
328
+ allow_streaming: false
329
+ )
330
+ ldf = _ldf.optimization_toggle(
331
+ type_coercion,
332
+ predicate_pushdown,
333
+ projection_pushdown,
334
+ simplify_expression,
335
+ slice_pushdown,
336
+ common_subplan_elimination,
337
+ allow_streaming,
338
+ )
339
+
340
+ ldf.describe_optimized_plan
341
+ end
266
342
 
267
343
  # def show_graph
268
344
  # end
@@ -726,14 +802,544 @@ module Polars
726
802
  LazyGroupBy.new(lgb, self.class)
727
803
  end
728
804
 
729
- # def groupby_rolling
730
- # end
805
+ # Create rolling groups based on a time column.
806
+ #
807
+ # Also works for index values of type `:i32` or `:i64`.
808
+ #
809
+ # Different from a `groupby_dynamic` the windows are now determined by the
810
+ # individual values and are not of constant intervals. For constant intervals
811
+ # use *groupby_dynamic*.
812
+ #
813
+ # The `period` and `offset` arguments are created either from a timedelta, or
814
+ # by using the following string language:
815
+ #
816
+ # - 1ns (1 nanosecond)
817
+ # - 1us (1 microsecond)
818
+ # - 1ms (1 millisecond)
819
+ # - 1s (1 second)
820
+ # - 1m (1 minute)
821
+ # - 1h (1 hour)
822
+ # - 1d (1 day)
823
+ # - 1w (1 week)
824
+ # - 1mo (1 calendar month)
825
+ # - 1y (1 calendar year)
826
+ # - 1i (1 index count)
827
+ #
828
+ # Or combine them:
829
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
830
+ #
831
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
832
+ #
833
+ # - "1i" # length 1
834
+ # - "10i" # length 10
835
+ #
836
+ # @param index_column [Object]
837
+ # Column used to group based on the time window.
838
+ # Often of type Date/Datetime
839
+ # This column must be sorted in ascending order. If not the output will not
840
+ # make sense.
841
+ #
842
+ # In case of a rolling groupby on indices, dtype needs to be one of
843
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
844
+ # performance matters use an `:i64` column.
845
+ # @param period [Object]
846
+ # Length of the window.
847
+ # @param offset [Object]
848
+ # Offset of the window. Default is -period.
849
+ # @param closed ["right", "left", "both", "none"]
850
+ # Define whether the temporal window interval is closed or not.
851
+ # @param by [Object]
852
+ # Also group by this column/these columns.
853
+ #
854
+ # @return [LazyGroupBy]
855
+ #
856
+ # @example
857
+ # dates = [
858
+ # "2020-01-01 13:45:48",
859
+ # "2020-01-01 16:42:13",
860
+ # "2020-01-01 16:45:09",
861
+ # "2020-01-02 18:12:48",
862
+ # "2020-01-03 19:45:32",
863
+ # "2020-01-08 23:16:43"
864
+ # ]
865
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
866
+ # Polars.col("dt").str.strptime(:datetime)
867
+ # )
868
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
869
+ # [
870
+ # Polars.sum("a").alias("sum_a"),
871
+ # Polars.min("a").alias("min_a"),
872
+ # Polars.max("a").alias("max_a")
873
+ # ]
874
+ # )
875
+ # # =>
876
+ # # shape: (6, 4)
877
+ # # ┌─────────────────────┬───────┬───────┬───────┐
878
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
879
+ # # │ --- ┆ --- ┆ --- ┆ --- │
880
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
881
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
882
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
883
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
884
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
885
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
886
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
887
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
888
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
889
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
890
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
891
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
892
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
893
+ # # └─────────────────────┴───────┴───────┴───────┘
894
+ def groupby_rolling(
895
+ index_column:,
896
+ period:,
897
+ offset: nil,
898
+ closed: "right",
899
+ by: nil
900
+ )
901
+ if offset.nil?
902
+ offset = "-#{period}"
903
+ end
731
904
 
732
- # def groupby_dynamic
733
- # end
905
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
906
+ period = Utils._timedelta_to_pl_duration(period)
907
+ offset = Utils._timedelta_to_pl_duration(offset)
734
908
 
735
- # def join_asof
736
- # end
909
+ lgb = _ldf.groupby_rolling(
910
+ index_column, period, offset, closed, rbexprs_by
911
+ )
912
+ LazyGroupBy.new(lgb, self.class)
913
+ end
914
+
915
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
916
+ #
917
+ # Time windows are calculated and rows are assigned to windows. Different from a
918
+ # normal groupby is that a row can be member of multiple groups. The time/index
919
+ # window could be seen as a rolling window, with a window size determined by
920
+ # dates/times/values instead of slots in the DataFrame.
921
+ #
922
+ # A window is defined by:
923
+ #
924
+ # - every: interval of the window
925
+ # - period: length of the window
926
+ # - offset: offset of the window
927
+ #
928
+ # The `every`, `period` and `offset` arguments are created with
929
+ # the following string language:
930
+ #
931
+ # - 1ns (1 nanosecond)
932
+ # - 1us (1 microsecond)
933
+ # - 1ms (1 millisecond)
934
+ # - 1s (1 second)
935
+ # - 1m (1 minute)
936
+ # - 1h (1 hour)
937
+ # - 1d (1 day)
938
+ # - 1w (1 week)
939
+ # - 1mo (1 calendar month)
940
+ # - 1y (1 calendar year)
941
+ # - 1i (1 index count)
942
+ #
943
+ # Or combine them:
944
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
945
+ #
946
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
947
+ #
948
+ # - "1i" # length 1
949
+ # - "10i" # length 10
950
+ #
951
+ # @param index_column
952
+ # Column used to group based on the time window.
953
+ # Often of type Date/Datetime
954
+ # This column must be sorted in ascending order. If not the output will not
955
+ # make sense.
956
+ #
957
+ # In case of a dynamic groupby on indices, dtype needs to be one of
958
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
959
+ # performance matters use an `:i64` column.
960
+ # @param every
961
+ # Interval of the window.
962
+ # @param period
963
+ # Length of the window, if None it is equal to 'every'.
964
+ # @param offset
965
+ # Offset of the window if None and period is None it will be equal to negative
966
+ # `every`.
967
+ # @param truncate
968
+ # Truncate the time value to the window lower bound.
969
+ # @param include_boundaries
970
+ # Add the lower and upper bound of the window to the "_lower_bound" and
971
+ # "_upper_bound" columns. This will impact performance because it's harder to
972
+ # parallelize
973
+ # @param closed ["right", "left", "both", "none"]
974
+ # Define whether the temporal window interval is closed or not.
975
+ # @param by
976
+ # Also group by this column/these columns
977
+ #
978
+ # @return [LazyGroupBy]
979
+ #
980
+ # @example
981
+ # df = Polars::DataFrame.new(
982
+ # {
983
+ # "time" => Polars.date_range(
984
+ # DateTime.new(2021, 12, 16),
985
+ # DateTime.new(2021, 12, 16, 3),
986
+ # "30m"
987
+ # ),
988
+ # "n" => 0..6
989
+ # }
990
+ # )
991
+ # # =>
992
+ # # shape: (7, 2)
993
+ # # ┌─────────────────────┬─────┐
994
+ # # │ time ┆ n │
995
+ # # │ --- ┆ --- │
996
+ # # │ datetime[μs] ┆ i64 │
997
+ # # ╞═════════════════════╪═════╡
998
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
999
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1000
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
1001
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1002
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
1003
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1004
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
1005
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1006
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
1007
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1008
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
1009
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
1010
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
1011
+ # # └─────────────────────┴─────┘
1012
+ #
1013
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1014
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1015
+ # [
1016
+ # Polars.col("time").min.alias("time_min"),
1017
+ # Polars.col("time").max.alias("time_max")
1018
+ # ]
1019
+ # )
1020
+ # # =>
1021
+ # # shape: (4, 3)
1022
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
1023
+ # # │ time ┆ time_min ┆ time_max │
1024
+ # # │ --- ┆ --- ┆ --- │
1025
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
1026
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
1027
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
1028
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1029
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
1030
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1031
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
1032
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1033
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
1034
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
1035
+ #
1036
+ # @example The window boundaries can also be added to the aggregation result.
1037
+ # df.groupby_dynamic(
1038
+ # "time", every: "1h", include_boundaries: true, closed: "right"
1039
+ # ).agg([Polars.col("time").count.alias("time_count")])
1040
+ # # =>
1041
+ # # shape: (4, 4)
1042
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1043
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1044
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1045
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1046
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1047
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1048
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1049
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
1050
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1051
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1052
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1053
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1054
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1055
+ #
1056
+ # @example When closed="left", should not include right end of interval.
1057
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1058
+ # [
1059
+ # Polars.col("time").count.alias("time_count"),
1060
+ # Polars.col("time").list.alias("time_agg_list")
1061
+ # ]
1062
+ # )
1063
+ # # =>
1064
+ # # shape: (4, 3)
1065
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1066
+ # # │ time ┆ time_count ┆ time_agg_list │
1067
+ # # │ --- ┆ --- ┆ --- │
1068
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1069
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1070
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
1071
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1072
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
1073
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1074
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
1075
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1076
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1077
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1078
+ #
1079
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1080
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1081
+ # [Polars.col("time").count.alias("time_count")]
1082
+ # )
1083
+ # # =>
1084
+ # # shape: (5, 2)
1085
+ # # ┌─────────────────────┬────────────┐
1086
+ # # │ time ┆ time_count │
1087
+ # # │ --- ┆ --- │
1088
+ # # │ datetime[μs] ┆ u32 │
1089
+ # # ╞═════════════════════╪════════════╡
1090
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
1091
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1092
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
1093
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1094
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
1095
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1096
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
1097
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1098
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
1099
+ # # └─────────────────────┴────────────┘
1100
+ #
1101
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
1102
+ # df = Polars::DataFrame.new(
1103
+ # {
1104
+ # "time" => Polars.date_range(
1105
+ # DateTime.new(2021, 12, 16),
1106
+ # DateTime.new(2021, 12, 16, 3),
1107
+ # "30m"
1108
+ # ),
1109
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1110
+ # }
1111
+ # )
1112
+ # df.groupby_dynamic(
1113
+ # "time",
1114
+ # every: "1h",
1115
+ # closed: "both",
1116
+ # by: "groups",
1117
+ # include_boundaries: true
1118
+ # ).agg([Polars.col("time").count.alias("time_count")])
1119
+ # # =>
1120
+ # # shape: (7, 5)
1121
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
1122
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
1123
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1124
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
1125
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
1126
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
1127
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1128
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
1129
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1130
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
1131
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1132
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
1133
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1134
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
1135
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1136
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
1137
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
1138
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1139
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1140
+ #
1141
+ # @example Dynamic groupby on an index column.
1142
+ # df = Polars::DataFrame.new(
1143
+ # {
1144
+ # "idx" => Polars.arange(0, 6, eager: true),
1145
+ # "A" => ["A", "A", "B", "B", "B", "C"]
1146
+ # }
1147
+ # )
1148
+ # df.groupby_dynamic(
1149
+ # "idx",
1150
+ # every: "2i",
1151
+ # period: "3i",
1152
+ # include_boundaries: true,
1153
+ # closed: "right"
1154
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
1155
+ # # =>
1156
+ # # shape: (3, 4)
1157
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
1158
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
1159
+ # # │ --- ┆ --- ┆ --- ┆ --- │
1160
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
1161
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
1162
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
1163
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1164
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1165
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
1166
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1167
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1168
+ def groupby_dynamic(
1169
+ index_column,
1170
+ every:,
1171
+ period: nil,
1172
+ offset: nil,
1173
+ truncate: true,
1174
+ include_boundaries: false,
1175
+ closed: "left",
1176
+ by: nil,
1177
+ start_by: "window"
1178
+ )
1179
+ if offset.nil?
1180
+ if period.nil?
1181
+ offset = "-#{every}"
1182
+ else
1183
+ offset = "0ns"
1184
+ end
1185
+ end
1186
+
1187
+ if period.nil?
1188
+ period = every
1189
+ end
1190
+
1191
+ period = Utils._timedelta_to_pl_duration(period)
1192
+ offset = Utils._timedelta_to_pl_duration(offset)
1193
+ every = Utils._timedelta_to_pl_duration(every)
1194
+
1195
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1196
+ lgb = _ldf.groupby_dynamic(
1197
+ index_column,
1198
+ every,
1199
+ period,
1200
+ offset,
1201
+ truncate,
1202
+ include_boundaries,
1203
+ closed,
1204
+ rbexprs_by,
1205
+ start_by
1206
+ )
1207
+ LazyGroupBy.new(lgb, self.class)
1208
+ end
1209
+
1210
+ # Perform an asof join.
1211
+ #
1212
+ # This is similar to a left-join except that we match on nearest key rather than
1213
+ # equal keys.
1214
+ #
1215
+ # Both DataFrames must be sorted by the join_asof key.
1216
+ #
1217
+ # For each row in the left DataFrame:
1218
+ #
1219
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
1220
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
1221
+ #
1222
+ # The default is "backward".
1223
+ #
1224
+ # @param other [LazyFrame]
1225
+ # Lazy DataFrame to join with.
1226
+ # @param left_on [String]
1227
+ # Join column of the left DataFrame.
1228
+ # @param right_on [String]
1229
+ # Join column of the right DataFrame.
1230
+ # @param on [String]
1231
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1232
+ # nil.
1233
+ # @param by [Object]
1234
+ # Join on these columns before doing asof join.
1235
+ # @param by_left [Object]
1236
+ # Join on these columns before doing asof join.
1237
+ # @param by_right [Object]
1238
+ # Join on these columns before doing asof join.
1239
+ # @param strategy ["backward", "forward"]
1240
+ # Join strategy.
1241
+ # @param suffix [String]
1242
+ # Suffix to append to columns with a duplicate name.
1243
+ # @param tolerance [Object]
1244
+ # Numeric tolerance. By setting this the join will only be done if the near
1245
+ # keys are within this distance. If an asof join is done on columns of dtype
1246
+ # "Date", "Datetime", "Duration" or "Time" you use the following string
1247
+ # language:
1248
+ #
1249
+ # - 1ns (1 nanosecond)
1250
+ # - 1us (1 microsecond)
1251
+ # - 1ms (1 millisecond)
1252
+ # - 1s (1 second)
1253
+ # - 1m (1 minute)
1254
+ # - 1h (1 hour)
1255
+ # - 1d (1 day)
1256
+ # - 1w (1 week)
1257
+ # - 1mo (1 calendar month)
1258
+ # - 1y (1 calendar year)
1259
+ # - 1i (1 index count)
1260
+ #
1261
+ # Or combine them:
1262
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1263
+ #
1264
+ # @param allow_parallel [Boolean]
1265
+ # Allow the physical plan to optionally evaluate the computation of both
1266
+ # DataFrames up to the join in parallel.
1267
+ # @param force_parallel [Boolean]
1268
+ # Force the physical plan to evaluate the computation of both DataFrames up to
1269
+ # the join in parallel.
1270
+ #
1271
+ # @return [LazyFrame]
1272
+ def join_asof(
1273
+ other,
1274
+ left_on: nil,
1275
+ right_on: nil,
1276
+ on: nil,
1277
+ by_left: nil,
1278
+ by_right: nil,
1279
+ by: nil,
1280
+ strategy: "backward",
1281
+ suffix: "_right",
1282
+ tolerance: nil,
1283
+ allow_parallel: true,
1284
+ force_parallel: false
1285
+ )
1286
+ if !other.is_a?(LazyFrame)
1287
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1288
+ end
1289
+
1290
+ if on.is_a?(String)
1291
+ left_on = on
1292
+ right_on = on
1293
+ end
1294
+
1295
+ if left_on.nil? || right_on.nil?
1296
+ raise ArgumentError, "You should pass the column to join on as an argument."
1297
+ end
1298
+
1299
+ if by_left.is_a?(String) || by_left.is_a?(Expr)
1300
+ by_left_ = [by_left]
1301
+ else
1302
+ by_left_ = by_left
1303
+ end
1304
+
1305
+ if by_right.is_a?(String) || by_right.is_a?(Expr)
1306
+ by_right_ = [by_right]
1307
+ else
1308
+ by_right_ = by_right
1309
+ end
1310
+
1311
+ if by.is_a?(String)
1312
+ by_left_ = [by]
1313
+ by_right_ = [by]
1314
+ elsif by.is_a?(Array)
1315
+ by_left_ = by
1316
+ by_right_ = by
1317
+ end
1318
+
1319
+ tolerance_str = nil
1320
+ tolerance_num = nil
1321
+ if tolerance.is_a?(String)
1322
+ tolerance_str = tolerance
1323
+ else
1324
+ tolerance_num = tolerance
1325
+ end
1326
+
1327
+ _from_rbldf(
1328
+ _ldf.join_asof(
1329
+ other._ldf,
1330
+ Polars.col(left_on)._rbexpr,
1331
+ Polars.col(right_on)._rbexpr,
1332
+ by_left_,
1333
+ by_right_,
1334
+ allow_parallel,
1335
+ force_parallel,
1336
+ suffix,
1337
+ strategy,
1338
+ tolerance_num,
1339
+ tolerance_str
1340
+ )
1341
+ )
1342
+ end
737
1343
 
738
1344
  # Add a join operation to the Logical Plan.
739
1345
  #
@@ -953,8 +1559,44 @@ module Polars
953
1559
  _from_rbldf(_ldf.with_columns(rbexprs))
954
1560
  end
955
1561
 
956
- # def with_context
957
- # end
1562
+ # Add an external context to the computation graph.
1563
+ #
1564
+ # This allows expressions to also access columns from DataFrames
1565
+ # that are not part of this one.
1566
+ #
1567
+ # @param other [Object]
1568
+ # Lazy DataFrame to join with.
1569
+ #
1570
+ # @return [LazyFrame]
1571
+ #
1572
+ # @example
1573
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
1574
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
1575
+ # (
1576
+ # df_a.with_context(df_other.lazy).select(
1577
+ # [Polars.col("b") + Polars.col("c").first]
1578
+ # )
1579
+ # ).collect
1580
+ # # =>
1581
+ # # shape: (3, 1)
1582
+ # # ┌──────┐
1583
+ # # │ b │
1584
+ # # │ --- │
1585
+ # # │ str │
1586
+ # # ╞══════╡
1587
+ # # │ afoo │
1588
+ # # ├╌╌╌╌╌╌┤
1589
+ # # │ cfoo │
1590
+ # # ├╌╌╌╌╌╌┤
1591
+ # # │ null │
1592
+ # # └──────┘
1593
+ def with_context(other)
1594
+ if !other.is_a?(Array)
1595
+ other = [other]
1596
+ end
1597
+
1598
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
1599
+ end
958
1600
 
959
1601
  # Add or overwrite column in a DataFrame.
960
1602
  #
@@ -1231,8 +1873,43 @@ module Polars
1231
1873
  slice(0, 1)
1232
1874
  end
1233
1875
 
1234
- # def with_row_count
1235
- # end
1876
+ # Add a column at index 0 that counts the rows.
1877
+ #
1878
+ # @param name [String]
1879
+ # Name of the column to add.
1880
+ # @param offset [Integer]
1881
+ # Start the row count at this offset.
1882
+ #
1883
+ # @return [LazyFrame]
1884
+ #
1885
+ # @note
1886
+ # This can have a negative effect on query performance.
1887
+ # This may, for instance, block predicate pushdown optimization.
1888
+ #
1889
+ # @example
1890
+ # df = Polars::DataFrame.new(
1891
+ # {
1892
+ # "a" => [1, 3, 5],
1893
+ # "b" => [2, 4, 6]
1894
+ # }
1895
+ # ).lazy
1896
+ # df.with_row_count.collect
1897
+ # # =>
1898
+ # # shape: (3, 3)
1899
+ # # ┌────────┬─────┬─────┐
1900
+ # # │ row_nr ┆ a ┆ b │
1901
+ # # │ --- ┆ --- ┆ --- │
1902
+ # # │ u32 ┆ i64 ┆ i64 │
1903
+ # # ╞════════╪═════╪═════╡
1904
+ # # │ 0 ┆ 1 ┆ 2 │
1905
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1906
+ # # │ 1 ┆ 3 ┆ 4 │
1907
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
1908
+ # # │ 2 ┆ 5 ┆ 6 │
1909
+ # # └────────┴─────┴─────┘
1910
+ def with_row_count(name: "row_nr", offset: 0)
1911
+ _from_rbldf(_ldf.with_row_count(name, offset))
1912
+ end
1236
1913
 
1237
1914
  # Take every nth row in the LazyFrame and return as a new LazyFrame.
1238
1915
  #
@@ -1490,7 +2167,8 @@ module Polars
1490
2167
  # # │ 3.0 ┆ 1.0 │
1491
2168
  # # └─────┴─────┘
1492
2169
  def quantile(quantile, interpolation: "nearest")
1493
- _from_rbldf(_ldf.quantile(quantile, interpolation))
2170
+ quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
2171
+ _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
1494
2172
  end
1495
2173
 
1496
2174
  # Explode lists to long format.
@@ -1554,11 +2232,106 @@ module Polars
1554
2232
  _from_rbldf(_ldf.unique(maintain_order, subset, keep))
1555
2233
  end
1556
2234
 
1557
- # def drop_nulls
1558
- # end
2235
+ # Drop rows with null values from this LazyFrame.
2236
+ #
2237
+ # @param subset [Object]
2238
+ # Subset of column(s) on which `drop_nulls` will be applied.
2239
+ #
2240
+ # @return [LazyFrame]
2241
+ #
2242
+ # @example
2243
+ # df = Polars::DataFrame.new(
2244
+ # {
2245
+ # "foo" => [1, 2, 3],
2246
+ # "bar" => [6, nil, 8],
2247
+ # "ham" => ["a", "b", "c"]
2248
+ # }
2249
+ # )
2250
+ # df.lazy.drop_nulls.collect
2251
+ # # =>
2252
+ # # shape: (2, 3)
2253
+ # # ┌─────┬─────┬─────┐
2254
+ # # │ foo ┆ bar ┆ ham │
2255
+ # # │ --- ┆ --- ┆ --- │
2256
+ # # │ i64 ┆ i64 ┆ str │
2257
+ # # ╞═════╪═════╪═════╡
2258
+ # # │ 1 ┆ 6 ┆ a │
2259
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
2260
+ # # │ 3 ┆ 8 ┆ c │
2261
+ # # └─────┴─────┴─────┘
2262
+ def drop_nulls(subset: nil)
2263
+ if !subset.nil? && !subset.is_a?(Array)
2264
+ subset = [subset]
2265
+ end
2266
+ _from_rbldf(_ldf.drop_nulls(subset))
2267
+ end
1559
2268
 
1560
- # def melt
1561
- # end
2269
+ # Unpivot a DataFrame from wide to long format.
2270
+ #
2271
+ # Optionally leaves identifiers set.
2272
+ #
2273
+ # This function is useful to massage a DataFrame into a format where one or more
2274
+ # columns are identifier variables (id_vars), while all other columns, considered
2275
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
2276
+ # two non-identifier columns, 'variable' and 'value'.
2277
+ #
2278
+ # @param id_vars [Object]
2279
+ # Columns to use as identifier variables.
2280
+ # @param value_vars [Object]
2281
+ # Values to use as identifier variables.
2282
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
2283
+ # @param variable_name [String]
2284
+ # Name to give to the `value` column. Defaults to "variable"
2285
+ # @param value_name [String]
2286
+ # Name to give to the `value` column. Defaults to "value"
2287
+ #
2288
+ # @return [LazyFrame]
2289
+ #
2290
+ # @example
2291
+ # df = Polars::DataFrame.new(
2292
+ # {
2293
+ # "a" => ["x", "y", "z"],
2294
+ # "b" => [1, 3, 5],
2295
+ # "c" => [2, 4, 6]
2296
+ # }
2297
+ # ).lazy
2298
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
2299
+ # # =>
2300
+ # # shape: (6, 3)
2301
+ # # ┌─────┬──────────┬───────┐
2302
+ # # │ a ┆ variable ┆ value │
2303
+ # # │ --- ┆ --- ┆ --- │
2304
+ # # │ str ┆ str ┆ i64 │
2305
+ # # ╞═════╪══════════╪═══════╡
2306
+ # # │ x ┆ b ┆ 1 │
2307
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2308
+ # # │ y ┆ b ┆ 3 │
2309
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2310
+ # # │ z ┆ b ┆ 5 │
2311
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2312
+ # # │ x ┆ c ┆ 2 │
2313
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2314
+ # # │ y ┆ c ┆ 4 │
2315
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
2316
+ # # │ z ┆ c ┆ 6 │
2317
+ # # └─────┴──────────┴───────┘
2318
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
2319
+ if value_vars.is_a?(String)
2320
+ value_vars = [value_vars]
2321
+ end
2322
+ if id_vars.is_a?(String)
2323
+ id_vars = [id_vars]
2324
+ end
2325
+ if value_vars.nil?
2326
+ value_vars = []
2327
+ end
2328
+ if id_vars.nil?
2329
+ id_vars = []
2330
+ end
2331
+ _from_rbldf(
2332
+ _ldf.melt(id_vars, value_vars, value_name, variable_name)
2333
+ )
2334
+ end
1562
2335
 
1563
2336
  # def map
1564
2337
  # end