polars-df 0.1.4 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.lock +430 -217
- data/Cargo.toml +2 -0
- data/LICENSE.txt +1 -1
- data/README.md +0 -2
- data/ext/polars/Cargo.toml +9 -3
- data/ext/polars/src/apply/dataframe.rs +303 -0
- data/ext/polars/src/apply/mod.rs +253 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +254 -35
- data/ext/polars/src/dataframe.rs +151 -6
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +80 -3
- data/ext/polars/src/lazy/dsl.rs +84 -10
- data/ext/polars/src/lib.rs +180 -8
- data/ext/polars/src/series.rs +328 -10
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1480 -77
- data/lib/polars/data_types.rb +122 -0
- data/lib/polars/date_time_expr.rb +10 -10
- data/lib/polars/date_time_name_space.rb +8 -8
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/expr.rb +262 -12
- data/lib/polars/functions.rb +194 -5
- data/lib/polars/group_by.rb +76 -36
- data/lib/polars/io.rb +19 -3
- data/lib/polars/lazy_frame.rb +798 -25
- data/lib/polars/lazy_functions.rb +569 -30
- data/lib/polars/list_expr.rb +1 -1
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +192 -27
- data/lib/polars/string_expr.rb +6 -5
- data/lib/polars/string_name_space.rb +1 -1
- data/lib/polars/utils.rb +25 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +38 -29
- metadata +11 -4
data/lib/polars/lazy_frame.rb
CHANGED
@@ -149,8 +149,19 @@ module Polars
     # def self.from_json
     # end
 
-    #
-    #
+    # Read a logical plan from a JSON file to construct a LazyFrame.
+    #
+    # @param file [String]
+    # Path to a file or a file-like object.
+    #
+    # @return [LazyFrame]
+    def self.read_json(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      Utils.wrap_ldf(RbLazyFrame.read_json(file))
+    end
 
     # Get or set column names.
     #
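A minimal usage sketch of the new reader, assuming a logical plan was previously serialized to a hypothetical "plan.json" (a Pathname is also accepted, as the guard above shows):

  require "polars-df"

  # Rebuild the logical plan from disk, then execute it.
  lf = Polars::LazyFrame.read_json("plan.json")
  df = lf.collect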
@@ -187,7 +198,7 @@ module Polars
     # }
     # ).lazy
     # lf.dtypes
-    # # => [
+    # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
     def dtypes
       _ldf.dtypes
     end
@@ -205,7 +216,7 @@ module Polars
     # }
     # ).lazy
     # lf.schema
-    # # => {"foo"
+    # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
     def schema
       _ldf.schema
     end
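Both doc fixes above reflect that in 0.2.0 `dtypes` and `schema` report the new data type classes introduced in `data/lib/polars/data_types.rb`. A minimal sketch built from the documented example:

  lf = Polars::DataFrame.new({"foo" => [1, 2], "bar" => [1.0, 2.0], "ham" => ["a", "b"]}).lazy
  lf.dtypes  # => [Polars::Int64, Polars::Float64, Polars::Utf8]
  lf.schema  # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}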
@@ -245,11 +256,57 @@ module Polars
       EOS
     end
 
-    #
-    #
+    # Write the logical plan of this LazyFrame to a file or string in JSON format.
+    #
+    # @param file [String]
+    # File path to which the result should be written.
+    #
+    # @return [nil]
+    def write_json(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+      _ldf.write_json(file)
+      nil
+    end
 
-    #
-    #
+    # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+    #
+    # @param func [Object]
+    # Callable; will receive the frame as the first parameter,
+    # followed by any given args/kwargs.
+    # @param args [Object]
+    # Arguments to pass to the UDF.
+    # @param kwargs [Object]
+    # Keyword arguments to pass to the UDF.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    # cast_str_to_int = lambda do |data, col_name:|
+    # data.with_column(Polars.col(col_name).cast(:i64))
+    # end
+    #
+    # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+    # df.pipe(cast_str_to_int, col_name: "b").collect()
+    # # =>
+    # # shape: (4, 2)
+    # # ┌─────┬─────┐
+    # # │ a ┆ b │
+    # # │ --- ┆ --- │
+    # # │ i64 ┆ i64 │
+    # # ╞═════╪═════╡
+    # # │ 1 ┆ 10 │
+    # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2 ┆ 20 │
+    # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 3 ┆ 30 │
+    # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 4 ┆ 40 │
+    # # └─────┴─────┘
+    def pipe(func, *args, **kwargs, &block)
+      func.call(self, *args, **kwargs, &block)
+    end
 
     # Create a string representation of the unoptimized query plan.
     #
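`write_json` ships without an inline example, so here is a hedged sketch of round-tripping a plan through `write_json` and `read_json`, assuming a writable "plan.json" path:

  lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.select([Polars.col("a") * 2])
  lf.write_json("plan.json")                        # serialize the logical plan
  Polars::LazyFrame.read_json("plan.json").collect  # rebuild and execute it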
@@ -261,8 +318,27 @@ module Polars
     # Create a string representation of the optimized query plan.
     #
     # @return [String]
-
-
+    def describe_optimized_plan(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      common_subplan_elimination: true,
+      allow_streaming: false
+    )
+      ldf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        common_subplan_elimination,
+        allow_streaming,
+      )
+
+      ldf.describe_optimized_plan
+    end
 
     # def show_graph
     # end
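A short sketch of the new keyword toggles on `describe_optimized_plan`; disabling `predicate_pushdown` here is only meant to show how a single optimization can be switched off before printing the plan:

  lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy.filter(Polars.col("a") > 1)
  puts lf.describe_plan                                       # unoptimized plan
  puts lf.describe_optimized_plan(predicate_pushdown: false)  # optimized plan, one rule disabled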
@@ -726,14 +802,544 @@ module Polars
       LazyGroupBy.new(lgb, self.class)
     end
 
-    #
-    #
+    # Create rolling groups based on a time column.
+    #
+    # Also works for index values of type `:i32` or `:i64`.
+    #
+    # Different from a `dynamic_groupby` the windows are now determined by the
+    # individual values and are not of constant intervals. For constant intervals
+    # use *groupby_dynamic*.
+    #
+    # The `period` and `offset` arguments are created either from a timedelta, or
+    # by using the following string language:
+    #
+    # - 1ns (1 nanosecond)
+    # - 1us (1 microsecond)
+    # - 1ms (1 millisecond)
+    # - 1s (1 second)
+    # - 1m (1 minute)
+    # - 1h (1 hour)
+    # - 1d (1 day)
+    # - 1w (1 week)
+    # - 1mo (1 calendar month)
+    # - 1y (1 calendar year)
+    # - 1i (1 index count)
+    #
+    # Or combine them:
+    # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # In case of a groupby_rolling on an integer column, the windows are defined by:
+    #
+    # - "1i" # length 1
+    # - "10i" # length 10
+    #
+    # @param index_column [Object]
+    # Column used to group based on the time window.
+    # Often to type Date/Datetime
+    # This column must be sorted in ascending order. If not the output will not
+    # make sense.
+    #
+    # In case of a rolling groupby on indices, dtype needs to be one of
+    # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+    # performance matters use an `:i64` column.
+    # @param period [Object]
+    # Length of the window.
+    # @param offset [Object]
+    # Offset of the window. Default is -period.
+    # @param closed ["right", "left", "both", "none"]
+    # Define whether the temporal window interval is closed or not.
+    # @param by [Object]
+    # Also group by this column/these columns.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    # dates = [
+    # "2020-01-01 13:45:48",
+    # "2020-01-01 16:42:13",
+    # "2020-01-01 16:45:09",
+    # "2020-01-02 18:12:48",
+    # "2020-01-03 19:45:32",
+    # "2020-01-08 23:16:43"
+    # ]
+    # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+    # Polars.col("dt").str.strptime(:datetime)
+    # )
+    # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+    # [
+    # Polars.sum("a").alias("sum_a"),
+    # Polars.min("a").alias("min_a"),
+    # Polars.max("a").alias("max_a")
+    # ]
+    # )
+    # # =>
+    # # shape: (6, 4)
+    # # ┌─────────────────────┬───────┬───────┬───────┐
+    # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
+    # # │ --- ┆ --- ┆ --- ┆ --- │
+    # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
+    # # ╞═════════════════════╪═══════╪═══════╪═══════╡
+    # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
+    # # └─────────────────────┴───────┴───────┴───────┘
+    def groupby_rolling(
+      index_column:,
+      period:,
+      offset: nil,
+      closed: "right",
+      by: nil
+    )
+      if offset.nil?
+        offset = "-#{period}"
+      end
 
-
-
+      rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+      period = Utils._timedelta_to_pl_duration(period)
+      offset = Utils._timedelta_to_pl_duration(offset)
 
-
-
+      lgb = _ldf.groupby_rolling(
+        index_column, period, offset, closed, rbexprs_by
+      )
+      LazyGroupBy.new(lgb, self.class)
+    end
+
+    # Group based on a time value (or index value of type `:i32`, `:i64`).
+    #
+    # Time windows are calculated and rows are assigned to windows. Different from a
+    # normal groupby is that a row can be member of multiple groups. The time/index
+    # window could be seen as a rolling window, with a window size determined by
+    # dates/times/values instead of slots in the DataFrame.
+    #
+    # A window is defined by:
+    #
+    # - every: interval of the window
+    # - period: length of the window
+    # - offset: offset of the window
+    #
+    # The `every`, `period` and `offset` arguments are created with
+    # the following string language:
+    #
+    # - 1ns (1 nanosecond)
+    # - 1us (1 microsecond)
+    # - 1ms (1 millisecond)
+    # - 1s (1 second)
+    # - 1m (1 minute)
+    # - 1h (1 hour)
+    # - 1d (1 day)
+    # - 1w (1 week)
+    # - 1mo (1 calendar month)
+    # - 1y (1 calendar year)
+    # - 1i (1 index count)
+    #
+    # Or combine them:
+    # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # In case of a groupby_dynamic on an integer column, the windows are defined by:
+    #
+    # - "1i" # length 1
+    # - "10i" # length 10
+    #
+    # @param index_column
+    # Column used to group based on the time window.
+    # Often to type Date/Datetime
+    # This column must be sorted in ascending order. If not the output will not
+    # make sense.
+    #
+    # In case of a dynamic groupby on indices, dtype needs to be one of
+    # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+    # performance matters use an `:i64` column.
+    # @param every
+    # Interval of the window.
+    # @param period
+    # Length of the window, if None it is equal to 'every'.
+    # @param offset
+    # Offset of the window if None and period is None it will be equal to negative
+    # `every`.
+    # @param truncate
+    # Truncate the time value to the window lower bound.
+    # @param include_boundaries
+    # Add the lower and upper bound of the window to the "_lower_bound" and
+    # "_upper_bound" columns. This will impact performance because it's harder to
+    # parallelize
+    # @param closed ["right", "left", "both", "none"]
+    # Define whether the temporal window interval is closed or not.
+    # @param by
+    # Also group by this column/these columns
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    # df = Polars::DataFrame.new(
+    # {
+    # "time" => Polars.date_range(
+    # DateTime.new(2021, 12, 16),
+    # DateTime.new(2021, 12, 16, 3),
+    # "30m"
+    # ),
+    # "n" => 0..6
+    # }
+    # )
+    # # =>
+    # # shape: (7, 2)
+    # # ┌─────────────────────┬─────┐
+    # # │ time ┆ n │
+    # # │ --- ┆ --- │
+    # # │ datetime[μs] ┆ i64 │
+    # # ╞═════════════════════╪═════╡
+    # # │ 2021-12-16 00:00:00 ┆ 0 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 00:30:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 01:00:00 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 01:30:00 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 02:00:00 ┆ 4 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 02:30:00 ┆ 5 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2021-12-16 03:00:00 ┆ 6 │
+    # # └─────────────────────┴─────┘
+    #
+    # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+    # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+    # [
+    # Polars.col("time").min.alias("time_min"),
+    # Polars.col("time").max.alias("time_max")
+    # ]
+    # )
+    # # =>
+    # # shape: (4, 3)
+    # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+    # # │ time ┆ time_min ┆ time_max │
+    # # │ --- ┆ --- ┆ --- │
+    # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
+    # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+    # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+    # # └─────────────────────┴─────────────────────┴─────────────────────┘
+    #
+    # @example The window boundaries can also be added to the aggregation result.
+    # df.groupby_dynamic(
+    # "time", every: "1h", include_boundaries: true, closed: "right"
+    # ).agg([Polars.col("time").count.alias("time_count")])
+    # # =>
+    # # shape: (4, 4)
+    # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+    # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+    # # │ --- ┆ --- ┆ --- ┆ --- │
+    # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+    # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+    # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+    # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+    #
+    # @example When closed="left", should not include right end of interval.
+    # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+    # [
+    # Polars.col("time").count.alias("time_count"),
+    # Polars.col("time").list.alias("time_agg_list")
+    # ]
+    # )
+    # # =>
+    # # shape: (4, 3)
+    # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+    # # │ time ┆ time_count ┆ time_agg_list │
+    # # │ --- ┆ --- ┆ --- │
+    # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
+    # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+    # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
+    # # └─────────────────────┴────────────┴─────────────────────────────────────┘
+    #
+    # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+    # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+    # [Polars.col("time").count.alias("time_count")]
+    # )
+    # # =>
+    # # shape: (5, 2)
+    # # ┌─────────────────────┬────────────┐
+    # # │ time ┆ time_count │
+    # # │ --- ┆ --- │
+    # # │ datetime[μs] ┆ u32 │
+    # # ╞═════════════════════╪════════════╡
+    # # │ 2021-12-15 23:00:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 00:00:00 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 01:00:00 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 02:00:00 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2021-12-16 03:00:00 ┆ 1 │
+    # # └─────────────────────┴────────────┘
+    #
+    # @example Dynamic groupbys can also be combined with grouping on normal keys.
+    # df = Polars::DataFrame.new(
+    # {
+    # "time" => Polars.date_range(
+    # DateTime.new(2021, 12, 16),
+    # DateTime.new(2021, 12, 16, 3),
+    # "30m"
+    # ),
+    # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+    # }
+    # )
+    # df.groupby_dynamic(
+    # "time",
+    # every: "1h",
+    # closed: "both",
+    # by: "groups",
+    # include_boundaries: true
+    # ).agg([Polars.col("time").count.alias("time_count")])
+    # # =>
+    # # shape: (7, 5)
+    # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+    # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+    # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+    # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+    # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+    # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
+    # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+    #
+    # @example Dynamic groupby on an index column.
+    # df = Polars::DataFrame.new(
+    # {
+    # "idx" => Polars.arange(0, 6, eager: true),
+    # "A" => ["A", "A", "B", "B", "B", "C"]
+    # }
+    # )
+    # df.groupby_dynamic(
+    # "idx",
+    # every: "2i",
+    # period: "3i",
+    # include_boundaries: true,
+    # closed: "right"
+    # ).agg(Polars.col("A").list.alias("A_agg_list"))
+    # # =>
+    # # shape: (3, 4)
+    # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+    # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
+    # # │ --- ┆ --- ┆ --- ┆ --- │
+    # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
+    # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+    # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
+    # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
+    # # └─────────────────┴─────────────────┴─────┴─────────────────┘
+    def groupby_dynamic(
+      index_column,
+      every:,
+      period: nil,
+      offset: nil,
+      truncate: true,
+      include_boundaries: false,
+      closed: "left",
+      by: nil,
+      start_by: "window"
+    )
+      if offset.nil?
+        if period.nil?
+          offset = "-#{every}"
+        else
+          offset = "0ns"
+        end
+      end
+
+      if period.nil?
+        period = every
+      end
+
+      period = Utils._timedelta_to_pl_duration(period)
+      offset = Utils._timedelta_to_pl_duration(offset)
+      every = Utils._timedelta_to_pl_duration(every)
+
+      rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+      lgb = _ldf.groupby_dynamic(
+        index_column,
+        every,
+        period,
+        offset,
+        truncate,
+        include_boundaries,
+        closed,
+        rbexprs_by,
+        start_by
+      )
+      LazyGroupBy.new(lgb, self.class)
+    end
+
+    # Perform an asof join.
+    #
+    # This is similar to a left-join except that we match on nearest key rather than
+    # equal keys.
+    #
+    # Both DataFrames must be sorted by the join_asof key.
+    #
+    # For each row in the left DataFrame:
+    #
+    # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+    # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+    #
+    # The default is "backward".
+    #
+    # @param other [LazyFrame]
+    # Lazy DataFrame to join with.
+    # @param left_on [String]
+    # Join column of the left DataFrame.
+    # @param right_on [String]
+    # Join column of the right DataFrame.
+    # @param on [String]
+    # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+    # None.
+    # @param by [Object]
+    # Join on these columns before doing asof join.
+    # @param by_left [Object]
+    # Join on these columns before doing asof join.
+    # @param by_right [Object]
+    # Join on these columns before doing asof join.
+    # @param strategy ["backward", "forward"]
+    # Join strategy.
+    # @param suffix [String]
+    # Suffix to append to columns with a duplicate name.
+    # @param tolerance [Object]
+    # Numeric tolerance. By setting this the join will only be done if the near
+    # keys are within this distance. If an asof join is done on columns of dtype
+    # "Date", "Datetime", "Duration" or "Time" you use the following string
+    # language:
+    #
+    # - 1ns (1 nanosecond)
+    # - 1us (1 microsecond)
+    # - 1ms (1 millisecond)
+    # - 1s (1 second)
+    # - 1m (1 minute)
+    # - 1h (1 hour)
+    # - 1d (1 day)
+    # - 1w (1 week)
+    # - 1mo (1 calendar month)
+    # - 1y (1 calendar year)
+    # - 1i (1 index count)
+    #
+    # Or combine them:
+    # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # @param allow_parallel [Boolean]
+    # Allow the physical plan to optionally evaluate the computation of both
+    # DataFrames up to the join in parallel.
+    # @param force_parallel [Boolean]
+    # Force the physical plan to evaluate the computation of both DataFrames up to
+    # the join in parallel.
+    #
+    # @return [LazyFrame]
+    def join_asof(
+      other,
+      left_on: nil,
+      right_on: nil,
+      on: nil,
+      by_left: nil,
+      by_right: nil,
+      by: nil,
+      strategy: "backward",
+      suffix: "_right",
+      tolerance: nil,
+      allow_parallel: true,
+      force_parallel: false
+    )
+      if !other.is_a?(LazyFrame)
+        raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+      end
+
+      if on.is_a?(String)
+        left_on = on
+        right_on = on
+      end
+
+      if left_on.nil? || right_on.nil?
+        raise ArgumentError, "You should pass the column to join on as an argument."
+      end
+
+      if by_left.is_a?(String) || by_left.is_a?(Expr)
+        by_left_ = [by_left]
+      else
+        by_left_ = by_left
+      end
+
+      if by_right.is_a?(String) || by_right.is_a?(Expr)
+        by_right_ = [by_right]
+      else
+        by_right_ = by_right
+      end
+
+      if by.is_a?(String)
+        by_left_ = [by]
+        by_right_ = [by]
+      elsif by.is_a?(Array)
+        by_left_ = by
+        by_right_ = by
+      end
+
+      tolerance_str = nil
+      tolerance_num = nil
+      if tolerance.is_a?(String)
+        tolerance_str = tolerance
+      else
+        tolerance_num = tolerance
+      end
+
+      _from_rbldf(
+        _ldf.join_asof(
+          other._ldf,
+          Polars.col(left_on)._rbexpr,
+          Polars.col(right_on)._rbexpr,
+          by_left_,
+          by_right_,
+          allow_parallel,
+          force_parallel,
+          suffix,
+          strategy,
+          tolerance_num,
+          tolerance_str
+        )
+      )
+    end
 
     # Add a join operation to the Logical Plan.
     #
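`join_asof` lands without an inline example, so here is a minimal hedged sketch (made-up column names) matching each quote to the latest trade at or before its key; both frames are sorted by the join key, as the doc requires:

  trades = Polars::DataFrame.new({"time" => [1, 2, 3, 7], "price" => [1.0, 2.0, 3.0, 4.0]}).lazy
  quotes = Polars::DataFrame.new({"time" => [2, 4, 6], "bid" => [1.5, 2.5, 3.5]}).lazy
  quotes.join_asof(trades, on: "time", strategy: "backward").collect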
@@ -953,8 +1559,44 @@ module Polars
       _from_rbldf(_ldf.with_columns(rbexprs))
     end
 
-    #
-    #
+    # Add an external context to the computation graph.
+    #
+    # This allows expressions to also access columns from DataFrames
+    # that are not part of this one.
+    #
+    # @param other [Object]
+    # Lazy DataFrame to join with.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
+    # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
+    # (
+    # df_a.with_context(df_other.lazy).select(
+    # [Polars.col("b") + Polars.col("c").first]
+    # )
+    # ).collect
+    # # =>
+    # # shape: (3, 1)
+    # # ┌──────┐
+    # # │ b │
+    # # │ --- │
+    # # │ str │
+    # # ╞══════╡
+    # # │ afoo │
+    # # ├╌╌╌╌╌╌┤
+    # # │ cfoo │
+    # # ├╌╌╌╌╌╌┤
+    # # │ null │
+    # # └──────┘
+    def with_context(other)
+      if !other.is_a?(Array)
+        other = [other]
+      end
+
+      _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
+    end
 
     # Add or overwrite column in a DataFrame.
     #
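Because `with_context` wraps a single frame into an array, several external contexts can be registered in one call; a brief sketch with two hypothetical context frames:

  extra_a = Polars::DataFrame.new({"c" => ["foo", "ham"]}).lazy
  extra_b = Polars::DataFrame.new({"d" => [10, 20, 30]}).lazy
  lf = Polars::DataFrame.new({"a" => [1, 2, 3]}).lazy
  # Columns "c" and "d" become visible to expressions evaluated on lf.
  lf.with_context([extra_a, extra_b]).select([Polars.col("a") + Polars.col("d").first]).collect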
@@ -1231,8 +1873,43 @@ module Polars
       slice(0, 1)
     end
 
-    #
-    #
+    # Add a column at index 0 that counts the rows.
+    #
+    # @param name [String]
+    # Name of the column to add.
+    # @param offset [Integer]
+    # Start the row count at this offset.
+    #
+    # @return [LazyFrame]
+    #
+    # @note
+    # This can have a negative effect on query performance.
+    # This may, for instance, block predicate pushdown optimization.
+    #
+    # @example
+    # df = Polars::DataFrame.new(
+    # {
+    # "a" => [1, 3, 5],
+    # "b" => [2, 4, 6]
+    # }
+    # ).lazy
+    # df.with_row_count.collect
+    # # =>
+    # # shape: (3, 3)
+    # # ┌────────┬─────┬─────┐
+    # # │ row_nr ┆ a ┆ b │
+    # # │ --- ┆ --- ┆ --- │
+    # # │ u32 ┆ i64 ┆ i64 │
+    # # ╞════════╪═════╪═════╡
+    # # │ 0 ┆ 1 ┆ 2 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 1 ┆ 3 ┆ 4 │
+    # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    # # │ 2 ┆ 5 ┆ 6 │
+    # # └────────┴─────┴─────┘
+    def with_row_count(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_count(name, offset))
+    end
 
     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
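The `offset` keyword shifts where the count starts; a short sketch on the same data as the example above:

  lf = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]}).lazy
  lf.with_row_count(name: "id", offset: 1).collect  # "id" runs 1, 2, 3 instead of 0, 1, 2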
@@ -1490,7 +2167,8 @@ module Polars
     # # │ 3.0 ┆ 1.0 │
     # # └─────┴─────┘
     def quantile(quantile, interpolation: "nearest")
-
+      quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
+      _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
     end
 
     # Explode lists to long format.
|
@@ -1554,11 +2232,106 @@ module Polars
|
|
1554
2232
|
_from_rbldf(_ldf.unique(maintain_order, subset, keep))
|
1555
2233
|
end
|
1556
2234
|
|
1557
|
-
#
|
1558
|
-
#
|
2235
|
+
# Drop rows with null values from this LazyFrame.
|
2236
|
+
#
|
2237
|
+
# @param subset [Object]
|
2238
|
+
# Subset of column(s) on which `drop_nulls` will be applied.
|
2239
|
+
#
|
2240
|
+
# @return [LazyFrame]
|
2241
|
+
#
|
2242
|
+
# @example
|
2243
|
+
# df = Polars::DataFrame.new(
|
2244
|
+
# {
|
2245
|
+
# "foo" => [1, 2, 3],
|
2246
|
+
# "bar" => [6, nil, 8],
|
2247
|
+
# "ham" => ["a", "b", "c"]
|
2248
|
+
# }
|
2249
|
+
# )
|
2250
|
+
# df.lazy.drop_nulls.collect
|
2251
|
+
# # =>
|
2252
|
+
# # shape: (2, 3)
|
2253
|
+
# # ┌─────┬─────┬─────┐
|
2254
|
+
# # │ foo ┆ bar ┆ ham │
|
2255
|
+
# # │ --- ┆ --- ┆ --- │
|
2256
|
+
# # │ i64 ┆ i64 ┆ str │
|
2257
|
+
# # ╞═════╪═════╪═════╡
|
2258
|
+
# # │ 1 ┆ 6 ┆ a │
|
2259
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2260
|
+
# # │ 3 ┆ 8 ┆ c │
|
2261
|
+
# # └─────┴─────┴─────┘
|
2262
|
+
def drop_nulls(subset: nil)
|
2263
|
+
if !subset.nil? && !subset.is_a?(Array)
|
2264
|
+
subset = [subset]
|
2265
|
+
end
|
2266
|
+
_from_rbldf(_ldf.drop_nulls(subset))
|
2267
|
+
end
|
1559
2268
|
|
1560
|
-
#
|
1561
|
-
#
|
2269
|
+
# Unpivot a DataFrame from wide to long format.
|
2270
|
+
#
|
2271
|
+
# Optionally leaves identifiers set.
|
2272
|
+
#
|
2273
|
+
# This function is useful to massage a DataFrame into a format where one or more
|
2274
|
+
# columns are identifier variables (id_vars), while all other columns, considered
|
2275
|
+
# measured variables (value_vars), are "unpivoted" to the row axis, leaving just
|
2276
|
+
# two non-identifier columns, 'variable' and 'value'.
|
2277
|
+
#
|
2278
|
+
# @param id_vars [Object]
|
2279
|
+
# Columns to use as identifier variables.
|
2280
|
+
# @param value_vars [Object]
|
2281
|
+
# Values to use as identifier variables.
|
2282
|
+
# If `value_vars` is empty all columns that are not in `id_vars` will be used.
|
2283
|
+
# @param variable_name [String]
|
2284
|
+
# Name to give to the `value` column. Defaults to "variable"
|
2285
|
+
# @param value_name [String]
|
2286
|
+
# Name to give to the `value` column. Defaults to "value"
|
2287
|
+
#
|
2288
|
+
# @return [LazyFrame]
|
2289
|
+
#
|
2290
|
+
# @example
|
2291
|
+
# df = Polars::DataFrame.new(
|
2292
|
+
# {
|
2293
|
+
# "a" => ["x", "y", "z"],
|
2294
|
+
# "b" => [1, 3, 5],
|
2295
|
+
# "c" => [2, 4, 6]
|
2296
|
+
# }
|
2297
|
+
# ).lazy
|
2298
|
+
# df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
|
2299
|
+
# # =>
|
2300
|
+
# # shape: (6, 3)
|
2301
|
+
# # ┌─────┬──────────┬───────┐
|
2302
|
+
# # │ a ┆ variable ┆ value │
|
2303
|
+
# # │ --- ┆ --- ┆ --- │
|
2304
|
+
# # │ str ┆ str ┆ i64 │
|
2305
|
+
# # ╞═════╪══════════╪═══════╡
|
2306
|
+
# # │ x ┆ b ┆ 1 │
|
2307
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2308
|
+
# # │ y ┆ b ┆ 3 │
|
2309
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2310
|
+
# # │ z ┆ b ┆ 5 │
|
2311
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2312
|
+
# # │ x ┆ c ┆ 2 │
|
2313
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2314
|
+
# # │ y ┆ c ┆ 4 │
|
2315
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2316
|
+
# # │ z ┆ c ┆ 6 │
|
2317
|
+
# # └─────┴──────────┴───────┘
|
2318
|
+
def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
|
2319
|
+
if value_vars.is_a?(String)
|
2320
|
+
value_vars = [value_vars]
|
2321
|
+
end
|
2322
|
+
if id_vars.is_a?(String)
|
2323
|
+
id_vars = [id_vars]
|
2324
|
+
end
|
2325
|
+
if value_vars.nil?
|
2326
|
+
value_vars = []
|
2327
|
+
end
|
2328
|
+
if id_vars.nil?
|
2329
|
+
id_vars = []
|
2330
|
+
end
|
2331
|
+
_from_rbldf(
|
2332
|
+
_ldf.melt(id_vars, value_vars, value_name, variable_name)
|
2333
|
+
)
|
2334
|
+
end
|
1562
2335
|
|
1563
2336
|
# def map
|
1564
2337
|
# end
|
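Both new methods normalize a single string argument into an array, so one column name is enough for `subset`, `id_vars`, or `value_vars`; a short sketch:

  lf = Polars::DataFrame.new({"a" => ["x", "y"], "b" => [1, nil], "c" => [2, 4]}).lazy
  lf.drop_nulls(subset: "b").collect              # keep only rows where "b" is non-null
  lf.melt(id_vars: "a", value_vars: "b").collect  # string args are wrapped into arrays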