polars-df 0.5.0-arm64-darwin → 0.7.0-arm64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +4572 -5214
- data/README.md +11 -9
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +7 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -4,6 +4,22 @@ module Polars
|
|
4
4
|
# @private
|
5
5
|
attr_accessor :_ldf
|
6
6
|
|
7
|
+
# Create a new LazyFrame.
|
8
|
+
def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
9
|
+
self._ldf = (
|
10
|
+
DataFrame.new(
|
11
|
+
data,
|
12
|
+
schema: schema,
|
13
|
+
schema_overrides: schema_overrides,
|
14
|
+
orient: orient,
|
15
|
+
infer_schema_length: infer_schema_length,
|
16
|
+
nan_to_null: nan_to_null
|
17
|
+
)
|
18
|
+
.lazy
|
19
|
+
._ldf
|
20
|
+
)
|
21
|
+
end
|
22
|
+
|
7
23
|
# @private
|
8
24
|
def self._from_rbldf(rb_ldf)
|
9
25
|
ldf = LazyFrame.allocate
|
@@ -81,7 +97,8 @@ module Polars
|
|
81
97
|
row_count_offset: 0,
|
82
98
|
storage_options: nil,
|
83
99
|
low_memory: false,
|
84
|
-
use_statistics: true
|
100
|
+
use_statistics: true,
|
101
|
+
hive_partitioning: true
|
85
102
|
)
|
86
103
|
_from_rbldf(
|
87
104
|
RbLazyFrame.new_from_parquet(
|
@@ -92,7 +109,8 @@ module Polars
|
|
92
109
|
rechunk,
|
93
110
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
94
111
|
low_memory,
|
95
|
-
use_statistics
|
112
|
+
use_statistics,
|
113
|
+
hive_partitioning
|
96
114
|
)
|
97
115
|
)
|
98
116
|
end
|
@@ -334,6 +352,7 @@ module Polars
|
|
334
352
|
slice_pushdown,
|
335
353
|
common_subplan_elimination,
|
336
354
|
allow_streaming,
|
355
|
+
false
|
337
356
|
)
|
338
357
|
|
339
358
|
ldf.describe_optimized_plan
|
@@ -379,16 +398,16 @@ module Polars
|
|
379
398
|
# # │ 2 ┆ 7.0 ┆ b │
|
380
399
|
# # │ 1 ┆ 6.0 ┆ a │
|
381
400
|
# # └─────┴─────┴─────┘
|
382
|
-
def sort(by, reverse: false, nulls_last: false)
|
401
|
+
def sort(by, reverse: false, nulls_last: false, maintain_order: false)
|
383
402
|
if by.is_a?(String)
|
384
|
-
_from_rbldf(_ldf.sort(by, reverse, nulls_last))
|
403
|
+
return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
|
385
404
|
end
|
386
405
|
if Utils.bool?(reverse)
|
387
406
|
reverse = [reverse]
|
388
407
|
end
|
389
408
|
|
390
409
|
by = Utils.selection_to_rbexpr_list(by)
|
391
|
-
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
|
410
|
+
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
|
392
411
|
end
|
393
412
|
|
394
413
|
# def profile
|
@@ -429,7 +448,7 @@ module Polars
|
|
429
448
|
# "c" => [6, 5, 4, 3, 2, 1]
|
430
449
|
# }
|
431
450
|
# ).lazy
|
432
|
-
# df.
|
451
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
|
433
452
|
# # =>
|
434
453
|
# # shape: (3, 3)
|
435
454
|
# # ┌─────┬─────┬─────┐
|
@@ -450,7 +469,8 @@ module Polars
|
|
450
469
|
no_optimization: false,
|
451
470
|
slice_pushdown: true,
|
452
471
|
common_subplan_elimination: true,
|
453
|
-
allow_streaming: false
|
472
|
+
allow_streaming: false,
|
473
|
+
_eager: false
|
454
474
|
)
|
455
475
|
if no_optimization
|
456
476
|
predicate_pushdown = false
|
@@ -470,7 +490,8 @@ module Polars
|
|
470
490
|
simplify_expression,
|
471
491
|
slice_pushdown,
|
472
492
|
common_subplan_elimination,
|
473
|
-
allow_streaming
|
493
|
+
allow_streaming,
|
494
|
+
_eager
|
474
495
|
)
|
475
496
|
Utils.wrap_df(ldf.collect)
|
476
497
|
end
|
@@ -552,7 +573,8 @@ module Polars
|
|
552
573
|
simplify_expression,
|
553
574
|
slice_pushdown,
|
554
575
|
false,
|
555
|
-
true
|
576
|
+
true,
|
577
|
+
false
|
556
578
|
)
|
557
579
|
lf.sink_parquet(
|
558
580
|
path,
|
@@ -607,7 +629,7 @@ module Polars
|
|
607
629
|
# "c" => [6, 5, 4, 3, 2, 1]
|
608
630
|
# }
|
609
631
|
# ).lazy
|
610
|
-
# df.
|
632
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
611
633
|
# # =>
|
612
634
|
# # shape: (2, 3)
|
613
635
|
# # ┌─────┬─────┬─────┐
|
@@ -644,7 +666,8 @@ module Polars
|
|
644
666
|
simplify_expression,
|
645
667
|
slice_pushdown,
|
646
668
|
common_subplan_elimination,
|
647
|
-
allow_streaming
|
669
|
+
allow_streaming,
|
670
|
+
false
|
648
671
|
)
|
649
672
|
Utils.wrap_df(ldf.fetch(n_rows))
|
650
673
|
end
|
@@ -837,13 +860,13 @@ module Polars
|
|
837
860
|
_from_rbldf(_ldf.select(exprs))
|
838
861
|
end
|
839
862
|
|
840
|
-
# Start a
|
863
|
+
# Start a group by operation.
|
841
864
|
#
|
842
865
|
# @param by [Object]
|
843
866
|
# Column(s) to group by.
|
844
867
|
# @param maintain_order [Boolean]
|
845
868
|
# Make sure that the order of the groups remains consistent. This is more
|
846
|
-
# expensive than a default
|
869
|
+
# expensive than a default group by.
|
847
870
|
#
|
848
871
|
# @return [LazyGroupBy]
|
849
872
|
#
|
@@ -855,7 +878,7 @@ module Polars
|
|
855
878
|
# "c" => [6, 5, 4, 3, 2, 1]
|
856
879
|
# }
|
857
880
|
# ).lazy
|
858
|
-
# df.
|
881
|
+
# df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
859
882
|
# # =>
|
860
883
|
# # shape: (3, 2)
|
861
884
|
# # ┌─────┬─────┐
|
@@ -867,19 +890,21 @@ module Polars
|
|
867
890
|
# # │ b ┆ 11 │
|
868
891
|
# # │ c ┆ 6 │
|
869
892
|
# # └─────┴─────┘
|
870
|
-
def
|
893
|
+
def group_by(by, maintain_order: false)
|
871
894
|
rbexprs_by = Utils.selection_to_rbexpr_list(by)
|
872
|
-
lgb = _ldf.
|
873
|
-
LazyGroupBy.new(lgb
|
895
|
+
lgb = _ldf.group_by(rbexprs_by, maintain_order)
|
896
|
+
LazyGroupBy.new(lgb)
|
874
897
|
end
|
898
|
+
alias_method :groupby, :group_by
|
899
|
+
alias_method :group, :group_by
|
875
900
|
|
876
901
|
# Create rolling groups based on a time column.
|
877
902
|
#
|
878
903
|
# Also works for index values of type `:i32` or `:i64`.
|
879
904
|
#
|
880
|
-
# Different from a `
|
905
|
+
# Different from a `group_by_dynamic`, the windows are now determined by the
|
881
906
|
# individual values and are not of constant intervals. For constant intervals
|
882
|
-
# use *
|
907
|
+
# use *group_by_dynamic*.
|
883
908
|
#
|
884
909
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
885
910
|
# by using the following string language:
|
@@ -899,7 +924,7 @@ module Polars
|
|
899
924
|
# Or combine them:
|
900
925
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
901
926
|
#
|
902
|
-
# In case of a
|
927
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
903
928
|
#
|
904
929
|
# - "1i" # length 1
|
905
930
|
# - "10i" # length 10
|
@@ -910,7 +935,7 @@ module Polars
|
|
910
935
|
# This column must be sorted in ascending order. If not the output will not
|
911
936
|
# make sense.
|
912
937
|
#
|
913
|
-
# In case of a rolling
|
938
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
914
939
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
915
940
|
# performance matters use an `:i64` column.
|
916
941
|
# @param period [Object]
|
@@ -921,6 +946,12 @@ module Polars
|
|
921
946
|
# Define whether the temporal window interval is closed or not.
|
922
947
|
# @param by [Object]
|
923
948
|
# Also group by this column/these columns.
|
949
|
+
# @param check_sorted [Boolean]
|
950
|
+
# When the `by` argument is given, polars can not check sortedness
|
951
|
+
# by the metadata and has to do a full scan on the index column to
|
952
|
+
# verify data is sorted. This is expensive. If you are sure the
|
953
|
+
# data within the by groups is sorted, you can set this to `false`.
|
954
|
+
# Doing so incorrectly will lead to incorrect output.
|
924
955
|
#
|
925
956
|
# @return [LazyGroupBy]
|
926
957
|
#
|
@@ -933,16 +964,16 @@ module Polars
|
|
933
964
|
# "2020-01-03 19:45:32",
|
934
965
|
# "2020-01-08 23:16:43"
|
935
966
|
# ]
|
936
|
-
# df = Polars::
|
937
|
-
# Polars.col("dt").str.strptime(Polars::Datetime)
|
967
|
+
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
968
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
938
969
|
# )
|
939
|
-
# df.
|
970
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
940
971
|
# [
|
941
972
|
# Polars.sum("a").alias("sum_a"),
|
942
973
|
# Polars.min("a").alias("min_a"),
|
943
974
|
# Polars.max("a").alias("max_a")
|
944
975
|
# ]
|
945
|
-
# )
|
976
|
+
# ).collect
|
946
977
|
# # =>
|
947
978
|
# # shape: (6, 4)
|
948
979
|
# # ┌─────────────────────┬───────┬───────┬───────┐
|
@@ -957,14 +988,15 @@ module Polars
|
|
957
988
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
958
989
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
959
990
|
# # └─────────────────────┴───────┴───────┴───────┘
|
960
|
-
def
|
991
|
+
def group_by_rolling(
|
961
992
|
index_column:,
|
962
993
|
period:,
|
963
994
|
offset: nil,
|
964
995
|
closed: "right",
|
965
|
-
by: nil
|
996
|
+
by: nil,
|
997
|
+
check_sorted: true
|
966
998
|
)
|
967
|
-
index_column = Utils.
|
999
|
+
index_column = Utils.parse_as_expression(index_column)
|
968
1000
|
if offset.nil?
|
969
1001
|
offset = "-#{period}"
|
970
1002
|
end
|
@@ -973,16 +1005,17 @@ module Polars
|
|
973
1005
|
period = Utils._timedelta_to_pl_duration(period)
|
974
1006
|
offset = Utils._timedelta_to_pl_duration(offset)
|
975
1007
|
|
976
|
-
lgb = _ldf.
|
977
|
-
index_column
|
1008
|
+
lgb = _ldf.group_by_rolling(
|
1009
|
+
index_column, period, offset, closed, rbexprs_by, check_sorted
|
978
1010
|
)
|
979
|
-
LazyGroupBy.new(lgb
|
1011
|
+
LazyGroupBy.new(lgb)
|
980
1012
|
end
|
1013
|
+
alias_method :groupby_rolling, :group_by_rolling
|
981
1014
|
|
982
1015
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
983
1016
|
#
|
984
1017
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
985
|
-
# normal
|
1018
|
+
# normal group by is that a row can be a member of multiple groups. The time/index
|
986
1019
|
# window could be seen as a rolling window, with a window size determined by
|
987
1020
|
# dates/times/values instead of slots in the DataFrame.
|
988
1021
|
#
|
@@ -1010,37 +1043,43 @@ module Polars
|
|
1010
1043
|
# Or combine them:
|
1011
1044
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1012
1045
|
#
|
1013
|
-
# In case of a
|
1046
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1014
1047
|
#
|
1015
1048
|
# - "1i" # length 1
|
1016
1049
|
# - "10i" # length 10
|
1017
1050
|
#
|
1018
|
-
# @param index_column
|
1051
|
+
# @param index_column [Object]
|
1019
1052
|
# Column used to group based on the time window.
|
1020
1053
|
# Often of type Date/Datetime.
|
1021
1054
|
# This column must be sorted in ascending order. If not the output will not
|
1022
1055
|
# make sense.
|
1023
1056
|
#
|
1024
|
-
# In case of a dynamic
|
1057
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1025
1058
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1026
1059
|
# performance matters use an `:i64` column.
|
1027
|
-
# @param every
|
1060
|
+
# @param every [Object]
|
1028
1061
|
# Interval of the window.
|
1029
|
-
# @param period
|
1062
|
+
# @param period [Object]
|
1030
1063
|
# Length of the window; if nil, it is equal to 'every'.
|
1031
|
-
# @param offset
|
1064
|
+
# @param offset [Object]
|
1032
1065
|
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1033
1066
|
# `every`.
|
1034
|
-
# @param truncate
|
1067
|
+
# @param truncate [Boolean]
|
1035
1068
|
# Truncate the time value to the window lower bound.
|
1036
|
-
# @param include_boundaries
|
1069
|
+
# @param include_boundaries [Boolean]
|
1037
1070
|
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1038
1071
|
# "_upper_bound" columns. This will impact performance because it's harder to
|
1039
1072
|
# parallelize.
|
1040
1073
|
# @param closed ["right", "left", "both", "none"]
|
1041
1074
|
# Define whether the temporal window interval is closed or not.
|
1042
|
-
# @param by
|
1075
|
+
# @param by [Object]
|
1043
1076
|
# Also group by this column/these columns.
|
1077
|
+
# @param check_sorted [Boolean]
|
1078
|
+
# When the `by` argument is given, polars can not check sortedness
|
1079
|
+
# by the metadata and has to do a full scan on the index column to
|
1080
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1081
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1082
|
+
# Doing so incorrectly will lead to incorrect output.
|
1044
1083
|
#
|
1045
1084
|
# @return [LazyGroupBy]
|
1046
1085
|
#
|
@@ -1072,7 +1111,7 @@ module Polars
|
|
1072
1111
|
# # └─────────────────────┴─────┘
|
1073
1112
|
#
|
1074
1113
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1075
|
-
# df.
|
1114
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
1076
1115
|
# [
|
1077
1116
|
# Polars.col("time").min.alias("time_min"),
|
1078
1117
|
# Polars.col("time").max.alias("time_max")
|
@@ -1092,7 +1131,7 @@ module Polars
|
|
1092
1131
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1093
1132
|
#
|
1094
1133
|
# @example The window boundaries can also be added to the aggregation result.
|
1095
|
-
# df.
|
1134
|
+
# df.group_by_dynamic(
|
1096
1135
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1097
1136
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
1098
1137
|
# # =>
|
@@ -1109,27 +1148,27 @@ module Polars
|
|
1109
1148
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1110
1149
|
#
|
1111
1150
|
# @example When closed="left", should not include right end of interval.
|
1112
|
-
# df.
|
1151
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
1113
1152
|
# [
|
1114
1153
|
# Polars.col("time").count.alias("time_count"),
|
1115
|
-
# Polars.col("time").
|
1154
|
+
# Polars.col("time").alias("time_agg_list")
|
1116
1155
|
# ]
|
1117
1156
|
# )
|
1118
1157
|
# # =>
|
1119
1158
|
# # shape: (4, 3)
|
1120
|
-
# #
|
1121
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
1122
|
-
# # │ --- ┆ --- ┆ ---
|
1123
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
1124
|
-
# #
|
1125
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
1126
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
1127
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
1128
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
1129
|
-
# #
|
1159
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
1160
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1161
|
+
# # │ --- ┆ --- ┆ --- │
|
1162
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1163
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
1164
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
1165
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
1166
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
1167
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1168
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
1130
1169
|
#
|
1131
1170
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1132
|
-
# df.
|
1171
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
1133
1172
|
# [Polars.col("time").count.alias("time_count")]
|
1134
1173
|
# )
|
1135
1174
|
# # =>
|
@@ -1146,7 +1185,7 @@ module Polars
|
|
1146
1185
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
1147
1186
|
# # └─────────────────────┴────────────┘
|
1148
1187
|
#
|
1149
|
-
# @example Dynamic
|
1188
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1150
1189
|
# df = Polars::DataFrame.new(
|
1151
1190
|
# {
|
1152
1191
|
# "time" => Polars.date_range(
|
@@ -1157,7 +1196,7 @@ module Polars
|
|
1157
1196
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1158
1197
|
# }
|
1159
1198
|
# )
|
1160
|
-
# df.
|
1199
|
+
# df.group_by_dynamic(
|
1161
1200
|
# "time",
|
1162
1201
|
# every: "1h",
|
1163
1202
|
# closed: "both",
|
@@ -1180,20 +1219,20 @@ module Polars
|
|
1180
1219
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
1181
1220
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1182
1221
|
#
|
1183
|
-
# @example Dynamic
|
1222
|
+
# @example Dynamic group by on an index column.
|
1184
1223
|
# df = Polars::DataFrame.new(
|
1185
1224
|
# {
|
1186
1225
|
# "idx" => Polars.arange(0, 6, eager: true),
|
1187
1226
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
1188
1227
|
# }
|
1189
1228
|
# )
|
1190
|
-
# df.
|
1229
|
+
# df.group_by_dynamic(
|
1191
1230
|
# "idx",
|
1192
1231
|
# every: "2i",
|
1193
1232
|
# period: "3i",
|
1194
1233
|
# include_boundaries: true,
|
1195
1234
|
# closed: "right"
|
1196
|
-
# ).agg(Polars.col("A").
|
1235
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
1197
1236
|
# # =>
|
1198
1237
|
# # shape: (3, 4)
|
1199
1238
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -1205,23 +1244,26 @@ module Polars
|
|
1205
1244
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1206
1245
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
1207
1246
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
1208
|
-
def
|
1247
|
+
def group_by_dynamic(
|
1209
1248
|
index_column,
|
1210
1249
|
every:,
|
1211
1250
|
period: nil,
|
1212
1251
|
offset: nil,
|
1213
|
-
truncate:
|
1252
|
+
truncate: nil,
|
1214
1253
|
include_boundaries: false,
|
1215
1254
|
closed: "left",
|
1255
|
+
label: "left",
|
1216
1256
|
by: nil,
|
1217
|
-
start_by: "window"
|
1257
|
+
start_by: "window",
|
1258
|
+
check_sorted: true
|
1218
1259
|
)
|
1260
|
+
if !truncate.nil?
|
1261
|
+
label = truncate ? "left" : "datapoint"
|
1262
|
+
end
|
1263
|
+
|
1264
|
+
index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
|
1219
1265
|
if offset.nil?
|
1220
|
-
|
1221
|
-
offset = "-#{every}"
|
1222
|
-
else
|
1223
|
-
offset = "0ns"
|
1224
|
-
end
|
1266
|
+
offset = period.nil? ? "-#{every}" : "0ns"
|
1225
1267
|
end
|
1226
1268
|
|
1227
1269
|
if period.nil?
|
@@ -1233,19 +1275,21 @@ module Polars
|
|
1233
1275
|
every = Utils._timedelta_to_pl_duration(every)
|
1234
1276
|
|
1235
1277
|
rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
|
1236
|
-
lgb = _ldf.
|
1237
|
-
index_column,
|
1278
|
+
lgb = _ldf.group_by_dynamic(
|
1279
|
+
index_column._rbexpr,
|
1238
1280
|
every,
|
1239
1281
|
period,
|
1240
1282
|
offset,
|
1241
|
-
|
1283
|
+
label,
|
1242
1284
|
include_boundaries,
|
1243
1285
|
closed,
|
1244
1286
|
rbexprs_by,
|
1245
|
-
start_by
|
1287
|
+
start_by,
|
1288
|
+
check_sorted
|
1246
1289
|
)
|
1247
|
-
LazyGroupBy.new(lgb
|
1290
|
+
LazyGroupBy.new(lgb)
|
1248
1291
|
end
|
1292
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
1249
1293
|
|
1250
1294
|
# Perform an asof join.
|
1251
1295
|
#
|
@@ -1351,7 +1395,7 @@ module Polars
|
|
1351
1395
|
if by.is_a?(String)
|
1352
1396
|
by_left_ = [by]
|
1353
1397
|
by_right_ = [by]
|
1354
|
-
elsif by.is_a?(Array)
|
1398
|
+
elsif by.is_a?(::Array)
|
1355
1399
|
by_left_ = by
|
1356
1400
|
by_right_ = by
|
1357
1401
|
end
|
@@ -1619,7 +1663,7 @@ module Polars
|
|
1619
1663
|
# # │ null │
|
1620
1664
|
# # └──────┘
|
1621
1665
|
def with_context(other)
|
1622
|
-
if !other.is_a?(Array)
|
1666
|
+
if !other.is_a?(::Array)
|
1623
1667
|
other = [other]
|
1624
1668
|
end
|
1625
1669
|
|
@@ -1705,8 +1749,10 @@ module Polars
|
|
1705
1749
|
|
1706
1750
|
# Shift the values by a given period.
|
1707
1751
|
#
|
1708
|
-
# @param
|
1752
|
+
# @param n [Integer]
|
1709
1753
|
# Number of places to shift (may be negative).
|
1754
|
+
# @param fill_value [Object]
|
1755
|
+
# Fill the resulting null values with this value.
|
1710
1756
|
#
|
1711
1757
|
# @return [LazyFrame]
|
1712
1758
|
#
|
@@ -1743,8 +1789,12 @@ module Polars
|
|
1743
1789
|
# # │ 5 ┆ 6 │
|
1744
1790
|
# # │ null ┆ null │
|
1745
1791
|
# # └──────┴──────┘
|
1746
|
-
def shift(
|
1747
|
-
|
1792
|
+
def shift(n, fill_value: nil)
|
1793
|
+
if !fill_value.nil?
|
1794
|
+
fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
|
1795
|
+
end
|
1796
|
+
n = Utils.parse_as_expression(n)
|
1797
|
+
_from_rbldf(_ldf.shift(n, fill_value))
|
1748
1798
|
end
|
1749
1799
|
|
1750
1800
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -1790,10 +1840,7 @@ module Polars
|
|
1790
1840
|
# # │ 0 ┆ 0 │
|
1791
1841
|
# # └─────┴─────┘
|
1792
1842
|
def shift_and_fill(periods, fill_value)
|
1793
|
-
|
1794
|
-
fill_value = Polars.lit(fill_value)
|
1795
|
-
end
|
1796
|
-
_from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
|
1843
|
+
shift(periods, fill_value: fill_value)
|
1797
1844
|
end
|
1798
1845
|
|
1799
1846
|
# Get a slice of this DataFrame.
|
@@ -2228,7 +2275,7 @@ module Polars
|
|
2228
2275
|
#
|
2229
2276
|
# @return [LazyFrame]
|
2230
2277
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
2231
|
-
if !subset.nil? && !subset.is_a?(Array)
|
2278
|
+
if !subset.nil? && !subset.is_a?(::Array)
|
2232
2279
|
subset = [subset]
|
2233
2280
|
end
|
2234
2281
|
_from_rbldf(_ldf.unique(maintain_order, subset, keep))
|
@@ -2261,7 +2308,7 @@ module Polars
|
|
2261
2308
|
# # │ 3 ┆ 8 ┆ c │
|
2262
2309
|
# # └─────┴─────┴─────┘
|
2263
2310
|
def drop_nulls(subset: nil)
|
2264
|
-
if !subset.nil? && !subset.is_a?(Array)
|
2311
|
+
if !subset.nil? && !subset.is_a?(::Array)
|
2265
2312
|
subset = [subset]
|
2266
2313
|
end
|
2267
2314
|
_from_rbldf(_ldf.drop_nulls(subset))
|
@@ -2351,16 +2398,16 @@ module Polars
|
|
2351
2398
|
# df.interpolate.collect
|
2352
2399
|
# # =>
|
2353
2400
|
# # shape: (4, 3)
|
2354
|
-
# #
|
2355
|
-
# # │ foo
|
2356
|
-
# # │ ---
|
2357
|
-
# # │
|
2358
|
-
# #
|
2359
|
-
# # │ 1
|
2360
|
-
# # │ 5
|
2361
|
-
# # │ 9
|
2362
|
-
# # │ 10
|
2363
|
-
# #
|
2401
|
+
# # ┌──────┬──────┬──────────┐
|
2402
|
+
# # │ foo ┆ bar ┆ baz │
|
2403
|
+
# # │ --- ┆ --- ┆ --- │
|
2404
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
2405
|
+
# # ╞══════╪══════╪══════════╡
|
2406
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
2407
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
2408
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
2409
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
2410
|
+
# # └──────┴──────┴──────────┘
|
2364
2411
|
def interpolate
|
2365
2412
|
select(Utils.col("*").interpolate)
|
2366
2413
|
end
|
@@ -2423,6 +2470,38 @@ module Polars
|
|
2423
2470
|
_from_rbldf(_ldf.unnest(names))
|
2424
2471
|
end
|
2425
2472
|
|
2473
|
+
# TODO
|
2474
|
+
# def merge_sorted
|
2475
|
+
# end
|
2476
|
+
|
2477
|
+
# Indicate that one or multiple columns are sorted.
|
2478
|
+
#
|
2479
|
+
# @param column [Object]
|
2480
|
+
# Column(s) that are sorted.
|
2481
|
+
# @param more_columns [Object]
|
2482
|
+
# Additional columns that are sorted, specified as positional arguments.
|
2483
|
+
# @param descending [Boolean]
|
2484
|
+
# Whether the columns are sorted in descending order.
|
2485
|
+
#
|
2486
|
+
# @return [LazyFrame]
|
2487
|
+
def set_sorted(
|
2488
|
+
column,
|
2489
|
+
*more_columns,
|
2490
|
+
descending: false
|
2491
|
+
)
|
2492
|
+
columns = Utils.selection_to_rbexpr_list(column)
|
2493
|
+
if more_columns.any?
|
2494
|
+
columns.concat(Utils.selection_to_rbexpr_list(more_columns))
|
2495
|
+
end
|
2496
|
+
with_columns(
|
2497
|
+
columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
|
2498
|
+
)
|
2499
|
+
end
|
2500
|
+
|
2501
|
+
# TODO
|
2502
|
+
# def update
|
2503
|
+
# end
|
2504
|
+
|
2426
2505
|
private
|
2427
2506
|
|
2428
2507
|
def initialize_copy(other)
|