polars-df 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +468 -538
- data/Cargo.toml +1 -0
- data/README.md +8 -7
- data/ext/polars/Cargo.toml +17 -10
- data/ext/polars/src/batched_csv.rs +26 -26
- data/ext/polars/src/conversion.rs +121 -93
- data/ext/polars/src/dataframe.rs +116 -71
- data/ext/polars/src/error.rs +0 -5
- data/ext/polars/src/expr/binary.rs +18 -6
- data/ext/polars/src/expr/datetime.rs +10 -12
- data/ext/polars/src/expr/general.rs +68 -284
- data/ext/polars/src/expr/list.rs +17 -9
- data/ext/polars/src/{expr.rs → expr/mod.rs} +4 -2
- data/ext/polars/src/expr/name.rs +44 -0
- data/ext/polars/src/expr/rolling.rs +196 -0
- data/ext/polars/src/expr/string.rs +85 -58
- data/ext/polars/src/file.rs +3 -3
- data/ext/polars/src/functions/aggregation.rs +35 -0
- data/ext/polars/src/functions/eager.rs +7 -31
- data/ext/polars/src/functions/io.rs +10 -10
- data/ext/polars/src/functions/lazy.rs +66 -41
- data/ext/polars/src/functions/meta.rs +30 -0
- data/ext/polars/src/functions/misc.rs +8 -0
- data/ext/polars/src/functions/mod.rs +5 -0
- data/ext/polars/src/functions/random.rs +6 -0
- data/ext/polars/src/functions/range.rs +46 -0
- data/ext/polars/src/functions/string_cache.rs +11 -0
- data/ext/polars/src/functions/whenthen.rs +7 -7
- data/ext/polars/src/lazyframe.rs +47 -42
- data/ext/polars/src/lib.rs +156 -72
- data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
- data/ext/polars/src/{apply → map}/mod.rs +3 -3
- data/ext/polars/src/{apply → map}/series.rs +12 -16
- data/ext/polars/src/object.rs +1 -1
- data/ext/polars/src/rb_modules.rs +22 -7
- data/ext/polars/src/series/construction.rs +4 -4
- data/ext/polars/src/series/export.rs +2 -2
- data/ext/polars/src/series/set_at_idx.rs +33 -17
- data/ext/polars/src/series.rs +7 -27
- data/ext/polars/src/sql.rs +46 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +115 -82
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +5 -25
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +177 -94
- data/lib/polars/functions.rb +29 -37
- data/lib/polars/group_by.rb +38 -55
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +93 -66
- data/lib/polars/lazy_functions.rb +36 -48
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +12 -8
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +26 -13
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +114 -60
- data/lib/polars/string_name_space.rb +19 -4
- data/lib/polars/utils.rb +12 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +18 -7
- /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
data/lib/polars/lazy_frame.rb
CHANGED
@@ -97,7 +97,8 @@ module Polars
|
|
97
97
|
row_count_offset: 0,
|
98
98
|
storage_options: nil,
|
99
99
|
low_memory: false,
|
100
|
-
use_statistics: true
|
100
|
+
use_statistics: true,
|
101
|
+
hive_partitioning: true
|
101
102
|
)
|
102
103
|
_from_rbldf(
|
103
104
|
RbLazyFrame.new_from_parquet(
|
@@ -108,7 +109,8 @@ module Polars
|
|
108
109
|
rechunk,
|
109
110
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
110
111
|
low_memory,
|
111
|
-
use_statistics
|
112
|
+
use_statistics,
|
113
|
+
hive_partitioning
|
112
114
|
)
|
113
115
|
)
|
114
116
|
end
|
@@ -350,6 +352,7 @@ module Polars
|
|
350
352
|
slice_pushdown,
|
351
353
|
common_subplan_elimination,
|
352
354
|
allow_streaming,
|
355
|
+
false
|
353
356
|
)
|
354
357
|
|
355
358
|
ldf.describe_optimized_plan
|
@@ -445,7 +448,7 @@ module Polars
|
|
445
448
|
# "c" => [6, 5, 4, 3, 2, 1]
|
446
449
|
# }
|
447
450
|
# ).lazy
|
448
|
-
# df.
|
451
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
|
449
452
|
# # =>
|
450
453
|
# # shape: (3, 3)
|
451
454
|
# # ┌─────┬─────┬─────┐
|
@@ -466,7 +469,8 @@ module Polars
|
|
466
469
|
no_optimization: false,
|
467
470
|
slice_pushdown: true,
|
468
471
|
common_subplan_elimination: true,
|
469
|
-
allow_streaming: false
|
472
|
+
allow_streaming: false,
|
473
|
+
_eager: false
|
470
474
|
)
|
471
475
|
if no_optimization
|
472
476
|
predicate_pushdown = false
|
@@ -486,7 +490,8 @@ module Polars
|
|
486
490
|
simplify_expression,
|
487
491
|
slice_pushdown,
|
488
492
|
common_subplan_elimination,
|
489
|
-
allow_streaming
|
493
|
+
allow_streaming,
|
494
|
+
_eager
|
490
495
|
)
|
491
496
|
Utils.wrap_df(ldf.collect)
|
492
497
|
end
|
@@ -568,7 +573,8 @@ module Polars
|
|
568
573
|
simplify_expression,
|
569
574
|
slice_pushdown,
|
570
575
|
false,
|
571
|
-
true
|
576
|
+
true,
|
577
|
+
false
|
572
578
|
)
|
573
579
|
lf.sink_parquet(
|
574
580
|
path,
|
@@ -623,7 +629,7 @@ module Polars
|
|
623
629
|
# "c" => [6, 5, 4, 3, 2, 1]
|
624
630
|
# }
|
625
631
|
# ).lazy
|
626
|
-
# df.
|
632
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
627
633
|
# # =>
|
628
634
|
# # shape: (2, 3)
|
629
635
|
# # ┌─────┬─────┬─────┐
|
@@ -660,7 +666,8 @@ module Polars
|
|
660
666
|
simplify_expression,
|
661
667
|
slice_pushdown,
|
662
668
|
common_subplan_elimination,
|
663
|
-
allow_streaming
|
669
|
+
allow_streaming,
|
670
|
+
false
|
664
671
|
)
|
665
672
|
Utils.wrap_df(ldf.fetch(n_rows))
|
666
673
|
end
|
@@ -853,13 +860,13 @@ module Polars
|
|
853
860
|
_from_rbldf(_ldf.select(exprs))
|
854
861
|
end
|
855
862
|
|
856
|
-
# Start a
|
863
|
+
# Start a group by operation.
|
857
864
|
#
|
858
865
|
# @param by [Object]
|
859
866
|
# Column(s) to group by.
|
860
867
|
# @param maintain_order [Boolean]
|
861
868
|
# Make sure that the order of the groups remains consistent. This is more
|
862
|
-
# expensive than a default
|
869
|
+
# expensive than a default group by.
|
863
870
|
#
|
864
871
|
# @return [LazyGroupBy]
|
865
872
|
#
|
@@ -871,7 +878,7 @@ module Polars
|
|
871
878
|
# "c" => [6, 5, 4, 3, 2, 1]
|
872
879
|
# }
|
873
880
|
# ).lazy
|
874
|
-
# df.
|
881
|
+
# df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
875
882
|
# # =>
|
876
883
|
# # shape: (3, 2)
|
877
884
|
# # ┌─────┬─────┐
|
@@ -883,19 +890,21 @@ module Polars
|
|
883
890
|
# # │ b ┆ 11 │
|
884
891
|
# # │ c ┆ 6 │
|
885
892
|
# # └─────┴─────┘
|
886
|
-
def
|
893
|
+
def group_by(by, maintain_order: false)
|
887
894
|
rbexprs_by = Utils.selection_to_rbexpr_list(by)
|
888
|
-
lgb = _ldf.
|
889
|
-
LazyGroupBy.new(lgb
|
895
|
+
lgb = _ldf.group_by(rbexprs_by, maintain_order)
|
896
|
+
LazyGroupBy.new(lgb)
|
890
897
|
end
|
898
|
+
alias_method :groupby, :group_by
|
899
|
+
alias_method :group, :group_by
|
891
900
|
|
892
901
|
# Create rolling groups based on a time column.
|
893
902
|
#
|
894
903
|
# Also works for index values of type `:i32` or `:i64`.
|
895
904
|
#
|
896
|
-
# Different from a `
|
905
|
+
# Different from a `group_by_dynamic` the windows are now determined by the
|
897
906
|
# individual values and are not of constant intervals. For constant intervals
|
898
|
-
# use *
|
907
|
+
# use *group_by_dynamic*.
|
899
908
|
#
|
900
909
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
901
910
|
# by using the following string language:
|
@@ -915,7 +924,7 @@ module Polars
|
|
915
924
|
# Or combine them:
|
916
925
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
917
926
|
#
|
918
|
-
# In case of a
|
927
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
919
928
|
#
|
920
929
|
# - "1i" # length 1
|
921
930
|
# - "10i" # length 10
|
@@ -926,7 +935,7 @@ module Polars
|
|
926
935
|
# This column must be sorted in ascending order. If not the output will not
|
927
936
|
# make sense.
|
928
937
|
#
|
929
|
-
# In case of a rolling
|
938
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
930
939
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
931
940
|
# performance matters use an `:i64` column.
|
932
941
|
# @param period [Object]
|
@@ -958,7 +967,7 @@ module Polars
|
|
958
967
|
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
959
968
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
960
969
|
# )
|
961
|
-
# df.
|
970
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
962
971
|
# [
|
963
972
|
# Polars.sum("a").alias("sum_a"),
|
964
973
|
# Polars.min("a").alias("min_a"),
|
@@ -979,7 +988,7 @@ module Polars
|
|
979
988
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
980
989
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
981
990
|
# # └─────────────────────┴───────┴───────┴───────┘
|
982
|
-
def
|
991
|
+
def group_by_rolling(
|
983
992
|
index_column:,
|
984
993
|
period:,
|
985
994
|
offset: nil,
|
@@ -987,7 +996,7 @@ module Polars
|
|
987
996
|
by: nil,
|
988
997
|
check_sorted: true
|
989
998
|
)
|
990
|
-
index_column = Utils.
|
999
|
+
index_column = Utils.parse_as_expression(index_column)
|
991
1000
|
if offset.nil?
|
992
1001
|
offset = "-#{period}"
|
993
1002
|
end
|
@@ -996,16 +1005,17 @@ module Polars
|
|
996
1005
|
period = Utils._timedelta_to_pl_duration(period)
|
997
1006
|
offset = Utils._timedelta_to_pl_duration(offset)
|
998
1007
|
|
999
|
-
lgb = _ldf.
|
1000
|
-
index_column
|
1008
|
+
lgb = _ldf.group_by_rolling(
|
1009
|
+
index_column, period, offset, closed, rbexprs_by, check_sorted
|
1001
1010
|
)
|
1002
|
-
LazyGroupBy.new(lgb
|
1011
|
+
LazyGroupBy.new(lgb)
|
1003
1012
|
end
|
1013
|
+
alias_method :groupby_rolling, :group_by_rolling
|
1004
1014
|
|
1005
1015
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1006
1016
|
#
|
1007
1017
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
1008
|
-
# normal
|
1018
|
+
# normal group by is that a row can be a member of multiple groups. The time/index
|
1009
1019
|
# window could be seen as a rolling window, with a window size determined by
|
1010
1020
|
# dates/times/values instead of slots in the DataFrame.
|
1011
1021
|
#
|
@@ -1033,37 +1043,43 @@ module Polars
|
|
1033
1043
|
# Or combine them:
|
1034
1044
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1035
1045
|
#
|
1036
|
-
# In case of a
|
1046
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1037
1047
|
#
|
1038
1048
|
# - "1i" # length 1
|
1039
1049
|
# - "10i" # length 10
|
1040
1050
|
#
|
1041
|
-
# @param index_column
|
1051
|
+
# @param index_column [Object]
|
1042
1052
|
# Column used to group based on the time window.
|
1043
1053
|
# Often of type Date/Datetime.
|
1044
1054
|
# This column must be sorted in ascending order. If not the output will not
|
1045
1055
|
# make sense.
|
1046
1056
|
#
|
1047
|
-
# In case of a dynamic
|
1057
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1048
1058
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1049
1059
|
# performance matters use an `:i64` column.
|
1050
|
-
# @param every
|
1060
|
+
# @param every [Object]
|
1051
1061
|
# Interval of the window.
|
1052
|
-
# @param period
|
1062
|
+
# @param period [Object]
|
1053
1063
|
# Length of the window; if nil, it is equal to `every`.
|
1054
|
-
# @param offset
|
1064
|
+
# @param offset [Object]
|
1055
1065
|
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1056
1066
|
# `every`.
|
1057
|
-
# @param truncate
|
1067
|
+
# @param truncate [Boolean]
|
1058
1068
|
# Truncate the time value to the window lower bound.
|
1059
|
-
# @param include_boundaries
|
1069
|
+
# @param include_boundaries [Boolean]
|
1060
1070
|
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1061
1071
|
# "_upper_bound" columns. This will impact performance because it's harder to
|
1062
1072
|
# parallelize
|
1063
1073
|
# @param closed ["right", "left", "both", "none"]
|
1064
1074
|
# Define whether the temporal window interval is closed or not.
|
1065
|
-
# @param by
|
1075
|
+
# @param by [Object]
|
1066
1076
|
# Also group by this column/these columns
|
1077
|
+
# @param check_sorted [Boolean]
|
1078
|
+
# When the `by` argument is given, polars can not check sortedness
|
1079
|
+
# by the metadata and has to do a full scan on the index column to
|
1080
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1081
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1082
|
+
# Doing so incorrectly will lead to incorrect output.
|
1067
1083
|
#
|
1068
1084
|
# @return [LazyGroupBy]
|
1069
1085
|
#
|
@@ -1095,7 +1111,7 @@ module Polars
|
|
1095
1111
|
# # └─────────────────────┴─────┘
|
1096
1112
|
#
|
1097
1113
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1098
|
-
# df.
|
1114
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
1099
1115
|
# [
|
1100
1116
|
# Polars.col("time").min.alias("time_min"),
|
1101
1117
|
# Polars.col("time").max.alias("time_max")
|
@@ -1115,7 +1131,7 @@ module Polars
|
|
1115
1131
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1116
1132
|
#
|
1117
1133
|
# @example The window boundaries can also be added to the aggregation result.
|
1118
|
-
# df.
|
1134
|
+
# df.group_by_dynamic(
|
1119
1135
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1120
1136
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
1121
1137
|
# # =>
|
@@ -1132,7 +1148,7 @@ module Polars
|
|
1132
1148
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1133
1149
|
#
|
1134
1150
|
# @example When closed="left", should not include right end of interval.
|
1135
|
-
# df.
|
1151
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
1136
1152
|
# [
|
1137
1153
|
# Polars.col("time").count.alias("time_count"),
|
1138
1154
|
# Polars.col("time").alias("time_agg_list")
|
@@ -1152,7 +1168,7 @@ module Polars
|
|
1152
1168
|
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
1153
1169
|
#
|
1154
1170
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1155
|
-
# df.
|
1171
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
1156
1172
|
# [Polars.col("time").count.alias("time_count")]
|
1157
1173
|
# )
|
1158
1174
|
# # =>
|
@@ -1169,7 +1185,7 @@ module Polars
|
|
1169
1185
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
1170
1186
|
# # └─────────────────────┴────────────┘
|
1171
1187
|
#
|
1172
|
-
# @example Dynamic
|
1188
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1173
1189
|
# df = Polars::DataFrame.new(
|
1174
1190
|
# {
|
1175
1191
|
# "time" => Polars.date_range(
|
@@ -1180,7 +1196,7 @@ module Polars
|
|
1180
1196
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1181
1197
|
# }
|
1182
1198
|
# )
|
1183
|
-
# df.
|
1199
|
+
# df.group_by_dynamic(
|
1184
1200
|
# "time",
|
1185
1201
|
# every: "1h",
|
1186
1202
|
# closed: "both",
|
@@ -1203,14 +1219,14 @@ module Polars
|
|
1203
1219
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
1204
1220
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1205
1221
|
#
|
1206
|
-
# @example Dynamic
|
1222
|
+
# @example Dynamic group by on an index column.
|
1207
1223
|
# df = Polars::DataFrame.new(
|
1208
1224
|
# {
|
1209
1225
|
# "idx" => Polars.arange(0, 6, eager: true),
|
1210
1226
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
1211
1227
|
# }
|
1212
1228
|
# )
|
1213
|
-
# df.
|
1229
|
+
# df.group_by_dynamic(
|
1214
1230
|
# "idx",
|
1215
1231
|
# every: "2i",
|
1216
1232
|
# period: "3i",
|
@@ -1228,17 +1244,23 @@ module Polars
|
|
1228
1244
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1229
1245
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
1230
1246
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
1231
|
-
def
|
1247
|
+
def group_by_dynamic(
|
1232
1248
|
index_column,
|
1233
1249
|
every:,
|
1234
1250
|
period: nil,
|
1235
1251
|
offset: nil,
|
1236
|
-
truncate:
|
1252
|
+
truncate: nil,
|
1237
1253
|
include_boundaries: false,
|
1238
1254
|
closed: "left",
|
1255
|
+
label: "left",
|
1239
1256
|
by: nil,
|
1240
|
-
start_by: "window"
|
1257
|
+
start_by: "window",
|
1258
|
+
check_sorted: true
|
1241
1259
|
)
|
1260
|
+
if !truncate.nil?
|
1261
|
+
label = truncate ? "left" : "datapoint"
|
1262
|
+
end
|
1263
|
+
|
1242
1264
|
index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
|
1243
1265
|
if offset.nil?
|
1244
1266
|
offset = period.nil? ? "-#{every}" : "0ns"
|
@@ -1253,19 +1275,21 @@ module Polars
|
|
1253
1275
|
every = Utils._timedelta_to_pl_duration(every)
|
1254
1276
|
|
1255
1277
|
rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
|
1256
|
-
lgb = _ldf.
|
1278
|
+
lgb = _ldf.group_by_dynamic(
|
1257
1279
|
index_column._rbexpr,
|
1258
1280
|
every,
|
1259
1281
|
period,
|
1260
1282
|
offset,
|
1261
|
-
|
1283
|
+
label,
|
1262
1284
|
include_boundaries,
|
1263
1285
|
closed,
|
1264
1286
|
rbexprs_by,
|
1265
|
-
start_by
|
1287
|
+
start_by,
|
1288
|
+
check_sorted
|
1266
1289
|
)
|
1267
|
-
LazyGroupBy.new(lgb
|
1290
|
+
LazyGroupBy.new(lgb)
|
1268
1291
|
end
|
1292
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
1269
1293
|
|
1270
1294
|
# Perform an asof join.
|
1271
1295
|
#
|
@@ -1725,8 +1749,10 @@ module Polars
|
|
1725
1749
|
|
1726
1750
|
# Shift the values by a given period.
|
1727
1751
|
#
|
1728
|
-
# @param
|
1752
|
+
# @param n [Integer]
|
1729
1753
|
# Number of places to shift (may be negative).
|
1754
|
+
# @param fill_value [Object]
|
1755
|
+
# Fill the resulting null values with this value.
|
1730
1756
|
#
|
1731
1757
|
# @return [LazyFrame]
|
1732
1758
|
#
|
@@ -1763,8 +1789,12 @@ module Polars
|
|
1763
1789
|
# # │ 5 ┆ 6 │
|
1764
1790
|
# # │ null ┆ null │
|
1765
1791
|
# # └──────┴──────┘
|
1766
|
-
def shift(
|
1767
|
-
|
1792
|
+
def shift(n, fill_value: nil)
|
1793
|
+
if !fill_value.nil?
|
1794
|
+
fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
|
1795
|
+
end
|
1796
|
+
n = Utils.parse_as_expression(n)
|
1797
|
+
_from_rbldf(_ldf.shift(n, fill_value))
|
1768
1798
|
end
|
1769
1799
|
|
1770
1800
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -1810,10 +1840,7 @@ module Polars
|
|
1810
1840
|
# # │ 0 ┆ 0 │
|
1811
1841
|
# # └─────┴─────┘
|
1812
1842
|
def shift_and_fill(periods, fill_value)
|
1813
|
-
|
1814
|
-
fill_value = Polars.lit(fill_value)
|
1815
|
-
end
|
1816
|
-
_from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
|
1843
|
+
shift(periods, fill_value: fill_value)
|
1817
1844
|
end
|
1818
1845
|
|
1819
1846
|
# Get a slice of this DataFrame.
|
@@ -2371,16 +2398,16 @@ module Polars
|
|
2371
2398
|
# df.interpolate.collect
|
2372
2399
|
# # =>
|
2373
2400
|
# # shape: (4, 3)
|
2374
|
-
# #
|
2375
|
-
# # │ foo
|
2376
|
-
# # │ ---
|
2377
|
-
# # │
|
2378
|
-
# #
|
2379
|
-
# # │ 1
|
2380
|
-
# # │ 5
|
2381
|
-
# # │ 9
|
2382
|
-
# # │ 10
|
2383
|
-
# #
|
2401
|
+
# # ┌──────┬──────┬──────────┐
|
2402
|
+
# # │ foo ┆ bar ┆ baz │
|
2403
|
+
# # │ --- ┆ --- ┆ --- │
|
2404
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
2405
|
+
# # ╞══════╪══════╪══════════╡
|
2406
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
2407
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
2408
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
2409
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
2410
|
+
# # └──────┴──────┴──────────┘
|
2384
2411
|
def interpolate
|
2385
2412
|
select(Utils.col("*").interpolate)
|
2386
2413
|
end
|
@@ -43,7 +43,7 @@ module Polars
|
|
43
43
|
# # ┌─────┬─────┬────────────┐
|
44
44
|
# # │ a ┆ b ┆ rank │
|
45
45
|
# # │ --- ┆ --- ┆ --- │
|
46
|
-
# # │ i64 ┆ i64 ┆ list[
|
46
|
+
# # │ i64 ┆ i64 ┆ list[f64] │
|
47
47
|
# # ╞═════╪═════╪════════════╡
|
48
48
|
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
|
49
49
|
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
|
@@ -158,7 +158,7 @@ module Polars
|
|
158
158
|
col(column.to_s).sum
|
159
159
|
elsif column.is_a?(::Array)
|
160
160
|
exprs = Utils.selection_to_rbexpr_list(column)
|
161
|
-
Utils.wrap_expr(
|
161
|
+
Utils.wrap_expr(_sum_horizontal(exprs))
|
162
162
|
else
|
163
163
|
fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
|
164
164
|
end
|
@@ -625,16 +625,16 @@ module Polars
|
|
625
625
|
# This can be used in a `select`, `with_column`, etc. Be sure that the resulting
|
626
626
|
# range size is equal to the length of the DataFrame you are collecting.
|
627
627
|
#
|
628
|
-
# @param
|
628
|
+
# @param start [Integer, Expr, Series]
|
629
629
|
# Lower bound of range.
|
630
|
-
# @param
|
630
|
+
# @param stop [Integer, Expr, Series]
|
631
631
|
# Upper bound of range.
|
632
632
|
# @param step [Integer]
|
633
633
|
# Step size of the range.
|
634
634
|
# @param eager [Boolean]
|
635
635
|
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
636
636
|
# @param dtype [Symbol]
|
637
|
-
# Apply an explicit integer dtype to the resulting expression (default is
|
637
|
+
# Apply an explicit integer dtype to the resulting expression (default is `Int64`).
|
638
638
|
#
|
639
639
|
# @return [Expr, Series]
|
640
640
|
#
|
@@ -648,35 +648,20 @@ module Polars
|
|
648
648
|
# # 1
|
649
649
|
# # 2
|
650
650
|
# # ]
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
# # ┌───────────┐
|
658
|
-
# # │ arange │
|
659
|
-
# # │ --- │
|
660
|
-
# # │ list[i64] │
|
661
|
-
# # ╞═══════════╡
|
662
|
-
# # │ [1, 2] │
|
663
|
-
# # │ [2, 3] │
|
664
|
-
# # └───────────┘
|
665
|
-
def arange(low, high, step: 1, eager: false, dtype: nil)
|
666
|
-
low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
|
667
|
-
high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
|
668
|
-
range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))
|
669
|
-
|
670
|
-
if !dtype.nil? && !["i64", Int64].include?(dtype)
|
671
|
-
range_expr = range_expr.cast(dtype)
|
672
|
-
end
|
651
|
+
def int_range(start, stop, step: 1, eager: false, dtype: nil)
|
652
|
+
start = Utils.parse_as_expression(start)
|
653
|
+
stop = Utils.parse_as_expression(stop)
|
654
|
+
dtype ||= Int64
|
655
|
+
dtype = dtype.to_s if dtype.is_a?(Symbol)
|
656
|
+
result = Utils.wrap_expr(RbExpr.int_range(start, stop, step, dtype)).alias("arange")
|
673
657
|
|
674
|
-
if
|
675
|
-
|
676
|
-
else
|
677
|
-
DataFrame.new.select(range_expr.alias("arange")).to_series
|
658
|
+
if eager
|
659
|
+
return select(result).to_series
|
678
660
|
end
|
661
|
+
|
662
|
+
result
|
679
663
|
end
|
664
|
+
alias_method :arange, :int_range
|
680
665
|
|
681
666
|
# Find the indexes that would sort the columns.
|
682
667
|
#
|
@@ -735,15 +720,22 @@ module Polars
|
|
735
720
|
# # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
|
736
721
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
|
737
722
|
def duration(
|
723
|
+
weeks: nil,
|
738
724
|
days: nil,
|
725
|
+
hours: nil,
|
726
|
+
minutes: nil,
|
739
727
|
seconds: nil,
|
740
|
-
nanoseconds: nil,
|
741
|
-
microseconds: nil,
|
742
728
|
milliseconds: nil,
|
743
|
-
|
744
|
-
|
745
|
-
|
729
|
+
microseconds: nil,
|
730
|
+
nanoseconds: nil,
|
731
|
+
time_unit: "us"
|
746
732
|
)
|
733
|
+
if !weeks.nil?
|
734
|
+
weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
|
735
|
+
end
|
736
|
+
if !days.nil?
|
737
|
+
days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
|
738
|
+
end
|
747
739
|
if !hours.nil?
|
748
740
|
hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
|
749
741
|
end
|
@@ -762,23 +754,18 @@ module Polars
|
|
762
754
|
if !nanoseconds.nil?
|
763
755
|
nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
|
764
756
|
end
|
765
|
-
if !days.nil?
|
766
|
-
days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
|
767
|
-
end
|
768
|
-
if !weeks.nil?
|
769
|
-
weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
|
770
|
-
end
|
771
757
|
|
772
758
|
Utils.wrap_expr(
|
773
759
|
_rb_duration(
|
760
|
+
weeks,
|
774
761
|
days,
|
762
|
+
hours,
|
763
|
+
minutes,
|
775
764
|
seconds,
|
776
|
-
nanoseconds,
|
777
|
-
microseconds,
|
778
765
|
milliseconds,
|
779
|
-
|
780
|
-
|
781
|
-
|
766
|
+
microseconds,
|
767
|
+
nanoseconds,
|
768
|
+
time_unit
|
782
769
|
)
|
783
770
|
)
|
784
771
|
end
|
@@ -944,7 +931,8 @@ module Polars
|
|
944
931
|
simplify_expression,
|
945
932
|
slice_pushdown,
|
946
933
|
common_subplan_elimination,
|
947
|
-
allow_streaming
|
934
|
+
allow_streaming,
|
935
|
+
false
|
948
936
|
)
|
949
937
|
prepared << ldf
|
950
938
|
end
|
data/lib/polars/lazy_group_by.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
module Polars
|
2
|
-
# Created by `df.lazy.
|
2
|
+
# Created by `df.lazy.group_by("foo")`.
|
3
3
|
class LazyGroupBy
|
4
4
|
# @private
|
5
|
-
def initialize(lgb
|
5
|
+
def initialize(lgb)
|
6
6
|
@lgb = lgb
|
7
|
-
@lazyframe_class = lazyframe_class
|
8
7
|
end
|
9
8
|
|
10
9
|
# Describe the aggregations that need to be done on a group.
|
@@ -12,7 +11,7 @@ module Polars
|
|
12
11
|
# @return [LazyFrame]
|
13
12
|
def agg(aggs)
|
14
13
|
rbexprs = Utils.selection_to_rbexpr_list(aggs)
|
15
|
-
|
14
|
+
Utils.wrap_ldf(@lgb.agg(rbexprs))
|
16
15
|
end
|
17
16
|
|
18
17
|
# Get the first `n` rows of each group.
|
@@ -29,7 +28,7 @@ module Polars
|
|
29
28
|
# "nrs" => [1, 2, 3, 4, 5, 6]
|
30
29
|
# }
|
31
30
|
# )
|
32
|
-
# df.
|
31
|
+
# df.group_by("letters").head(2).sort("letters")
|
33
32
|
# # =>
|
34
33
|
# # shape: (5, 2)
|
35
34
|
# # ┌─────────┬─────┐
|
@@ -44,7 +43,7 @@ module Polars
|
|
44
43
|
# # │ c ┆ 2 │
|
45
44
|
# # └─────────┴─────┘
|
46
45
|
def head(n = 5)
|
47
|
-
|
46
|
+
Utils.wrap_ldf(@lgb.head(n))
|
48
47
|
end
|
49
48
|
|
50
49
|
# Get the last `n` rows of each group.
|
@@ -61,7 +60,7 @@ module Polars
|
|
61
60
|
# "nrs" => [1, 2, 3, 4, 5, 6]
|
62
61
|
# }
|
63
62
|
# )
|
64
|
-
# df.
|
63
|
+
# df.group_by("letters").tail(2).sort("letters")
|
65
64
|
# # =>
|
66
65
|
# # shape: (5, 2)
|
67
66
|
# # ┌─────────┬─────┐
|
@@ -76,7 +75,7 @@ module Polars
|
|
76
75
|
# # │ c ┆ 4 │
|
77
76
|
# # └─────────┴─────┘
|
78
77
|
def tail(n = 5)
|
79
|
-
|
78
|
+
Utils.wrap_ldf(@lgb.tail(n))
|
80
79
|
end
|
81
80
|
|
82
81
|
# def apply
|