polars-df 0.6.0-x86_64-darwin → 0.7.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +468 -538
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +4896 -5867
- data/README.md +8 -7
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +115 -82
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +5 -25
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +177 -94
- data/lib/polars/functions.rb +29 -37
- data/lib/polars/group_by.rb +38 -55
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +93 -66
- data/lib/polars/lazy_functions.rb +36 -48
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +12 -8
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +26 -13
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +114 -60
- data/lib/polars/string_name_space.rb +19 -4
- data/lib/polars/utils.rb +12 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +5 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -97,7 +97,8 @@ module Polars
|
|
97
97
|
row_count_offset: 0,
|
98
98
|
storage_options: nil,
|
99
99
|
low_memory: false,
|
100
|
-
use_statistics: true
|
100
|
+
use_statistics: true,
|
101
|
+
hive_partitioning: true
|
101
102
|
)
|
102
103
|
_from_rbldf(
|
103
104
|
RbLazyFrame.new_from_parquet(
|
@@ -108,7 +109,8 @@ module Polars
|
|
108
109
|
rechunk,
|
109
110
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
110
111
|
low_memory,
|
111
|
-
use_statistics
|
112
|
+
use_statistics,
|
113
|
+
hive_partitioning
|
112
114
|
)
|
113
115
|
)
|
114
116
|
end
|
@@ -350,6 +352,7 @@ module Polars
|
|
350
352
|
slice_pushdown,
|
351
353
|
common_subplan_elimination,
|
352
354
|
allow_streaming,
|
355
|
+
false
|
353
356
|
)
|
354
357
|
|
355
358
|
ldf.describe_optimized_plan
|
@@ -445,7 +448,7 @@ module Polars
|
|
445
448
|
# "c" => [6, 5, 4, 3, 2, 1]
|
446
449
|
# }
|
447
450
|
# ).lazy
|
448
|
-
# df.
|
451
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
|
449
452
|
# # =>
|
450
453
|
# # shape: (3, 3)
|
451
454
|
# # ┌─────┬─────┬─────┐
|
@@ -466,7 +469,8 @@ module Polars
|
|
466
469
|
no_optimization: false,
|
467
470
|
slice_pushdown: true,
|
468
471
|
common_subplan_elimination: true,
|
469
|
-
allow_streaming: false
|
472
|
+
allow_streaming: false,
|
473
|
+
_eager: false
|
470
474
|
)
|
471
475
|
if no_optimization
|
472
476
|
predicate_pushdown = false
|
@@ -486,7 +490,8 @@ module Polars
|
|
486
490
|
simplify_expression,
|
487
491
|
slice_pushdown,
|
488
492
|
common_subplan_elimination,
|
489
|
-
allow_streaming
|
493
|
+
allow_streaming,
|
494
|
+
_eager
|
490
495
|
)
|
491
496
|
Utils.wrap_df(ldf.collect)
|
492
497
|
end
|
@@ -568,7 +573,8 @@ module Polars
|
|
568
573
|
simplify_expression,
|
569
574
|
slice_pushdown,
|
570
575
|
false,
|
571
|
-
true
|
576
|
+
true,
|
577
|
+
false
|
572
578
|
)
|
573
579
|
lf.sink_parquet(
|
574
580
|
path,
|
@@ -623,7 +629,7 @@ module Polars
|
|
623
629
|
# "c" => [6, 5, 4, 3, 2, 1]
|
624
630
|
# }
|
625
631
|
# ).lazy
|
626
|
-
# df.
|
632
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
627
633
|
# # =>
|
628
634
|
# # shape: (2, 3)
|
629
635
|
# # ┌─────┬─────┬─────┐
|
@@ -660,7 +666,8 @@ module Polars
|
|
660
666
|
simplify_expression,
|
661
667
|
slice_pushdown,
|
662
668
|
common_subplan_elimination,
|
663
|
-
allow_streaming
|
669
|
+
allow_streaming,
|
670
|
+
false
|
664
671
|
)
|
665
672
|
Utils.wrap_df(ldf.fetch(n_rows))
|
666
673
|
end
|
@@ -853,13 +860,13 @@ module Polars
|
|
853
860
|
_from_rbldf(_ldf.select(exprs))
|
854
861
|
end
|
855
862
|
|
856
|
-
# Start a
|
863
|
+
# Start a group by operation.
|
857
864
|
#
|
858
865
|
# @param by [Object]
|
859
866
|
# Column(s) to group by.
|
860
867
|
# @param maintain_order [Boolean]
|
861
868
|
# Make sure that the order of the groups remains consistent. This is more
|
862
|
-
# expensive than a default
|
869
|
+
# expensive than a default group by.
|
863
870
|
#
|
864
871
|
# @return [LazyGroupBy]
|
865
872
|
#
|
@@ -871,7 +878,7 @@ module Polars
|
|
871
878
|
# "c" => [6, 5, 4, 3, 2, 1]
|
872
879
|
# }
|
873
880
|
# ).lazy
|
874
|
-
# df.
|
881
|
+
# df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
875
882
|
# # =>
|
876
883
|
# # shape: (3, 2)
|
877
884
|
# # ┌─────┬─────┐
|
@@ -883,19 +890,21 @@ module Polars
|
|
883
890
|
# # │ b ┆ 11 │
|
884
891
|
# # │ c ┆ 6 │
|
885
892
|
# # └─────┴─────┘
|
886
|
-
def
|
893
|
+
def group_by(by, maintain_order: false)
|
887
894
|
rbexprs_by = Utils.selection_to_rbexpr_list(by)
|
888
|
-
lgb = _ldf.
|
889
|
-
LazyGroupBy.new(lgb
|
895
|
+
lgb = _ldf.group_by(rbexprs_by, maintain_order)
|
896
|
+
LazyGroupBy.new(lgb)
|
890
897
|
end
|
898
|
+
alias_method :groupby, :group_by
|
899
|
+
alias_method :group, :group_by
|
891
900
|
|
892
901
|
# Create rolling groups based on a time column.
|
893
902
|
#
|
894
903
|
# Also works for index values of type `:i32` or `:i64`.
|
895
904
|
#
|
896
|
-
# Different from a `
|
905
|
+
# Different from a `group_by_dynamic` the windows are now determined by the
|
897
906
|
# individual values and are not of constant intervals. For constant intervals
|
898
|
-
# use *
|
907
|
+
# use *group_by_dynamic*.
|
899
908
|
#
|
900
909
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
901
910
|
# by using the following string language:
|
@@ -915,7 +924,7 @@ module Polars
|
|
915
924
|
# Or combine them:
|
916
925
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
917
926
|
#
|
918
|
-
# In case of a
|
927
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
919
928
|
#
|
920
929
|
# - "1i" # length 1
|
921
930
|
# - "10i" # length 10
|
@@ -926,7 +935,7 @@ module Polars
|
|
926
935
|
# This column must be sorted in ascending order. If not the output will not
|
927
936
|
# make sense.
|
928
937
|
#
|
929
|
-
# In case of a rolling
|
938
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
930
939
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
931
940
|
# performance matters use an `:i64` column.
|
932
941
|
# @param period [Object]
|
@@ -958,7 +967,7 @@ module Polars
|
|
958
967
|
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
959
968
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
960
969
|
# )
|
961
|
-
# df.
|
970
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
962
971
|
# [
|
963
972
|
# Polars.sum("a").alias("sum_a"),
|
964
973
|
# Polars.min("a").alias("min_a"),
|
@@ -979,7 +988,7 @@ module Polars
|
|
979
988
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
980
989
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
981
990
|
# # └─────────────────────┴───────┴───────┴───────┘
|
982
|
-
def
|
991
|
+
def group_by_rolling(
|
983
992
|
index_column:,
|
984
993
|
period:,
|
985
994
|
offset: nil,
|
@@ -987,7 +996,7 @@ module Polars
|
|
987
996
|
by: nil,
|
988
997
|
check_sorted: true
|
989
998
|
)
|
990
|
-
index_column = Utils.
|
999
|
+
index_column = Utils.parse_as_expression(index_column)
|
991
1000
|
if offset.nil?
|
992
1001
|
offset = "-#{period}"
|
993
1002
|
end
|
@@ -996,16 +1005,17 @@ module Polars
|
|
996
1005
|
period = Utils._timedelta_to_pl_duration(period)
|
997
1006
|
offset = Utils._timedelta_to_pl_duration(offset)
|
998
1007
|
|
999
|
-
lgb = _ldf.
|
1000
|
-
index_column
|
1008
|
+
lgb = _ldf.group_by_rolling(
|
1009
|
+
index_column, period, offset, closed, rbexprs_by, check_sorted
|
1001
1010
|
)
|
1002
|
-
LazyGroupBy.new(lgb
|
1011
|
+
LazyGroupBy.new(lgb)
|
1003
1012
|
end
|
1013
|
+
alias_method :groupby_rolling, :group_by_rolling
|
1004
1014
|
|
1005
1015
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1006
1016
|
#
|
1007
1017
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
1008
|
-
# normal
|
1018
|
+
# normal group by is that a row can be a member of multiple groups. The time/index
|
1009
1019
|
# window could be seen as a rolling window, with a window size determined by
|
1010
1020
|
# dates/times/values instead of slots in the DataFrame.
|
1011
1021
|
#
|
@@ -1033,37 +1043,43 @@ module Polars
|
|
1033
1043
|
# Or combine them:
|
1034
1044
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1035
1045
|
#
|
1036
|
-
# In case of a
|
1046
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1037
1047
|
#
|
1038
1048
|
# - "1i" # length 1
|
1039
1049
|
# - "10i" # length 10
|
1040
1050
|
#
|
1041
|
-
# @param index_column
|
1051
|
+
# @param index_column [Object]
|
1042
1052
|
# Column used to group based on the time window.
|
1043
1053
|
# Often of type Date/Datetime
|
1044
1054
|
# This column must be sorted in ascending order. If not the output will not
|
1045
1055
|
# make sense.
|
1046
1056
|
#
|
1047
|
-
# In case of a dynamic
|
1057
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1048
1058
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1049
1059
|
# performance matters use an `:i64` column.
|
1050
|
-
# @param every
|
1060
|
+
# @param every [Object]
|
1051
1061
|
# Interval of the window.
|
1052
|
-
# @param period
|
1062
|
+
# @param period [Object]
|
1053
1063
|
# Length of the window; if nil, it is equal to 'every'.
|
1054
|
-
# @param offset
|
1064
|
+
# @param offset [Object]
|
1055
1065
|
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1056
1066
|
# `every`.
|
1057
|
-
# @param truncate
|
1067
|
+
# @param truncate [Boolean]
|
1058
1068
|
# Truncate the time value to the window lower bound.
|
1059
|
-
# @param include_boundaries
|
1069
|
+
# @param include_boundaries [Boolean]
|
1060
1070
|
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1061
1071
|
# "_upper_bound" columns. This will impact performance because it's harder to
|
1062
1072
|
# parallelize
|
1063
1073
|
# @param closed ["right", "left", "both", "none"]
|
1064
1074
|
# Define whether the temporal window interval is closed or not.
|
1065
|
-
# @param by
|
1075
|
+
# @param by [Object]
|
1066
1076
|
# Also group by this column/these columns
|
1077
|
+
# @param check_sorted [Boolean]
|
1078
|
+
# When the `by` argument is given, polars can not check sortedness
|
1079
|
+
# by the metadata and has to do a full scan on the index column to
|
1080
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1081
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1082
|
+
# Doing so incorrectly will lead to incorrect output.
|
1067
1083
|
#
|
1068
1084
|
# @return [LazyGroupBy]
|
1069
1085
|
#
|
@@ -1095,7 +1111,7 @@ module Polars
|
|
1095
1111
|
# # └─────────────────────┴─────┘
|
1096
1112
|
#
|
1097
1113
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1098
|
-
# df.
|
1114
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
1099
1115
|
# [
|
1100
1116
|
# Polars.col("time").min.alias("time_min"),
|
1101
1117
|
# Polars.col("time").max.alias("time_max")
|
@@ -1115,7 +1131,7 @@ module Polars
|
|
1115
1131
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1116
1132
|
#
|
1117
1133
|
# @example The window boundaries can also be added to the aggregation result.
|
1118
|
-
# df.
|
1134
|
+
# df.group_by_dynamic(
|
1119
1135
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1120
1136
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
1121
1137
|
# # =>
|
@@ -1132,7 +1148,7 @@ module Polars
|
|
1132
1148
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1133
1149
|
#
|
1134
1150
|
# @example When closed="left", should not include right end of interval.
|
1135
|
-
# df.
|
1151
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
1136
1152
|
# [
|
1137
1153
|
# Polars.col("time").count.alias("time_count"),
|
1138
1154
|
# Polars.col("time").alias("time_agg_list")
|
@@ -1152,7 +1168,7 @@ module Polars
|
|
1152
1168
|
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
1153
1169
|
#
|
1154
1170
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1155
|
-
# df.
|
1171
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
1156
1172
|
# [Polars.col("time").count.alias("time_count")]
|
1157
1173
|
# )
|
1158
1174
|
# # =>
|
@@ -1169,7 +1185,7 @@ module Polars
|
|
1169
1185
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
1170
1186
|
# # └─────────────────────┴────────────┘
|
1171
1187
|
#
|
1172
|
-
# @example Dynamic
|
1188
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1173
1189
|
# df = Polars::DataFrame.new(
|
1174
1190
|
# {
|
1175
1191
|
# "time" => Polars.date_range(
|
@@ -1180,7 +1196,7 @@ module Polars
|
|
1180
1196
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1181
1197
|
# }
|
1182
1198
|
# )
|
1183
|
-
# df.
|
1199
|
+
# df.group_by_dynamic(
|
1184
1200
|
# "time",
|
1185
1201
|
# every: "1h",
|
1186
1202
|
# closed: "both",
|
@@ -1203,14 +1219,14 @@ module Polars
|
|
1203
1219
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
1204
1220
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1205
1221
|
#
|
1206
|
-
# @example Dynamic
|
1222
|
+
# @example Dynamic group by on an index column.
|
1207
1223
|
# df = Polars::DataFrame.new(
|
1208
1224
|
# {
|
1209
1225
|
# "idx" => Polars.arange(0, 6, eager: true),
|
1210
1226
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
1211
1227
|
# }
|
1212
1228
|
# )
|
1213
|
-
# df.
|
1229
|
+
# df.group_by_dynamic(
|
1214
1230
|
# "idx",
|
1215
1231
|
# every: "2i",
|
1216
1232
|
# period: "3i",
|
@@ -1228,17 +1244,23 @@ module Polars
|
|
1228
1244
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1229
1245
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
1230
1246
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
1231
|
-
def
|
1247
|
+
def group_by_dynamic(
|
1232
1248
|
index_column,
|
1233
1249
|
every:,
|
1234
1250
|
period: nil,
|
1235
1251
|
offset: nil,
|
1236
|
-
truncate:
|
1252
|
+
truncate: nil,
|
1237
1253
|
include_boundaries: false,
|
1238
1254
|
closed: "left",
|
1255
|
+
label: "left",
|
1239
1256
|
by: nil,
|
1240
|
-
start_by: "window"
|
1257
|
+
start_by: "window",
|
1258
|
+
check_sorted: true
|
1241
1259
|
)
|
1260
|
+
if !truncate.nil?
|
1261
|
+
label = truncate ? "left" : "datapoint"
|
1262
|
+
end
|
1263
|
+
|
1242
1264
|
index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
|
1243
1265
|
if offset.nil?
|
1244
1266
|
offset = period.nil? ? "-#{every}" : "0ns"
|
@@ -1253,19 +1275,21 @@ module Polars
|
|
1253
1275
|
every = Utils._timedelta_to_pl_duration(every)
|
1254
1276
|
|
1255
1277
|
rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
|
1256
|
-
lgb = _ldf.
|
1278
|
+
lgb = _ldf.group_by_dynamic(
|
1257
1279
|
index_column._rbexpr,
|
1258
1280
|
every,
|
1259
1281
|
period,
|
1260
1282
|
offset,
|
1261
|
-
|
1283
|
+
label,
|
1262
1284
|
include_boundaries,
|
1263
1285
|
closed,
|
1264
1286
|
rbexprs_by,
|
1265
|
-
start_by
|
1287
|
+
start_by,
|
1288
|
+
check_sorted
|
1266
1289
|
)
|
1267
|
-
LazyGroupBy.new(lgb
|
1290
|
+
LazyGroupBy.new(lgb)
|
1268
1291
|
end
|
1292
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
1269
1293
|
|
1270
1294
|
# Perform an asof join.
|
1271
1295
|
#
|
@@ -1725,8 +1749,10 @@ module Polars
|
|
1725
1749
|
|
1726
1750
|
# Shift the values by a given period.
|
1727
1751
|
#
|
1728
|
-
# @param
|
1752
|
+
# @param n [Integer]
|
1729
1753
|
# Number of places to shift (may be negative).
|
1754
|
+
# @param fill_value [Object]
|
1755
|
+
# Fill the resulting null values with this value.
|
1730
1756
|
#
|
1731
1757
|
# @return [LazyFrame]
|
1732
1758
|
#
|
@@ -1763,8 +1789,12 @@ module Polars
|
|
1763
1789
|
# # │ 5 ┆ 6 │
|
1764
1790
|
# # │ null ┆ null │
|
1765
1791
|
# # └──────┴──────┘
|
1766
|
-
def shift(
|
1767
|
-
|
1792
|
+
def shift(n, fill_value: nil)
|
1793
|
+
if !fill_value.nil?
|
1794
|
+
fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
|
1795
|
+
end
|
1796
|
+
n = Utils.parse_as_expression(n)
|
1797
|
+
_from_rbldf(_ldf.shift(n, fill_value))
|
1768
1798
|
end
|
1769
1799
|
|
1770
1800
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -1810,10 +1840,7 @@ module Polars
|
|
1810
1840
|
# # │ 0 ┆ 0 │
|
1811
1841
|
# # └─────┴─────┘
|
1812
1842
|
def shift_and_fill(periods, fill_value)
|
1813
|
-
|
1814
|
-
fill_value = Polars.lit(fill_value)
|
1815
|
-
end
|
1816
|
-
_from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
|
1843
|
+
shift(periods, fill_value: fill_value)
|
1817
1844
|
end
|
1818
1845
|
|
1819
1846
|
# Get a slice of this DataFrame.
|
@@ -2371,16 +2398,16 @@ module Polars
|
|
2371
2398
|
# df.interpolate.collect
|
2372
2399
|
# # =>
|
2373
2400
|
# # shape: (4, 3)
|
2374
|
-
# #
|
2375
|
-
# # │ foo
|
2376
|
-
# # │ ---
|
2377
|
-
# # │
|
2378
|
-
# #
|
2379
|
-
# # │ 1
|
2380
|
-
# # │ 5
|
2381
|
-
# # │ 9
|
2382
|
-
# # │ 10
|
2383
|
-
# #
|
2401
|
+
# # ┌──────┬──────┬──────────┐
|
2402
|
+
# # │ foo ┆ bar ┆ baz │
|
2403
|
+
# # │ --- ┆ --- ┆ --- │
|
2404
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
2405
|
+
# # ╞══════╪══════╪══════════╡
|
2406
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
2407
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
2408
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
2409
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
2410
|
+
# # └──────┴──────┴──────────┘
|
2384
2411
|
def interpolate
|
2385
2412
|
select(Utils.col("*").interpolate)
|
2386
2413
|
end
|
@@ -43,7 +43,7 @@ module Polars
|
|
43
43
|
# # ┌─────┬─────┬────────────┐
|
44
44
|
# # │ a ┆ b ┆ rank │
|
45
45
|
# # │ --- ┆ --- ┆ --- │
|
46
|
-
# # │ i64 ┆ i64 ┆ list[
|
46
|
+
# # │ i64 ┆ i64 ┆ list[f64] │
|
47
47
|
# # ╞═════╪═════╪════════════╡
|
48
48
|
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
|
49
49
|
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
|
@@ -158,7 +158,7 @@ module Polars
|
|
158
158
|
col(column.to_s).sum
|
159
159
|
elsif column.is_a?(::Array)
|
160
160
|
exprs = Utils.selection_to_rbexpr_list(column)
|
161
|
-
Utils.wrap_expr(
|
161
|
+
Utils.wrap_expr(_sum_horizontal(exprs))
|
162
162
|
else
|
163
163
|
fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
|
164
164
|
end
|
@@ -625,16 +625,16 @@ module Polars
|
|
625
625
|
# This can be used in a `select`, `with_column`, etc. Be sure that the resulting
|
626
626
|
# range size is equal to the length of the DataFrame you are collecting.
|
627
627
|
#
|
628
|
-
# @param
|
628
|
+
# @param start [Integer, Expr, Series]
|
629
629
|
# Lower bound of range.
|
630
|
-
# @param
|
630
|
+
# @param stop [Integer, Expr, Series]
|
631
631
|
# Upper bound of range.
|
632
632
|
# @param step [Integer]
|
633
633
|
# Step size of the range.
|
634
634
|
# @param eager [Boolean]
|
635
635
|
# If eager evaluation is `true`, a Series is returned instead of an Expr.
|
636
636
|
# @param dtype [Symbol]
|
637
|
-
# Apply an explicit integer dtype to the resulting expression (default is
|
637
|
+
# Apply an explicit integer dtype to the resulting expression (default is `Int64`).
|
638
638
|
#
|
639
639
|
# @return [Expr, Series]
|
640
640
|
#
|
@@ -648,35 +648,20 @@ module Polars
|
|
648
648
|
# # 1
|
649
649
|
# # 2
|
650
650
|
# # ]
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
# # ┌───────────┐
|
658
|
-
# # │ arange │
|
659
|
-
# # │ --- │
|
660
|
-
# # │ list[i64] │
|
661
|
-
# # ╞═══════════╡
|
662
|
-
# # │ [1, 2] │
|
663
|
-
# # │ [2, 3] │
|
664
|
-
# # └───────────┘
|
665
|
-
def arange(low, high, step: 1, eager: false, dtype: nil)
|
666
|
-
low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
|
667
|
-
high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
|
668
|
-
range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))
|
669
|
-
|
670
|
-
if !dtype.nil? && !["i64", Int64].include?(dtype)
|
671
|
-
range_expr = range_expr.cast(dtype)
|
672
|
-
end
|
651
|
+
def int_range(start, stop, step: 1, eager: false, dtype: nil)
|
652
|
+
start = Utils.parse_as_expression(start)
|
653
|
+
stop = Utils.parse_as_expression(stop)
|
654
|
+
dtype ||= Int64
|
655
|
+
dtype = dtype.to_s if dtype.is_a?(Symbol)
|
656
|
+
result = Utils.wrap_expr(RbExpr.int_range(start, stop, step, dtype)).alias("arange")
|
673
657
|
|
674
|
-
if
|
675
|
-
|
676
|
-
else
|
677
|
-
DataFrame.new.select(range_expr.alias("arange")).to_series
|
658
|
+
if eager
|
659
|
+
return select(result).to_series
|
678
660
|
end
|
661
|
+
|
662
|
+
result
|
679
663
|
end
|
664
|
+
alias_method :arange, :int_range
|
680
665
|
|
681
666
|
# Find the indexes that would sort the columns.
|
682
667
|
#
|
@@ -735,15 +720,22 @@ module Polars
|
|
735
720
|
# # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
|
736
721
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
|
737
722
|
def duration(
|
723
|
+
weeks: nil,
|
738
724
|
days: nil,
|
725
|
+
hours: nil,
|
726
|
+
minutes: nil,
|
739
727
|
seconds: nil,
|
740
|
-
nanoseconds: nil,
|
741
|
-
microseconds: nil,
|
742
728
|
milliseconds: nil,
|
743
|
-
|
744
|
-
|
745
|
-
|
729
|
+
microseconds: nil,
|
730
|
+
nanoseconds: nil,
|
731
|
+
time_unit: "us"
|
746
732
|
)
|
733
|
+
if !weeks.nil?
|
734
|
+
weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
|
735
|
+
end
|
736
|
+
if !days.nil?
|
737
|
+
days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
|
738
|
+
end
|
747
739
|
if !hours.nil?
|
748
740
|
hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
|
749
741
|
end
|
@@ -762,23 +754,18 @@ module Polars
|
|
762
754
|
if !nanoseconds.nil?
|
763
755
|
nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
|
764
756
|
end
|
765
|
-
if !days.nil?
|
766
|
-
days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
|
767
|
-
end
|
768
|
-
if !weeks.nil?
|
769
|
-
weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
|
770
|
-
end
|
771
757
|
|
772
758
|
Utils.wrap_expr(
|
773
759
|
_rb_duration(
|
760
|
+
weeks,
|
774
761
|
days,
|
762
|
+
hours,
|
763
|
+
minutes,
|
775
764
|
seconds,
|
776
|
-
nanoseconds,
|
777
|
-
microseconds,
|
778
765
|
milliseconds,
|
779
|
-
|
780
|
-
|
781
|
-
|
766
|
+
microseconds,
|
767
|
+
nanoseconds,
|
768
|
+
time_unit
|
782
769
|
)
|
783
770
|
)
|
784
771
|
end
|
@@ -944,7 +931,8 @@ module Polars
|
|
944
931
|
simplify_expression,
|
945
932
|
slice_pushdown,
|
946
933
|
common_subplan_elimination,
|
947
|
-
allow_streaming
|
934
|
+
allow_streaming,
|
935
|
+
false
|
948
936
|
)
|
949
937
|
prepared << ldf
|
950
938
|
end
|
data/lib/polars/lazy_group_by.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
module Polars
|
2
|
-
# Created by `df.lazy.
|
2
|
+
# Created by `df.lazy.group_by("foo")`.
|
3
3
|
class LazyGroupBy
|
4
4
|
# @private
|
5
|
-
def initialize(lgb
|
5
|
+
def initialize(lgb)
|
6
6
|
@lgb = lgb
|
7
|
-
@lazyframe_class = lazyframe_class
|
8
7
|
end
|
9
8
|
|
10
9
|
# Describe the aggregations that need to be done on a group.
|
@@ -12,7 +11,7 @@ module Polars
|
|
12
11
|
# @return [LazyFrame]
|
13
12
|
def agg(aggs)
|
14
13
|
rbexprs = Utils.selection_to_rbexpr_list(aggs)
|
15
|
-
|
14
|
+
Utils.wrap_ldf(@lgb.agg(rbexprs))
|
16
15
|
end
|
17
16
|
|
18
17
|
# Get the first `n` rows of each group.
|
@@ -29,7 +28,7 @@ module Polars
|
|
29
28
|
# "nrs" => [1, 2, 3, 4, 5, 6]
|
30
29
|
# }
|
31
30
|
# )
|
32
|
-
# df.
|
31
|
+
# df.group_by("letters").head(2).sort("letters")
|
33
32
|
# # =>
|
34
33
|
# # shape: (5, 2)
|
35
34
|
# # ┌─────────┬─────┐
|
@@ -44,7 +43,7 @@ module Polars
|
|
44
43
|
# # │ c ┆ 2 │
|
45
44
|
# # └─────────┴─────┘
|
46
45
|
def head(n = 5)
|
47
|
-
|
46
|
+
Utils.wrap_ldf(@lgb.head(n))
|
48
47
|
end
|
49
48
|
|
50
49
|
# Get the last `n` rows of each group.
|
@@ -61,7 +60,7 @@ module Polars
|
|
61
60
|
# "nrs" => [1, 2, 3, 4, 5, 6]
|
62
61
|
# }
|
63
62
|
# )
|
64
|
-
# df.
|
63
|
+
# df.group_by("letters").tail(2).sort("letters")
|
65
64
|
# # =>
|
66
65
|
# # shape: (5, 2)
|
67
66
|
# # ┌─────────┬─────┐
|
@@ -76,7 +75,7 @@ module Polars
|
|
76
75
|
# # │ c ┆ 4 │
|
77
76
|
# # └─────────┴─────┘
|
78
77
|
def tail(n = 5)
|
79
|
-
|
78
|
+
Utils.wrap_ldf(@lgb.tail(n))
|
80
79
|
end
|
81
80
|
|
82
81
|
# def apply
|