polars-df 0.5.0-x86_64-darwin → 0.7.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +4572 -5214
- data/README.md +11 -9
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +7 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -4,6 +4,22 @@ module Polars
|
|
4
4
|
# @private
|
5
5
|
attr_accessor :_ldf
|
6
6
|
|
7
|
+
# Create a new LazyFrame.
|
8
|
+
def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
9
|
+
self._ldf = (
|
10
|
+
DataFrame.new(
|
11
|
+
data,
|
12
|
+
schema: schema,
|
13
|
+
schema_overrides: schema_overrides,
|
14
|
+
orient: orient,
|
15
|
+
infer_schema_length: infer_schema_length,
|
16
|
+
nan_to_null: nan_to_null
|
17
|
+
)
|
18
|
+
.lazy
|
19
|
+
._ldf
|
20
|
+
)
|
21
|
+
end
|
22
|
+
|
7
23
|
# @private
|
8
24
|
def self._from_rbldf(rb_ldf)
|
9
25
|
ldf = LazyFrame.allocate
|
@@ -81,7 +97,8 @@ module Polars
|
|
81
97
|
row_count_offset: 0,
|
82
98
|
storage_options: nil,
|
83
99
|
low_memory: false,
|
84
|
-
use_statistics: true
|
100
|
+
use_statistics: true,
|
101
|
+
hive_partitioning: true
|
85
102
|
)
|
86
103
|
_from_rbldf(
|
87
104
|
RbLazyFrame.new_from_parquet(
|
@@ -92,7 +109,8 @@ module Polars
|
|
92
109
|
rechunk,
|
93
110
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
94
111
|
low_memory,
|
95
|
-
use_statistics
|
112
|
+
use_statistics,
|
113
|
+
hive_partitioning
|
96
114
|
)
|
97
115
|
)
|
98
116
|
end
|
@@ -334,6 +352,7 @@ module Polars
|
|
334
352
|
slice_pushdown,
|
335
353
|
common_subplan_elimination,
|
336
354
|
allow_streaming,
|
355
|
+
false
|
337
356
|
)
|
338
357
|
|
339
358
|
ldf.describe_optimized_plan
|
@@ -379,16 +398,16 @@ module Polars
|
|
379
398
|
# # │ 2 ┆ 7.0 ┆ b │
|
380
399
|
# # │ 1 ┆ 6.0 ┆ a │
|
381
400
|
# # └─────┴─────┴─────┘
|
382
|
-
def sort(by, reverse: false, nulls_last: false)
|
401
|
+
def sort(by, reverse: false, nulls_last: false, maintain_order: false)
|
383
402
|
if by.is_a?(String)
|
384
|
-
_from_rbldf(_ldf.sort(by, reverse, nulls_last))
|
403
|
+
return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
|
385
404
|
end
|
386
405
|
if Utils.bool?(reverse)
|
387
406
|
reverse = [reverse]
|
388
407
|
end
|
389
408
|
|
390
409
|
by = Utils.selection_to_rbexpr_list(by)
|
391
|
-
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
|
410
|
+
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
|
392
411
|
end
|
393
412
|
|
394
413
|
# def profile
|
@@ -429,7 +448,7 @@ module Polars
|
|
429
448
|
# "c" => [6, 5, 4, 3, 2, 1]
|
430
449
|
# }
|
431
450
|
# ).lazy
|
432
|
-
# df.
|
451
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
|
433
452
|
# # =>
|
434
453
|
# # shape: (3, 3)
|
435
454
|
# # ┌─────┬─────┬─────┐
|
@@ -450,7 +469,8 @@ module Polars
|
|
450
469
|
no_optimization: false,
|
451
470
|
slice_pushdown: true,
|
452
471
|
common_subplan_elimination: true,
|
453
|
-
allow_streaming: false
|
472
|
+
allow_streaming: false,
|
473
|
+
_eager: false
|
454
474
|
)
|
455
475
|
if no_optimization
|
456
476
|
predicate_pushdown = false
|
@@ -470,7 +490,8 @@ module Polars
|
|
470
490
|
simplify_expression,
|
471
491
|
slice_pushdown,
|
472
492
|
common_subplan_elimination,
|
473
|
-
allow_streaming
|
493
|
+
allow_streaming,
|
494
|
+
_eager
|
474
495
|
)
|
475
496
|
Utils.wrap_df(ldf.collect)
|
476
497
|
end
|
@@ -552,7 +573,8 @@ module Polars
|
|
552
573
|
simplify_expression,
|
553
574
|
slice_pushdown,
|
554
575
|
false,
|
555
|
-
true
|
576
|
+
true,
|
577
|
+
false
|
556
578
|
)
|
557
579
|
lf.sink_parquet(
|
558
580
|
path,
|
@@ -607,7 +629,7 @@ module Polars
|
|
607
629
|
# "c" => [6, 5, 4, 3, 2, 1]
|
608
630
|
# }
|
609
631
|
# ).lazy
|
610
|
-
# df.
|
632
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
611
633
|
# # =>
|
612
634
|
# # shape: (2, 3)
|
613
635
|
# # ┌─────┬─────┬─────┐
|
@@ -644,7 +666,8 @@ module Polars
|
|
644
666
|
simplify_expression,
|
645
667
|
slice_pushdown,
|
646
668
|
common_subplan_elimination,
|
647
|
-
allow_streaming
|
669
|
+
allow_streaming,
|
670
|
+
false
|
648
671
|
)
|
649
672
|
Utils.wrap_df(ldf.fetch(n_rows))
|
650
673
|
end
|
@@ -837,13 +860,13 @@ module Polars
|
|
837
860
|
_from_rbldf(_ldf.select(exprs))
|
838
861
|
end
|
839
862
|
|
840
|
-
# Start a
|
863
|
+
# Start a group by operation.
|
841
864
|
#
|
842
865
|
# @param by [Object]
|
843
866
|
# Column(s) to group by.
|
844
867
|
# @param maintain_order [Boolean]
|
845
868
|
# Make sure that the order of the groups remains consistent. This is more
|
846
|
-
# expensive than a default
|
869
|
+
# expensive than a default group by.
|
847
870
|
#
|
848
871
|
# @return [LazyGroupBy]
|
849
872
|
#
|
@@ -855,7 +878,7 @@ module Polars
|
|
855
878
|
# "c" => [6, 5, 4, 3, 2, 1]
|
856
879
|
# }
|
857
880
|
# ).lazy
|
858
|
-
# df.
|
881
|
+
# df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
859
882
|
# # =>
|
860
883
|
# # shape: (3, 2)
|
861
884
|
# # ┌─────┬─────┐
|
@@ -867,19 +890,21 @@ module Polars
|
|
867
890
|
# # │ b ┆ 11 │
|
868
891
|
# # │ c ┆ 6 │
|
869
892
|
# # └─────┴─────┘
|
870
|
-
def
|
893
|
+
def group_by(by, maintain_order: false)
|
871
894
|
rbexprs_by = Utils.selection_to_rbexpr_list(by)
|
872
|
-
lgb = _ldf.
|
873
|
-
LazyGroupBy.new(lgb
|
895
|
+
lgb = _ldf.group_by(rbexprs_by, maintain_order)
|
896
|
+
LazyGroupBy.new(lgb)
|
874
897
|
end
|
898
|
+
alias_method :groupby, :group_by
|
899
|
+
alias_method :group, :group_by
|
875
900
|
|
876
901
|
# Create rolling groups based on a time column.
|
877
902
|
#
|
878
903
|
# Also works for index values of type `:i32` or `:i64`.
|
879
904
|
#
|
880
|
-
# Different from a `
|
905
|
+
# Different from a `dynamic_group_by` the windows are now determined by the
|
881
906
|
# individual values and are not of constant intervals. For constant intervals
|
882
|
-
# use *
|
907
|
+
# use *group_by_dynamic*.
|
883
908
|
#
|
884
909
|
# The `period` and `offset` arguments are created either from a timedelta, or
|
885
910
|
# by using the following string language:
|
@@ -899,7 +924,7 @@ module Polars
|
|
899
924
|
# Or combine them:
|
900
925
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
901
926
|
#
|
902
|
-
# In case of a
|
927
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
903
928
|
#
|
904
929
|
# - "1i" # length 1
|
905
930
|
# - "10i" # length 10
|
@@ -910,7 +935,7 @@ module Polars
|
|
910
935
|
# This column must be sorted in ascending order. If not the output will not
|
911
936
|
# make sense.
|
912
937
|
#
|
913
|
-
# In case of a rolling
|
938
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
914
939
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
915
940
|
# performance matters use an `:i64` column.
|
916
941
|
# @param period [Object]
|
@@ -921,6 +946,12 @@ module Polars
|
|
921
946
|
# Define whether the temporal window interval is closed or not.
|
922
947
|
# @param by [Object]
|
923
948
|
# Also group by this column/these columns.
|
949
|
+
# @param check_sorted [Boolean]
|
950
|
+
# When the `by` argument is given, polars can not check sortedness
|
951
|
+
# by the metadata and has to do a full scan on the index column to
|
952
|
+
# verify data is sorted. This is expensive. If you are sure the
|
953
|
+
# data within the by groups is sorted, you can set this to `false`.
|
954
|
+
# Doing so incorrectly will lead to incorrect output.
|
924
955
|
#
|
925
956
|
# @return [LazyGroupBy]
|
926
957
|
#
|
@@ -933,16 +964,16 @@ module Polars
|
|
933
964
|
# "2020-01-03 19:45:32",
|
934
965
|
# "2020-01-08 23:16:43"
|
935
966
|
# ]
|
936
|
-
# df = Polars::
|
937
|
-
# Polars.col("dt").str.strptime(Polars::Datetime)
|
967
|
+
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
968
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
938
969
|
# )
|
939
|
-
# df.
|
970
|
+
# df.group_by_rolling(index_column: "dt", period: "2d").agg(
|
940
971
|
# [
|
941
972
|
# Polars.sum("a").alias("sum_a"),
|
942
973
|
# Polars.min("a").alias("min_a"),
|
943
974
|
# Polars.max("a").alias("max_a")
|
944
975
|
# ]
|
945
|
-
# )
|
976
|
+
# ).collect
|
946
977
|
# # =>
|
947
978
|
# # shape: (6, 4)
|
948
979
|
# # ┌─────────────────────┬───────┬───────┬───────┐
|
@@ -957,14 +988,15 @@ module Polars
|
|
957
988
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
958
989
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
959
990
|
# # └─────────────────────┴───────┴───────┴───────┘
|
960
|
-
def
|
991
|
+
def group_by_rolling(
|
961
992
|
index_column:,
|
962
993
|
period:,
|
963
994
|
offset: nil,
|
964
995
|
closed: "right",
|
965
|
-
by: nil
|
996
|
+
by: nil,
|
997
|
+
check_sorted: true
|
966
998
|
)
|
967
|
-
index_column = Utils.
|
999
|
+
index_column = Utils.parse_as_expression(index_column)
|
968
1000
|
if offset.nil?
|
969
1001
|
offset = "-#{period}"
|
970
1002
|
end
|
@@ -973,16 +1005,17 @@ module Polars
|
|
973
1005
|
period = Utils._timedelta_to_pl_duration(period)
|
974
1006
|
offset = Utils._timedelta_to_pl_duration(offset)
|
975
1007
|
|
976
|
-
lgb = _ldf.
|
977
|
-
index_column
|
1008
|
+
lgb = _ldf.group_by_rolling(
|
1009
|
+
index_column, period, offset, closed, rbexprs_by, check_sorted
|
978
1010
|
)
|
979
|
-
LazyGroupBy.new(lgb
|
1011
|
+
LazyGroupBy.new(lgb)
|
980
1012
|
end
|
1013
|
+
alias_method :groupby_rolling, :group_by_rolling
|
981
1014
|
|
982
1015
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
983
1016
|
#
|
984
1017
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
985
|
-
# normal
|
1018
|
+
# normal group by is that a row can be member of multiple groups. The time/index
|
986
1019
|
# window could be seen as a rolling window, with a window size determined by
|
987
1020
|
# dates/times/values instead of slots in the DataFrame.
|
988
1021
|
#
|
@@ -1010,37 +1043,43 @@ module Polars
|
|
1010
1043
|
# Or combine them:
|
1011
1044
|
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1012
1045
|
#
|
1013
|
-
# In case of a
|
1046
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1014
1047
|
#
|
1015
1048
|
# - "1i" # length 1
|
1016
1049
|
# - "10i" # length 10
|
1017
1050
|
#
|
1018
|
-
# @param index_column
|
1051
|
+
# @param index_column [Object]
|
1019
1052
|
# Column used to group based on the time window.
|
1020
1053
|
# Often of type Date/Datetime.
|
1021
1054
|
# This column must be sorted in ascending order. If not the output will not
|
1022
1055
|
# make sense.
|
1023
1056
|
#
|
1024
|
-
# In case of a dynamic
|
1057
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1025
1058
|
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1026
1059
|
# performance matters use an `:i64` column.
|
1027
|
-
# @param every
|
1060
|
+
# @param every [Object]
|
1028
1061
|
# Interval of the window.
|
1029
|
-
# @param period
|
1062
|
+
# @param period [Object]
|
1030
1063
|
# Length of the window. If nil, it is equal to `every`.
|
1031
|
-
# @param offset
|
1064
|
+
# @param offset [Object]
|
1032
1065
|
# Offset of the window. If nil and period is nil, it will be equal to negative
|
1033
1066
|
# `every`.
|
1034
|
-
# @param truncate
|
1067
|
+
# @param truncate [Boolean]
|
1035
1068
|
# Truncate the time value to the window lower bound.
|
1036
|
-
# @param include_boundaries
|
1069
|
+
# @param include_boundaries [Boolean]
|
1037
1070
|
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1038
1071
|
# "_upper_bound" columns. This will impact performance because it's harder to
|
1039
1072
|
# parallelize
|
1040
1073
|
# @param closed ["right", "left", "both", "none"]
|
1041
1074
|
# Define whether the temporal window interval is closed or not.
|
1042
|
-
# @param by
|
1075
|
+
# @param by [Object]
|
1043
1076
|
# Also group by this column/these columns
|
1077
|
+
# @param check_sorted [Boolean]
|
1078
|
+
# When the `by` argument is given, polars can not check sortedness
|
1079
|
+
# by the metadata and has to do a full scan on the index column to
|
1080
|
+
# verify data is sorted. This is expensive. If you are sure the
|
1081
|
+
# data within the by groups is sorted, you can set this to `false`.
|
1082
|
+
# Doing so incorrectly will lead to incorrect output.
|
1044
1083
|
#
|
1045
1084
|
# @return [LazyGroupBy]
|
1046
1085
|
#
|
@@ -1072,7 +1111,7 @@ module Polars
|
|
1072
1111
|
# # └─────────────────────┴─────┘
|
1073
1112
|
#
|
1074
1113
|
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1075
|
-
# df.
|
1114
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
1076
1115
|
# [
|
1077
1116
|
# Polars.col("time").min.alias("time_min"),
|
1078
1117
|
# Polars.col("time").max.alias("time_max")
|
@@ -1092,7 +1131,7 @@ module Polars
|
|
1092
1131
|
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1093
1132
|
#
|
1094
1133
|
# @example The window boundaries can also be added to the aggregation result.
|
1095
|
-
# df.
|
1134
|
+
# df.group_by_dynamic(
|
1096
1135
|
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1097
1136
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
1098
1137
|
# # =>
|
@@ -1109,27 +1148,27 @@ module Polars
|
|
1109
1148
|
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1110
1149
|
#
|
1111
1150
|
# @example When closed="left", should not include right end of interval.
|
1112
|
-
# df.
|
1151
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
1113
1152
|
# [
|
1114
1153
|
# Polars.col("time").count.alias("time_count"),
|
1115
|
-
# Polars.col("time").
|
1154
|
+
# Polars.col("time").alias("time_agg_list")
|
1116
1155
|
# ]
|
1117
1156
|
# )
|
1118
1157
|
# # =>
|
1119
1158
|
# # shape: (4, 3)
|
1120
|
-
# #
|
1121
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
1122
|
-
# # │ --- ┆ --- ┆ ---
|
1123
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
1124
|
-
# #
|
1125
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
|
1126
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
|
1127
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
|
1128
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
1129
|
-
# #
|
1159
|
+
# # ┌─────────────────────┬────────────┬───────────────────────────────────┐
|
1160
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1161
|
+
# # │ --- ┆ --- ┆ --- │
|
1162
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1163
|
+
# # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
|
1164
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
|
1165
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
|
1166
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
|
1167
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1168
|
+
# # └─────────────────────┴────────────┴───────────────────────────────────┘
|
1130
1169
|
#
|
1131
1170
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1132
|
-
# df.
|
1171
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
1133
1172
|
# [Polars.col("time").count.alias("time_count")]
|
1134
1173
|
# )
|
1135
1174
|
# # =>
|
@@ -1146,7 +1185,7 @@ module Polars
|
|
1146
1185
|
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
1147
1186
|
# # └─────────────────────┴────────────┘
|
1148
1187
|
#
|
1149
|
-
# @example Dynamic
|
1188
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1150
1189
|
# df = Polars::DataFrame.new(
|
1151
1190
|
# {
|
1152
1191
|
# "time" => Polars.date_range(
|
@@ -1157,7 +1196,7 @@ module Polars
|
|
1157
1196
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1158
1197
|
# }
|
1159
1198
|
# )
|
1160
|
-
# df.
|
1199
|
+
# df.group_by_dynamic(
|
1161
1200
|
# "time",
|
1162
1201
|
# every: "1h",
|
1163
1202
|
# closed: "both",
|
@@ -1180,20 +1219,20 @@ module Polars
|
|
1180
1219
|
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
1181
1220
|
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1182
1221
|
#
|
1183
|
-
# @example Dynamic
|
1222
|
+
# @example Dynamic group by on an index column.
|
1184
1223
|
# df = Polars::DataFrame.new(
|
1185
1224
|
# {
|
1186
1225
|
# "idx" => Polars.arange(0, 6, eager: true),
|
1187
1226
|
# "A" => ["A", "A", "B", "B", "B", "C"]
|
1188
1227
|
# }
|
1189
1228
|
# )
|
1190
|
-
# df.
|
1229
|
+
# df.group_by_dynamic(
|
1191
1230
|
# "idx",
|
1192
1231
|
# every: "2i",
|
1193
1232
|
# period: "3i",
|
1194
1233
|
# include_boundaries: true,
|
1195
1234
|
# closed: "right"
|
1196
|
-
# ).agg(Polars.col("A").
|
1235
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
1197
1236
|
# # =>
|
1198
1237
|
# # shape: (3, 4)
|
1199
1238
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
@@ -1205,23 +1244,26 @@ module Polars
|
|
1205
1244
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1206
1245
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
1207
1246
|
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
1208
|
-
def
|
1247
|
+
def group_by_dynamic(
|
1209
1248
|
index_column,
|
1210
1249
|
every:,
|
1211
1250
|
period: nil,
|
1212
1251
|
offset: nil,
|
1213
|
-
truncate:
|
1252
|
+
truncate: nil,
|
1214
1253
|
include_boundaries: false,
|
1215
1254
|
closed: "left",
|
1255
|
+
label: "left",
|
1216
1256
|
by: nil,
|
1217
|
-
start_by: "window"
|
1257
|
+
start_by: "window",
|
1258
|
+
check_sorted: true
|
1218
1259
|
)
|
1260
|
+
if !truncate.nil?
|
1261
|
+
label = truncate ? "left" : "datapoint"
|
1262
|
+
end
|
1263
|
+
|
1264
|
+
index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
|
1219
1265
|
if offset.nil?
|
1220
|
-
|
1221
|
-
offset = "-#{every}"
|
1222
|
-
else
|
1223
|
-
offset = "0ns"
|
1224
|
-
end
|
1266
|
+
offset = period.nil? ? "-#{every}" : "0ns"
|
1225
1267
|
end
|
1226
1268
|
|
1227
1269
|
if period.nil?
|
@@ -1233,19 +1275,21 @@ module Polars
|
|
1233
1275
|
every = Utils._timedelta_to_pl_duration(every)
|
1234
1276
|
|
1235
1277
|
rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
|
1236
|
-
lgb = _ldf.
|
1237
|
-
index_column,
|
1278
|
+
lgb = _ldf.group_by_dynamic(
|
1279
|
+
index_column._rbexpr,
|
1238
1280
|
every,
|
1239
1281
|
period,
|
1240
1282
|
offset,
|
1241
|
-
|
1283
|
+
label,
|
1242
1284
|
include_boundaries,
|
1243
1285
|
closed,
|
1244
1286
|
rbexprs_by,
|
1245
|
-
start_by
|
1287
|
+
start_by,
|
1288
|
+
check_sorted
|
1246
1289
|
)
|
1247
|
-
LazyGroupBy.new(lgb
|
1290
|
+
LazyGroupBy.new(lgb)
|
1248
1291
|
end
|
1292
|
+
alias_method :groupby_dynamic, :group_by_dynamic
|
1249
1293
|
|
1250
1294
|
# Perform an asof join.
|
1251
1295
|
#
|
@@ -1351,7 +1395,7 @@ module Polars
|
|
1351
1395
|
if by.is_a?(String)
|
1352
1396
|
by_left_ = [by]
|
1353
1397
|
by_right_ = [by]
|
1354
|
-
elsif by.is_a?(Array)
|
1398
|
+
elsif by.is_a?(::Array)
|
1355
1399
|
by_left_ = by
|
1356
1400
|
by_right_ = by
|
1357
1401
|
end
|
@@ -1619,7 +1663,7 @@ module Polars
|
|
1619
1663
|
# # │ null │
|
1620
1664
|
# # └──────┘
|
1621
1665
|
def with_context(other)
|
1622
|
-
if !other.is_a?(Array)
|
1666
|
+
if !other.is_a?(::Array)
|
1623
1667
|
other = [other]
|
1624
1668
|
end
|
1625
1669
|
|
@@ -1705,8 +1749,10 @@ module Polars
|
|
1705
1749
|
|
1706
1750
|
# Shift the values by a given period.
|
1707
1751
|
#
|
1708
|
-
# @param
|
1752
|
+
# @param n [Integer]
|
1709
1753
|
# Number of places to shift (may be negative).
|
1754
|
+
# @param fill_value [Object]
|
1755
|
+
# Fill the resulting null values with this value.
|
1710
1756
|
#
|
1711
1757
|
# @return [LazyFrame]
|
1712
1758
|
#
|
@@ -1743,8 +1789,12 @@ module Polars
|
|
1743
1789
|
# # │ 5 ┆ 6 │
|
1744
1790
|
# # │ null ┆ null │
|
1745
1791
|
# # └──────┴──────┘
|
1746
|
-
def shift(
|
1747
|
-
|
1792
|
+
def shift(n, fill_value: nil)
|
1793
|
+
if !fill_value.nil?
|
1794
|
+
fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
|
1795
|
+
end
|
1796
|
+
n = Utils.parse_as_expression(n)
|
1797
|
+
_from_rbldf(_ldf.shift(n, fill_value))
|
1748
1798
|
end
|
1749
1799
|
|
1750
1800
|
# Shift the values by a given period and fill the resulting null values.
|
@@ -1790,10 +1840,7 @@ module Polars
|
|
1790
1840
|
# # │ 0 ┆ 0 │
|
1791
1841
|
# # └─────┴─────┘
|
1792
1842
|
def shift_and_fill(periods, fill_value)
|
1793
|
-
|
1794
|
-
fill_value = Polars.lit(fill_value)
|
1795
|
-
end
|
1796
|
-
_from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
|
1843
|
+
shift(periods, fill_value: fill_value)
|
1797
1844
|
end
|
1798
1845
|
|
1799
1846
|
# Get a slice of this DataFrame.
|
@@ -2228,7 +2275,7 @@ module Polars
|
|
2228
2275
|
#
|
2229
2276
|
# @return [LazyFrame]
|
2230
2277
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
2231
|
-
if !subset.nil? && !subset.is_a?(Array)
|
2278
|
+
if !subset.nil? && !subset.is_a?(::Array)
|
2232
2279
|
subset = [subset]
|
2233
2280
|
end
|
2234
2281
|
_from_rbldf(_ldf.unique(maintain_order, subset, keep))
|
@@ -2261,7 +2308,7 @@ module Polars
|
|
2261
2308
|
# # │ 3 ┆ 8 ┆ c │
|
2262
2309
|
# # └─────┴─────┴─────┘
|
2263
2310
|
def drop_nulls(subset: nil)
|
2264
|
-
if !subset.nil? && !subset.is_a?(Array)
|
2311
|
+
if !subset.nil? && !subset.is_a?(::Array)
|
2265
2312
|
subset = [subset]
|
2266
2313
|
end
|
2267
2314
|
_from_rbldf(_ldf.drop_nulls(subset))
|
@@ -2351,16 +2398,16 @@ module Polars
|
|
2351
2398
|
# df.interpolate.collect
|
2352
2399
|
# # =>
|
2353
2400
|
# # shape: (4, 3)
|
2354
|
-
# #
|
2355
|
-
# # │ foo
|
2356
|
-
# # │ ---
|
2357
|
-
# # │
|
2358
|
-
# #
|
2359
|
-
# # │ 1
|
2360
|
-
# # │ 5
|
2361
|
-
# # │ 9
|
2362
|
-
# # │ 10
|
2363
|
-
# #
|
2401
|
+
# # ┌──────┬──────┬──────────┐
|
2402
|
+
# # │ foo ┆ bar ┆ baz │
|
2403
|
+
# # │ --- ┆ --- ┆ --- │
|
2404
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
2405
|
+
# # ╞══════╪══════╪══════════╡
|
2406
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
2407
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
2408
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
2409
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
2410
|
+
# # └──────┴──────┴──────────┘
|
2364
2411
|
def interpolate
|
2365
2412
|
select(Utils.col("*").interpolate)
|
2366
2413
|
end
|
@@ -2423,6 +2470,38 @@ module Polars
|
|
2423
2470
|
_from_rbldf(_ldf.unnest(names))
|
2424
2471
|
end
|
2425
2472
|
|
2473
|
+
# TODO
|
2474
|
+
# def merge_sorted
|
2475
|
+
# end
|
2476
|
+
|
2477
|
+
# Indicate that one or multiple columns are sorted.
|
2478
|
+
#
|
2479
|
+
# @param column [Object]
|
2480
|
+
# Column(s) that are sorted.
|
2481
|
+
# @param more_columns [Object]
|
2482
|
+
# Additional columns that are sorted, specified as positional arguments.
|
2483
|
+
# @param descending [Boolean]
|
2484
|
+
# Whether the columns are sorted in descending order.
|
2485
|
+
#
|
2486
|
+
# @return [LazyFrame]
|
2487
|
+
def set_sorted(
|
2488
|
+
column,
|
2489
|
+
*more_columns,
|
2490
|
+
descending: false
|
2491
|
+
)
|
2492
|
+
columns = Utils.selection_to_rbexpr_list(column)
|
2493
|
+
if more_columns.any?
|
2494
|
+
columns.concat(Utils.selection_to_rbexpr_list(more_columns))
|
2495
|
+
end
|
2496
|
+
with_columns(
|
2497
|
+
columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
|
2498
|
+
)
|
2499
|
+
end
|
2500
|
+
|
2501
|
+
# TODO
|
2502
|
+
# def update
|
2503
|
+
# end
|
2504
|
+
|
2426
2505
|
private
|
2427
2506
|
|
2428
2507
|
def initialize_copy(other)
|