polars-df 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/Cargo.lock +468 -538
  4. data/Cargo.toml +1 -0
  5. data/README.md +8 -7
  6. data/ext/polars/Cargo.toml +17 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +121 -93
  9. data/ext/polars/src/dataframe.rs +116 -71
  10. data/ext/polars/src/error.rs +0 -5
  11. data/ext/polars/src/expr/binary.rs +18 -6
  12. data/ext/polars/src/expr/datetime.rs +10 -12
  13. data/ext/polars/src/expr/general.rs +68 -284
  14. data/ext/polars/src/expr/list.rs +17 -9
  15. data/ext/polars/src/{expr.rs → expr/mod.rs} +4 -2
  16. data/ext/polars/src/expr/name.rs +44 -0
  17. data/ext/polars/src/expr/rolling.rs +196 -0
  18. data/ext/polars/src/expr/string.rs +85 -58
  19. data/ext/polars/src/file.rs +3 -3
  20. data/ext/polars/src/functions/aggregation.rs +35 -0
  21. data/ext/polars/src/functions/eager.rs +7 -31
  22. data/ext/polars/src/functions/io.rs +10 -10
  23. data/ext/polars/src/functions/lazy.rs +66 -41
  24. data/ext/polars/src/functions/meta.rs +30 -0
  25. data/ext/polars/src/functions/misc.rs +8 -0
  26. data/ext/polars/src/functions/mod.rs +5 -0
  27. data/ext/polars/src/functions/random.rs +6 -0
  28. data/ext/polars/src/functions/range.rs +46 -0
  29. data/ext/polars/src/functions/string_cache.rs +11 -0
  30. data/ext/polars/src/functions/whenthen.rs +7 -7
  31. data/ext/polars/src/lazyframe.rs +47 -42
  32. data/ext/polars/src/lib.rs +156 -72
  33. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  34. data/ext/polars/src/{apply → map}/mod.rs +3 -3
  35. data/ext/polars/src/{apply → map}/series.rs +12 -16
  36. data/ext/polars/src/object.rs +1 -1
  37. data/ext/polars/src/rb_modules.rs +22 -7
  38. data/ext/polars/src/series/construction.rs +4 -4
  39. data/ext/polars/src/series/export.rs +2 -2
  40. data/ext/polars/src/series/set_at_idx.rs +33 -17
  41. data/ext/polars/src/series.rs +7 -27
  42. data/ext/polars/src/sql.rs +46 -0
  43. data/lib/polars/config.rb +530 -0
  44. data/lib/polars/data_frame.rb +115 -82
  45. data/lib/polars/date_time_expr.rb +13 -18
  46. data/lib/polars/date_time_name_space.rb +5 -25
  47. data/lib/polars/dynamic_group_by.rb +2 -2
  48. data/lib/polars/expr.rb +177 -94
  49. data/lib/polars/functions.rb +29 -37
  50. data/lib/polars/group_by.rb +38 -55
  51. data/lib/polars/io.rb +37 -2
  52. data/lib/polars/lazy_frame.rb +93 -66
  53. data/lib/polars/lazy_functions.rb +36 -48
  54. data/lib/polars/lazy_group_by.rb +7 -8
  55. data/lib/polars/list_expr.rb +12 -8
  56. data/lib/polars/list_name_space.rb +2 -2
  57. data/lib/polars/name_expr.rb +198 -0
  58. data/lib/polars/rolling_group_by.rb +2 -2
  59. data/lib/polars/series.rb +26 -13
  60. data/lib/polars/sql_context.rb +194 -0
  61. data/lib/polars/string_expr.rb +114 -60
  62. data/lib/polars/string_name_space.rb +19 -4
  63. data/lib/polars/utils.rb +12 -0
  64. data/lib/polars/version.rb +1 -1
  65. data/lib/polars.rb +3 -0
  66. metadata +18 -7
  67. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -97,7 +97,8 @@ module Polars
97
97
  row_count_offset: 0,
98
98
  storage_options: nil,
99
99
  low_memory: false,
100
- use_statistics: true
100
+ use_statistics: true,
101
+ hive_partitioning: true
101
102
  )
102
103
  _from_rbldf(
103
104
  RbLazyFrame.new_from_parquet(
@@ -108,7 +109,8 @@ module Polars
108
109
  rechunk,
109
110
  Utils._prepare_row_count_args(row_count_name, row_count_offset),
110
111
  low_memory,
111
- use_statistics
112
+ use_statistics,
113
+ hive_partitioning
112
114
  )
113
115
  )
114
116
  end
@@ -350,6 +352,7 @@ module Polars
350
352
  slice_pushdown,
351
353
  common_subplan_elimination,
352
354
  allow_streaming,
355
+ false
353
356
  )
354
357
 
355
358
  ldf.describe_optimized_plan
@@ -445,7 +448,7 @@ module Polars
445
448
  # "c" => [6, 5, 4, 3, 2, 1]
446
449
  # }
447
450
  # ).lazy
448
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
451
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
449
452
  # # =>
450
453
  # # shape: (3, 3)
451
454
  # # ┌─────┬─────┬─────┐
@@ -466,7 +469,8 @@ module Polars
466
469
  no_optimization: false,
467
470
  slice_pushdown: true,
468
471
  common_subplan_elimination: true,
469
- allow_streaming: false
472
+ allow_streaming: false,
473
+ _eager: false
470
474
  )
471
475
  if no_optimization
472
476
  predicate_pushdown = false
@@ -486,7 +490,8 @@ module Polars
486
490
  simplify_expression,
487
491
  slice_pushdown,
488
492
  common_subplan_elimination,
489
- allow_streaming
493
+ allow_streaming,
494
+ _eager
490
495
  )
491
496
  Utils.wrap_df(ldf.collect)
492
497
  end
@@ -568,7 +573,8 @@ module Polars
568
573
  simplify_expression,
569
574
  slice_pushdown,
570
575
  false,
571
- true
576
+ true,
577
+ false
572
578
  )
573
579
  lf.sink_parquet(
574
580
  path,
@@ -623,7 +629,7 @@ module Polars
623
629
  # "c" => [6, 5, 4, 3, 2, 1]
624
630
  # }
625
631
  # ).lazy
626
- # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
632
+ # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
627
633
  # # =>
628
634
  # # shape: (2, 3)
629
635
  # # ┌─────┬─────┬─────┐
@@ -660,7 +666,8 @@ module Polars
660
666
  simplify_expression,
661
667
  slice_pushdown,
662
668
  common_subplan_elimination,
663
- allow_streaming
669
+ allow_streaming,
670
+ false
664
671
  )
665
672
  Utils.wrap_df(ldf.fetch(n_rows))
666
673
  end
@@ -853,13 +860,13 @@ module Polars
853
860
  _from_rbldf(_ldf.select(exprs))
854
861
  end
855
862
 
856
- # Start a groupby operation.
863
+ # Start a group by operation.
857
864
  #
858
865
  # @param by [Object]
859
866
  # Column(s) to group by.
860
867
  # @param maintain_order [Boolean]
861
868
  # Make sure that the order of the groups remains consistent. This is more
862
- # expensive than a default groupby.
869
+ # expensive than a default group by.
863
870
  #
864
871
  # @return [LazyGroupBy]
865
872
  #
@@ -871,7 +878,7 @@ module Polars
871
878
  # "c" => [6, 5, 4, 3, 2, 1]
872
879
  # }
873
880
  # ).lazy
874
- # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
881
+ # df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
875
882
  # # =>
876
883
  # # shape: (3, 2)
877
884
  # # ┌─────┬─────┐
@@ -883,19 +890,21 @@ module Polars
883
890
  # # │ b ┆ 11 │
884
891
  # # │ c ┆ 6 │
885
892
  # # └─────┴─────┘
886
- def groupby(by, maintain_order: false)
893
+ def group_by(by, maintain_order: false)
887
894
  rbexprs_by = Utils.selection_to_rbexpr_list(by)
888
- lgb = _ldf.groupby(rbexprs_by, maintain_order)
889
- LazyGroupBy.new(lgb, self.class)
895
+ lgb = _ldf.group_by(rbexprs_by, maintain_order)
896
+ LazyGroupBy.new(lgb)
890
897
  end
898
+ alias_method :groupby, :group_by
899
+ alias_method :group, :group_by
891
900
 
892
901
  # Create rolling groups based on a time column.
893
902
  #
894
903
  # Also works for index values of type `:i32` or `:i64`.
895
904
  #
896
- # Different from a `dynamic_groupby` the windows are now determined by the
905
+ # Different from a `dynamic_group_by` the windows are now determined by the
897
906
  # individual values and are not of constant intervals. For constant intervals
898
- # use *groupby_dynamic*.
907
+ # use *group_by_dynamic*.
899
908
  #
900
909
  # The `period` and `offset` arguments are created either from a timedelta, or
901
910
  # by using the following string language:
@@ -915,7 +924,7 @@ module Polars
915
924
  # Or combine them:
916
925
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
917
926
  #
918
- # In case of a groupby_rolling on an integer column, the windows are defined by:
927
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
919
928
  #
920
929
  # - "1i" # length 1
921
930
  # - "10i" # length 10
@@ -926,7 +935,7 @@ module Polars
926
935
  # This column must be sorted in ascending order. If not the output will not
927
936
  # make sense.
928
937
  #
929
- # In case of a rolling groupby on indices, dtype needs to be one of
938
+ # In case of a rolling group by on indices, dtype needs to be one of
930
939
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
931
940
  # performance matters use an `:i64` column.
932
941
  # @param period [Object]
@@ -958,7 +967,7 @@ module Polars
958
967
  # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
959
968
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
960
969
  # )
961
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
970
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
962
971
  # [
963
972
  # Polars.sum("a").alias("sum_a"),
964
973
  # Polars.min("a").alias("min_a"),
@@ -979,7 +988,7 @@ module Polars
979
988
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
980
989
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
981
990
  # # └─────────────────────┴───────┴───────┴───────┘
982
- def groupby_rolling(
991
+ def group_by_rolling(
983
992
  index_column:,
984
993
  period:,
985
994
  offset: nil,
@@ -987,7 +996,7 @@ module Polars
987
996
  by: nil,
988
997
  check_sorted: true
989
998
  )
990
- index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
999
+ index_column = Utils.parse_as_expression(index_column)
991
1000
  if offset.nil?
992
1001
  offset = "-#{period}"
993
1002
  end
@@ -996,16 +1005,17 @@ module Polars
996
1005
  period = Utils._timedelta_to_pl_duration(period)
997
1006
  offset = Utils._timedelta_to_pl_duration(offset)
998
1007
 
999
- lgb = _ldf.groupby_rolling(
1000
- index_column._rbexpr, period, offset, closed, rbexprs_by, check_sorted
1008
+ lgb = _ldf.group_by_rolling(
1009
+ index_column, period, offset, closed, rbexprs_by, check_sorted
1001
1010
  )
1002
- LazyGroupBy.new(lgb, self.class)
1011
+ LazyGroupBy.new(lgb)
1003
1012
  end
1013
+ alias_method :groupby_rolling, :group_by_rolling
1004
1014
 
1005
1015
  # Group based on a time value (or index value of type `:i32`, `:i64`).
1006
1016
  #
1007
1017
  # Time windows are calculated and rows are assigned to windows. Different from a
1008
- # normal groupby is that a row can be member of multiple groups. The time/index
1018
+ # normal group by is that a row can be member of multiple groups. The time/index
1009
1019
  # window could be seen as a rolling window, with a window size determined by
1010
1020
  # dates/times/values instead of slots in the DataFrame.
1011
1021
  #
@@ -1033,37 +1043,43 @@ module Polars
1033
1043
  # Or combine them:
1034
1044
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
1035
1045
  #
1036
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
1046
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
1037
1047
  #
1038
1048
  # - "1i" # length 1
1039
1049
  # - "10i" # length 10
1040
1050
  #
1041
- # @param index_column
1051
+ # @param index_column [Object]
1042
1052
  # Column used to group based on the time window.
1043
1053
  # Often of type Date/Datetime.
1044
1054
  # This column must be sorted in ascending order. If not the output will not
1045
1055
  # make sense.
1046
1056
  #
1047
- # In case of a dynamic groupby on indices, dtype needs to be one of
1057
+ # In case of a dynamic group by on indices, dtype needs to be one of
1048
1058
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1049
1059
  # performance matters use an `:i64` column.
1050
- # @param every
1060
+ # @param every [Object]
1051
1061
  # Interval of the window.
1052
- # @param period
1062
+ # @param period [Object]
1053
1063
  # Length of the window, if None it is equal to 'every'.
1054
- # @param offset
1064
+ # @param offset [Object]
1055
1065
  # Offset of the window if None and period is None it will be equal to negative
1056
1066
  # `every`.
1057
- # @param truncate
1067
+ # @param truncate [Boolean]
1058
1068
  # Truncate the time value to the window lower bound.
1059
- # @param include_boundaries
1069
+ # @param include_boundaries [Boolean]
1060
1070
  # Add the lower and upper bound of the window to the "_lower_bound" and
1061
1071
  # "_upper_bound" columns. This will impact performance because it's harder to
1062
1072
  # parallelize
1063
1073
  # @param closed ["right", "left", "both", "none"]
1064
1074
  # Define whether the temporal window interval is closed or not.
1065
- # @param by
1075
+ # @param by [Object]
1066
1076
  # Also group by this column/these columns
1077
+ # @param check_sorted [Boolean]
1078
+ # When the `by` argument is given, polars can not check sortedness
1079
+ # by the metadata and has to do a full scan on the index column to
1080
+ # verify data is sorted. This is expensive. If you are sure the
1081
+ # data within the by groups is sorted, you can set this to `false`.
1082
+ # Doing so incorrectly will lead to incorrect output.
1067
1083
  #
1068
1084
  # @return [DataFrame]
1069
1085
  #
@@ -1095,7 +1111,7 @@ module Polars
1095
1111
  # # └─────────────────────┴─────┘
1096
1112
  #
1097
1113
  # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
1098
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
1114
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
1099
1115
  # [
1100
1116
  # Polars.col("time").min.alias("time_min"),
1101
1117
  # Polars.col("time").max.alias("time_max")
@@ -1115,7 +1131,7 @@ module Polars
1115
1131
  # # └─────────────────────┴─────────────────────┴─────────────────────┘
1116
1132
  #
1117
1133
  # @example The window boundaries can also be added to the aggregation result.
1118
- # df.groupby_dynamic(
1134
+ # df.group_by_dynamic(
1119
1135
  # "time", every: "1h", include_boundaries: true, closed: "right"
1120
1136
  # ).agg([Polars.col("time").count.alias("time_count")])
1121
1137
  # # =>
@@ -1132,7 +1148,7 @@ module Polars
1132
1148
  # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1133
1149
  #
1134
1150
  # @example When closed="left", should not include right end of interval.
1135
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1151
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
1136
1152
  # [
1137
1153
  # Polars.col("time").count.alias("time_count"),
1138
1154
  # Polars.col("time").alias("time_agg_list")
@@ -1152,7 +1168,7 @@ module Polars
1152
1168
  # # └─────────────────────┴────────────┴───────────────────────────────────┘
1153
1169
  #
1154
1170
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1155
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
1171
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
1156
1172
  # [Polars.col("time").count.alias("time_count")]
1157
1173
  # )
1158
1174
  # # =>
@@ -1169,7 +1185,7 @@ module Polars
1169
1185
  # # │ 2021-12-16 03:00:00 ┆ 1 │
1170
1186
  # # └─────────────────────┴────────────┘
1171
1187
  #
1172
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
1188
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
1173
1189
  # df = Polars::DataFrame.new(
1174
1190
  # {
1175
1191
  # "time" => Polars.date_range(
@@ -1180,7 +1196,7 @@ module Polars
1180
1196
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1181
1197
  # }
1182
1198
  # )
1183
- # df.groupby_dynamic(
1199
+ # df.group_by_dynamic(
1184
1200
  # "time",
1185
1201
  # every: "1h",
1186
1202
  # closed: "both",
@@ -1203,14 +1219,14 @@ module Polars
1203
1219
  # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
1204
1220
  # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
1205
1221
  #
1206
- # @example Dynamic groupby on an index column.
1222
+ # @example Dynamic group by on an index column.
1207
1223
  # df = Polars::DataFrame.new(
1208
1224
  # {
1209
1225
  # "idx" => Polars.arange(0, 6, eager: true),
1210
1226
  # "A" => ["A", "A", "B", "B", "B", "C"]
1211
1227
  # }
1212
1228
  # )
1213
- # df.groupby_dynamic(
1229
+ # df.group_by_dynamic(
1214
1230
  # "idx",
1215
1231
  # every: "2i",
1216
1232
  # period: "3i",
@@ -1228,17 +1244,23 @@ module Polars
1228
1244
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
1229
1245
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
1230
1246
  # # └─────────────────┴─────────────────┴─────┴─────────────────┘
1231
- def groupby_dynamic(
1247
+ def group_by_dynamic(
1232
1248
  index_column,
1233
1249
  every:,
1234
1250
  period: nil,
1235
1251
  offset: nil,
1236
- truncate: true,
1252
+ truncate: nil,
1237
1253
  include_boundaries: false,
1238
1254
  closed: "left",
1255
+ label: "left",
1239
1256
  by: nil,
1240
- start_by: "window"
1257
+ start_by: "window",
1258
+ check_sorted: true
1241
1259
  )
1260
+ if !truncate.nil?
1261
+ label = truncate ? "left" : "datapoint"
1262
+ end
1263
+
1242
1264
  index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1243
1265
  if offset.nil?
1244
1266
  offset = period.nil? ? "-#{every}" : "0ns"
@@ -1253,19 +1275,21 @@ module Polars
1253
1275
  every = Utils._timedelta_to_pl_duration(every)
1254
1276
 
1255
1277
  rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1256
- lgb = _ldf.groupby_dynamic(
1278
+ lgb = _ldf.group_by_dynamic(
1257
1279
  index_column._rbexpr,
1258
1280
  every,
1259
1281
  period,
1260
1282
  offset,
1261
- truncate,
1283
+ label,
1262
1284
  include_boundaries,
1263
1285
  closed,
1264
1286
  rbexprs_by,
1265
- start_by
1287
+ start_by,
1288
+ check_sorted
1266
1289
  )
1267
- LazyGroupBy.new(lgb, self.class)
1290
+ LazyGroupBy.new(lgb)
1268
1291
  end
1292
+ alias_method :groupby_dynamic, :group_by_dynamic
1269
1293
 
1270
1294
  # Perform an asof join.
1271
1295
  #
@@ -1725,8 +1749,10 @@ module Polars
1725
1749
 
1726
1750
  # Shift the values by a given period.
1727
1751
  #
1728
- # @param periods [Integer]
1752
+ # @param n [Integer]
1729
1753
  # Number of places to shift (may be negative).
1754
+ # @param fill_value [Object]
1755
+ # Fill the resulting null values with this value.
1730
1756
  #
1731
1757
  # @return [LazyFrame]
1732
1758
  #
@@ -1763,8 +1789,12 @@ module Polars
1763
1789
  # # │ 5 ┆ 6 │
1764
1790
  # # │ null ┆ null │
1765
1791
  # # └──────┴──────┘
1766
- def shift(periods)
1767
- _from_rbldf(_ldf.shift(periods))
1792
+ def shift(n, fill_value: nil)
1793
+ if !fill_value.nil?
1794
+ fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
1795
+ end
1796
+ n = Utils.parse_as_expression(n)
1797
+ _from_rbldf(_ldf.shift(n, fill_value))
1768
1798
  end
1769
1799
 
1770
1800
  # Shift the values by a given period and fill the resulting null values.
@@ -1810,10 +1840,7 @@ module Polars
1810
1840
  # # │ 0 ┆ 0 │
1811
1841
  # # └─────┴─────┘
1812
1842
  def shift_and_fill(periods, fill_value)
1813
- if !fill_value.is_a?(Expr)
1814
- fill_value = Polars.lit(fill_value)
1815
- end
1816
- _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
1843
+ shift(periods, fill_value: fill_value)
1817
1844
  end
1818
1845
 
1819
1846
  # Get a slice of this DataFrame.
@@ -2371,16 +2398,16 @@ module Polars
2371
2398
  # df.interpolate.collect
2372
2399
  # # =>
2373
2400
  # # shape: (4, 3)
2374
- # # ┌─────┬──────┬─────┐
2375
- # # │ foo ┆ bar ┆ baz │
2376
- # # │ --- ┆ --- ┆ --- │
2377
- # # │ i64 ┆ i64 ┆ i64 │
2378
- # # ╞═════╪══════╪═════╡
2379
- # # │ 1 ┆ 6 ┆ 1 │
2380
- # # │ 5 ┆ 7 ┆ 3 │
2381
- # # │ 9 ┆ 9 ┆ 6 │
2382
- # # │ 10 ┆ null ┆ 9 │
2383
- # # └─────┴──────┴─────┘
2401
+ # # ┌──────┬──────┬──────────┐
2402
+ # # │ foo ┆ bar ┆ baz │
2403
+ # # │ --- ┆ --- ┆ --- │
2404
+ # # │ f64 ┆ f64 ┆ f64 │
2405
+ # # ╞══════╪══════╪══════════╡
2406
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
2407
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
2408
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
2409
+ # # │ 10.0 ┆ null ┆ 9.0 │
2410
+ # # └──────┴──────┴──────────┘
2384
2411
  def interpolate
2385
2412
  select(Utils.col("*").interpolate)
2386
2413
  end
@@ -43,7 +43,7 @@ module Polars
43
43
  # # ┌─────┬─────┬────────────┐
44
44
  # # │ a ┆ b ┆ rank │
45
45
  # # │ --- ┆ --- ┆ --- │
46
- # # │ i64 ┆ i64 ┆ list[f32] │
46
+ # # │ i64 ┆ i64 ┆ list[f64] │
47
47
  # # ╞═════╪═════╪════════════╡
48
48
  # # │ 1 ┆ 4 ┆ [1.0, 2.0] │
49
49
  # # │ 8 ┆ 5 ┆ [2.0, 1.0] │
@@ -158,7 +158,7 @@ module Polars
158
158
  col(column.to_s).sum
159
159
  elsif column.is_a?(::Array)
160
160
  exprs = Utils.selection_to_rbexpr_list(column)
161
- Utils.wrap_expr(_sum_exprs(exprs))
161
+ Utils.wrap_expr(_sum_horizontal(exprs))
162
162
  else
163
163
  fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
164
164
  end
@@ -625,16 +625,16 @@ module Polars
625
625
  # This can be used in a `select`, `with_column`, etc. Be sure that the resulting
626
626
  # range size is equal to the length of the DataFrame you are collecting.
627
627
  #
628
- # @param low [Integer, Expr, Series]
628
+ # @param start [Integer, Expr, Series]
629
629
  # Lower bound of range.
630
- # @param high [Integer, Expr, Series]
630
+ # @param stop [Integer, Expr, Series]
631
631
  # Upper bound of range.
632
632
  # @param step [Integer]
633
633
  # Step size of the range.
634
634
  # @param eager [Boolean]
635
635
  # If eager evaluation is `true`, a Series is returned instead of an Expr.
636
636
  # @param dtype [Symbol]
637
- # Apply an explicit integer dtype to the resulting expression (default is `:i64`).
637
+ # Apply an explicit integer dtype to the resulting expression (default is `Int64`).
638
638
  #
639
639
  # @return [Expr, Series]
640
640
  #
@@ -648,35 +648,20 @@ module Polars
648
648
  # # 1
649
649
  # # 2
650
650
  # # ]
651
- #
652
- # @example
653
- # df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
654
- # df.select(Polars.arange(Polars.col("a"), Polars.col("b")))
655
- # # =>
656
- # # shape: (2, 1)
657
- # # ┌───────────┐
658
- # # │ arange │
659
- # # │ --- │
660
- # # │ list[i64] │
661
- # # ╞═══════════╡
662
- # # │ [1, 2] │
663
- # # │ [2, 3] │
664
- # # └───────────┘
665
- def arange(low, high, step: 1, eager: false, dtype: nil)
666
- low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
667
- high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
668
- range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))
669
-
670
- if !dtype.nil? && !["i64", Int64].include?(dtype)
671
- range_expr = range_expr.cast(dtype)
672
- end
651
+ def int_range(start, stop, step: 1, eager: false, dtype: nil)
652
+ start = Utils.parse_as_expression(start)
653
+ stop = Utils.parse_as_expression(stop)
654
+ dtype ||= Int64
655
+ dtype = dtype.to_s if dtype.is_a?(Symbol)
656
+ result = Utils.wrap_expr(RbExpr.int_range(start, stop, step, dtype)).alias("arange")
673
657
 
674
- if !eager
675
- range_expr
676
- else
677
- DataFrame.new.select(range_expr.alias("arange")).to_series
658
+ if eager
659
+ return select(result).to_series
678
660
  end
661
+
662
+ result
679
663
  end
664
+ alias_method :arange, :int_range
680
665
 
681
666
  # Find the indexes that would sort the columns.
682
667
  #
@@ -735,15 +720,22 @@ module Polars
735
720
  # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
736
721
  # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
737
722
  def duration(
723
+ weeks: nil,
738
724
  days: nil,
725
+ hours: nil,
726
+ minutes: nil,
739
727
  seconds: nil,
740
- nanoseconds: nil,
741
- microseconds: nil,
742
728
  milliseconds: nil,
743
- minutes: nil,
744
- hours: nil,
745
- weeks: nil
729
+ microseconds: nil,
730
+ nanoseconds: nil,
731
+ time_unit: "us"
746
732
  )
733
+ if !weeks.nil?
734
+ weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
735
+ end
736
+ if !days.nil?
737
+ days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
738
+ end
747
739
  if !hours.nil?
748
740
  hours = Utils.expr_to_lit_or_expr(hours, str_to_lit: false)._rbexpr
749
741
  end
@@ -762,23 +754,18 @@ module Polars
762
754
  if !nanoseconds.nil?
763
755
  nanoseconds = Utils.expr_to_lit_or_expr(nanoseconds, str_to_lit: false)._rbexpr
764
756
  end
765
- if !days.nil?
766
- days = Utils.expr_to_lit_or_expr(days, str_to_lit: false)._rbexpr
767
- end
768
- if !weeks.nil?
769
- weeks = Utils.expr_to_lit_or_expr(weeks, str_to_lit: false)._rbexpr
770
- end
771
757
 
772
758
  Utils.wrap_expr(
773
759
  _rb_duration(
760
+ weeks,
774
761
  days,
762
+ hours,
763
+ minutes,
775
764
  seconds,
776
- nanoseconds,
777
- microseconds,
778
765
  milliseconds,
779
- minutes,
780
- hours,
781
- weeks
766
+ microseconds,
767
+ nanoseconds,
768
+ time_unit
782
769
  )
783
770
  )
784
771
  end
@@ -944,7 +931,8 @@ module Polars
944
931
  simplify_expression,
945
932
  slice_pushdown,
946
933
  common_subplan_elimination,
947
- allow_streaming
934
+ allow_streaming,
935
+ false
948
936
  )
949
937
  prepared << ldf
950
938
  end
@@ -1,10 +1,9 @@
1
1
  module Polars
2
- # Created by `df.lazy.groupby("foo")`.
2
+ # Created by `df.lazy.group_by("foo")`.
3
3
  class LazyGroupBy
4
4
  # @private
5
- def initialize(lgb, lazyframe_class)
5
+ def initialize(lgb)
6
6
  @lgb = lgb
7
- @lazyframe_class = lazyframe_class
8
7
  end
9
8
 
10
9
  # Describe the aggregations that need to be done on a group.
@@ -12,7 +11,7 @@ module Polars
12
11
  # @return [LazyFrame]
13
12
  def agg(aggs)
14
13
  rbexprs = Utils.selection_to_rbexpr_list(aggs)
15
- @lazyframe_class._from_rbldf(@lgb.agg(rbexprs))
14
+ Utils.wrap_ldf(@lgb.agg(rbexprs))
16
15
  end
17
16
 
18
17
  # Get the first `n` rows of each group.
@@ -29,7 +28,7 @@ module Polars
29
28
  # "nrs" => [1, 2, 3, 4, 5, 6]
30
29
  # }
31
30
  # )
32
- # df.groupby("letters").head(2).sort("letters")
31
+ # df.group_by("letters").head(2).sort("letters")
33
32
  # # =>
34
33
  # # shape: (5, 2)
35
34
  # # ┌─────────┬─────┐
@@ -44,7 +43,7 @@ module Polars
44
43
  # # │ c ┆ 2 │
45
44
  # # └─────────┴─────┘
46
45
  def head(n = 5)
47
- @lazyframe_class._from_rbldf(@lgb.head(n))
46
+ Utils.wrap_ldf(@lgb.head(n))
48
47
  end
49
48
 
50
49
  # Get the last `n` rows of each group.
@@ -61,7 +60,7 @@ module Polars
61
60
  # "nrs" => [1, 2, 3, 4, 5, 6]
62
61
  # }
63
62
  # )
64
- # df.groupby("letters").tail(2).sort("letters")
63
+ # df.group_by("letters").tail(2).sort("letters")
65
64
  # # =>
66
65
  # # shape: (5, 2)
67
66
  # # ┌─────────┬─────┐
@@ -76,7 +75,7 @@ module Polars
76
75
  # # │ c ┆ 4 │
77
76
  # # └─────────┴─────┘
78
77
  def tail(n = 5)
79
- @lazyframe_class._from_rbldf(@lgb.tail(n))
78
+ Utils.wrap_ldf(@lgb.tail(n))
80
79
  end
81
80
 
82
81
  # def apply