polars-df 0.4.0-arm64-darwin → 0.6.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +1,25 @@
1
1
  module Polars
2
- # Representation of a Lazy computation graph/query againat a DataFrame.
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
3
3
  class LazyFrame
4
4
  # @private
5
5
  attr_accessor :_ldf
6
6
 
7
# Create a new LazyFrame.
#
# Every argument is handed unchanged to `DataFrame.new`; the resulting
# frame is converted with `lazy` and its `_ldf` handle is stored on this
# instance.
#
# @param data [Object]
#   Forwarded to `DataFrame.new` as the first positional argument.
# @param schema [Object]
#   Forwarded to `DataFrame.new`.
# @param schema_overrides [Object]
#   Forwarded to `DataFrame.new`.
# @param orient [Object]
#   Forwarded to `DataFrame.new`.
# @param infer_schema_length [Integer]
#   Forwarded to `DataFrame.new`.
# @param nan_to_null [Boolean]
#   Forwarded to `DataFrame.new`.
def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
  eager_frame = DataFrame.new(
    data,
    schema: schema,
    schema_overrides: schema_overrides,
    orient: orient,
    infer_schema_length: infer_schema_length,
    nan_to_null: nan_to_null
  )

  # Only the underlying lazy handle is kept, not the eager frame itself.
  self._ldf = eager_frame.lazy._ldf
end
22
+
7
23
  # @private
8
24
  def self._from_rbldf(rb_ldf)
9
25
  ldf = LazyFrame.allocate
@@ -379,16 +395,16 @@ module Polars
379
395
  # # │ 2 ┆ 7.0 ┆ b │
380
396
  # # │ 1 ┆ 6.0 ┆ a │
381
397
  # # └─────┴─────┴─────┘
382
def sort(by, reverse: false, nulls_last: false, maintain_order: false)
  # A single column name is passed straight to the underlying `sort` call.
  return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order)) if by.is_a?(String)

  # A lone boolean is wrapped in an array before the expression-based sort.
  reverse = [reverse] if Utils.bool?(reverse)

  sort_exprs = Utils.selection_to_rbexpr_list(by)
  _from_rbldf(_ldf.sort_by_exprs(sort_exprs, reverse, nulls_last, maintain_order))
end
393
409
 
394
410
  # def profile
@@ -921,6 +937,12 @@ module Polars
921
937
  # Define whether the temporal window interval is closed or not.
922
938
  # @param by [Object]
923
939
  # Also group by this column/these columns.
940
+ # @param check_sorted [Boolean]
941
+ # When the `by` argument is given, polars can not check sortedness
942
+ # by the metadata and has to do a full scan on the index column to
943
+ # verify data is sorted. This is expensive. If you are sure the
944
+ # data within the by groups is sorted, you can set this to `false`.
945
+ # Doing so incorrectly will lead to incorrect output.
924
946
  #
925
947
  # @return [LazyFrame]
926
948
  #
@@ -933,8 +955,8 @@ module Polars
933
955
  # "2020-01-03 19:45:32",
934
956
  # "2020-01-08 23:16:43"
935
957
  # ]
936
- # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
937
- # Polars.col("dt").str.strptime(:datetime)
958
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
959
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
938
960
  # )
939
961
  # df.groupby_rolling(index_column: "dt", period: "2d").agg(
940
962
  # [
@@ -942,7 +964,7 @@ module Polars
942
964
  # Polars.min("a").alias("min_a"),
943
965
  # Polars.max("a").alias("max_a")
944
966
  # ]
945
- # )
967
+ # ).collect
946
968
  # # =>
947
969
  # # shape: (6, 4)
948
970
  # # ┌─────────────────────┬───────┬───────┬───────┐
@@ -962,8 +984,10 @@ module Polars
962
984
  period:,
963
985
  offset: nil,
964
986
  closed: "right",
965
- by: nil
987
+ by: nil,
988
+ check_sorted: true
966
989
  )
990
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
967
991
  if offset.nil?
968
992
  offset = "-#{period}"
969
993
  end
@@ -973,7 +997,7 @@ module Polars
973
997
  offset = Utils._timedelta_to_pl_duration(offset)
974
998
 
975
999
  lgb = _ldf.groupby_rolling(
976
- index_column, period, offset, closed, rbexprs_by
1000
+ index_column._rbexpr, period, offset, closed, rbexprs_by, check_sorted
977
1001
  )
978
1002
  LazyGroupBy.new(lgb, self.class)
979
1003
  end
@@ -1111,21 +1135,21 @@ module Polars
1111
1135
  # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1112
1136
  # [
1113
1137
  # Polars.col("time").count.alias("time_count"),
1114
- # Polars.col("time").list.alias("time_agg_list")
1138
+ # Polars.col("time").alias("time_agg_list")
1115
1139
  # ]
1116
1140
  # )
1117
1141
  # # =>
1118
1142
  # # shape: (4, 3)
1119
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1120
- # # │ time ┆ time_count ┆ time_agg_list
1121
- # # │ --- ┆ --- ┆ ---
1122
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1123
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1124
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16...
1125
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16...
1126
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16...
1127
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1128
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1143
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
1144
+ # # │ time ┆ time_count ┆ time_agg_list
1145
+ # # │ --- ┆ --- ┆ ---
1146
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1147
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
1148
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
1149
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
1150
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
1151
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1152
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
1129
1153
  #
1130
1154
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1131
1155
  # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -1192,7 +1216,7 @@ module Polars
1192
1216
  # period: "3i",
1193
1217
  # include_boundaries: true,
1194
1218
  # closed: "right"
1195
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
1219
+ # ).agg(Polars.col("A").alias("A_agg_list"))
1196
1220
  # # =>
1197
1221
  # # shape: (3, 4)
1198
1222
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -1215,12 +1239,9 @@ module Polars
1215
1239
  by: nil,
1216
1240
  start_by: "window"
1217
1241
  )
1242
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1218
1243
  if offset.nil?
1219
- if period.nil?
1220
- offset = "-#{every}"
1221
- else
1222
- offset = "0ns"
1223
- end
1244
+ offset = period.nil? ? "-#{every}" : "0ns"
1224
1245
  end
1225
1246
 
1226
1247
  if period.nil?
@@ -1233,7 +1254,7 @@ module Polars
1233
1254
 
1234
1255
  rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1235
1256
  lgb = _ldf.groupby_dynamic(
1236
- index_column,
1257
+ index_column._rbexpr,
1237
1258
  every,
1238
1259
  period,
1239
1260
  offset,
@@ -1350,7 +1371,7 @@ module Polars
1350
1371
  if by.is_a?(String)
1351
1372
  by_left_ = [by]
1352
1373
  by_right_ = [by]
1353
- elsif by.is_a?(Array)
1374
+ elsif by.is_a?(::Array)
1354
1375
  by_left_ = by
1355
1376
  by_right_ = by
1356
1377
  end
@@ -1618,7 +1639,7 @@ module Polars
1618
1639
  # # │ null │
1619
1640
  # # └──────┘
1620
1641
  def with_context(other)
1621
- if !other.is_a?(Array)
1642
+ if !other.is_a?(::Array)
1622
1643
  other = [other]
1623
1644
  end
1624
1645
 
@@ -2227,7 +2248,7 @@ module Polars
2227
2248
  #
2228
2249
  # @return [LazyFrame]
2229
2250
  def unique(maintain_order: true, subset: nil, keep: "first")
2230
- if !subset.nil? && !subset.is_a?(Array)
2251
+ if !subset.nil? && !subset.is_a?(::Array)
2231
2252
  subset = [subset]
2232
2253
  end
2233
2254
  _from_rbldf(_ldf.unique(maintain_order, subset, keep))
@@ -2260,7 +2281,7 @@ module Polars
2260
2281
  # # │ 3 ┆ 8 ┆ c │
2261
2282
  # # └─────┴─────┴─────┘
2262
2283
  def drop_nulls(subset: nil)
2263
- if !subset.nil? && !subset.is_a?(Array)
2284
+ if !subset.nil? && !subset.is_a?(::Array)
2264
2285
  subset = [subset]
2265
2286
  end
2266
2287
  _from_rbldf(_ldf.drop_nulls(subset))
@@ -2422,6 +2443,38 @@ module Polars
2422
2443
  _from_rbldf(_ldf.unnest(names))
2423
2444
  end
2424
2445
 
2446
+ # TODO
2447
+ # def merge_sorted
2448
+ # end
2449
+
2450
# Flag one or more columns as sorted.
#
# Each selected column is rewritten through `with_columns`, calling
# `set_sorted` on its expression.
#
# @param column [Object]
#   Column(s) that are sorted.
# @param more_columns [Object]
#   Further sorted columns, given as extra positional arguments.
# @param descending [Boolean]
#   Whether the columns are sorted in descending order.
#
# @return [LazyFrame]
def set_sorted(column, *more_columns, descending: false)
  rbexprs = Utils.selection_to_rbexpr_list(column)
  rbexprs.concat(Utils.selection_to_rbexpr_list(more_columns)) if more_columns.any?

  sorted_exprs = rbexprs.map do |rbexpr|
    Utils.wrap_expr(rbexpr).set_sorted(descending: descending)
  end
  with_columns(sorted_exprs)
end
2473
+
2474
+ # TODO
2475
+ # def update
2476
+ # end
2477
+
2425
2478
  private
2426
2479
 
2427
2480
  def initialize_copy(other)
@@ -14,7 +14,7 @@ module Polars
14
14
 
15
15
  if name.is_a?(DataType)
16
16
  Utils.wrap_expr(_dtype_cols([name]))
17
- elsif name.is_a?(Array)
17
+ elsif name.is_a?(::Array)
18
18
  if name.length == 0 || Utils.strlike?(name[0])
19
19
  name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
20
20
  Utils.wrap_expr(RbExpr.cols(name))
@@ -36,7 +36,7 @@ module Polars
36
36
  # @example A horizontal rank computation by taking the elements of a list
37
37
  # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
38
  # df.with_column(
39
- # Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
39
+ # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
40
  # )
41
41
  # # =>
42
42
  # # shape: (3, 3)
@@ -156,9 +156,8 @@ module Polars
156
156
  column.sum
157
157
  elsif Utils.strlike?(column)
158
158
  col(column.to_s).sum
159
- elsif column.is_a?(Array)
159
+ elsif column.is_a?(::Array)
160
160
  exprs = Utils.selection_to_rbexpr_list(column)
161
- # TODO
162
161
  Utils.wrap_expr(_sum_exprs(exprs))
163
162
  else
164
163
  fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
@@ -283,18 +282,33 @@ module Polars
283
282
  # Return an expression representing a literal value.
284
283
  #
285
284
  # @return [Expr]
286
- def lit(value)
287
- if value.is_a?(Polars::Series)
285
+ def lit(value, dtype: nil, allow_object: nil)
286
+ if value.is_a?(::Time) || value.is_a?(::DateTime)
287
+ time_unit = dtype&.time_unit || "ns"
288
+ time_zone = dtype.&time_zone
289
+ e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
290
+ if time_zone
291
+ return e.dt.replace_time_zone(time_zone.to_s)
292
+ else
293
+ return e
294
+ end
295
+ elsif value.is_a?(::Date)
296
+ return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
297
+ elsif value.is_a?(Polars::Series)
288
298
  name = value.name
289
299
  value = value._s
290
- e = Utils.wrap_expr(RbExpr.lit(value))
300
+ e = Utils.wrap_expr(RbExpr.lit(value, allow_object))
291
301
  if name == ""
292
302
  return e
293
303
  end
294
304
  return e.alias(name)
305
+ elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
306
+ return lit(Series.new("", value))
307
+ elsif dtype
308
+ return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
295
309
  end
296
310
 
297
- Utils.wrap_expr(RbExpr.lit(value))
311
+ Utils.wrap_expr(RbExpr.lit(value, allow_object))
298
312
  end
299
313
 
300
314
  # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
@@ -625,23 +639,42 @@ module Polars
625
639
  # @return [Expr, Series]
626
640
  #
627
641
  # @example
628
- # df.lazy.filter(Polars.col("foo") < Polars.arange(0, 100)).collect
642
+ # Polars.arange(0, 3, eager: true)
643
+ # # =>
644
+ # # shape: (3,)
645
+ # # Series: 'arange' [i64]
646
+ # # [
647
+ # # 0
648
+ # # 1
649
+ # # 2
650
+ # # ]
651
+ #
652
+ # @example
653
+ # df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
654
+ # df.select(Polars.arange(Polars.col("a"), Polars.col("b")))
655
+ # # =>
656
+ # # shape: (2, 1)
657
+ # # ┌───────────┐
658
+ # # │ arange │
659
+ # # │ --- │
660
+ # # │ list[i64] │
661
+ # # ╞═══════════╡
662
+ # # │ [1, 2] │
663
+ # # │ [2, 3] │
664
+ # # └───────────┘
629
665
  def arange(low, high, step: 1, eager: false, dtype: nil)
630
666
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
631
667
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
632
668
  range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))
633
669
 
634
- if !dtype.nil? && dtype != "i64"
670
+ if !dtype.nil? && !["i64", Int64].include?(dtype)
635
671
  range_expr = range_expr.cast(dtype)
636
672
  end
637
673
 
638
674
  if !eager
639
675
  range_expr
640
676
  else
641
- DataFrame.new
642
- .select(range_expr)
643
- .to_series
644
- .rename("arange", in_place: true)
677
+ DataFrame.new.select(range_expr.alias("arange")).to_series
645
678
  end
646
679
  end
647
680
 
@@ -658,7 +691,7 @@ module Polars
658
691
  #
659
692
  # @return [Expr]
660
693
  def arg_sort_by(exprs, reverse: false)
661
- if !exprs.is_a?(Array)
694
+ if !exprs.is_a?(::Array)
662
695
  exprs = [exprs]
663
696
  end
664
697
  if reverse == true || reverse == false
@@ -997,19 +1030,24 @@ module Polars
997
1030
  # Only used in `eager` mode. As expression, use `alias`.
998
1031
  #
999
1032
  # @return [Expr]
1000
def repeat(value, n, dtype: nil, eager: false, name: nil)
  named = !name.nil?
  warn "the `name` argument is deprecated. Use the `alias` method instead." if named

  # Plain integers are turned into literal expressions so `_rbexpr` is available below.
  n = lit(n) if n.is_a?(Integer)

  value_expr = Utils.parse_as_expression(value, str_as_lit: true)
  expr = Utils.wrap_expr(RbExpr.repeat(value_expr, n._rbexpr, dtype))
  expr = expr.alias(name) if named

  # Eager mode evaluates the expression via `select` and returns the resulting Series.
  eager ? select(expr).to_series : expr
end
1014
1052
 
1015
1053
  # Return indices where `condition` evaluates `true`.
@@ -1077,8 +1115,11 @@ module Polars
1077
1115
  # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1078
1116
  # # │ null ┆ null ┆ null ┆ 99.9 │
1079
1117
  # # └──────┴──────┴──────┴──────┘
1080
def coalesce(exprs, *more_exprs)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  # Extra positional arguments are appended after the first selection.
  rbexprs.concat(Utils.selection_to_rbexpr_list(more_exprs)) if more_exprs.any?

  Utils.wrap_expr(_coalesce_exprs(rbexprs))
end
1084
1125
 
@@ -1121,13 +1162,11 @@ module Polars
1121
1162
  end
1122
1163
 
1123
1164
  if unit == "d"
1124
- expr = column.cast(:date)
1165
+ expr = column.cast(Date)
1125
1166
  elsif unit == "s"
1126
- raise Todo
1127
- # expr = (column.cast(:i64) * 1_000_000).cast(Datetime("us"))
1167
+ expr = (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
1128
1168
  elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1129
- raise Todo
1130
- # expr = column.cast(Datetime(unit))
1169
+ expr = column.cast(Datetime.new(unit))
1131
1170
  else
1132
1171
  raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1133
1172
  end