polars-df 0.4.0-x86_64-darwin → 0.6.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,25 @@
1
1
  module Polars
2
- # Representation of a Lazy computation graph/query againat a DataFrame.
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
3
3
  class LazyFrame
4
4
  # @private
5
5
  attr_accessor :_ldf
6
6
 
7
+ # Create a new LazyFrame.
8
+ def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
9
+ self._ldf = (
10
+ DataFrame.new(
11
+ data,
12
+ schema: schema,
13
+ schema_overrides: schema_overrides,
14
+ orient: orient,
15
+ infer_schema_length: infer_schema_length,
16
+ nan_to_null: nan_to_null
17
+ )
18
+ .lazy
19
+ ._ldf
20
+ )
21
+ end
22
+
7
23
  # @private
8
24
  def self._from_rbldf(rb_ldf)
9
25
  ldf = LazyFrame.allocate
@@ -379,16 +395,16 @@ module Polars
379
395
  # # │ 2 ┆ 7.0 ┆ b │
380
396
  # # │ 1 ┆ 6.0 ┆ a │
381
397
  # # └─────┴─────┴─────┘
382
- def sort(by, reverse: false, nulls_last: false)
398
+ def sort(by, reverse: false, nulls_last: false, maintain_order: false)
383
399
  if by.is_a?(String)
384
- _from_rbldf(_ldf.sort(by, reverse, nulls_last))
400
+ return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order))
385
401
  end
386
402
  if Utils.bool?(reverse)
387
403
  reverse = [reverse]
388
404
  end
389
405
 
390
406
  by = Utils.selection_to_rbexpr_list(by)
391
- _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
407
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order))
392
408
  end
393
409
 
394
410
  # def profile
@@ -921,6 +937,12 @@ module Polars
921
937
  # Define whether the temporal window interval is closed or not.
922
938
  # @param by [Object]
923
939
  # Also group by this column/these columns.
940
+ # @param check_sorted [Boolean]
941
+ # When the `by` argument is given, polars cannot check sortedness
942
+ # by the metadata and has to do a full scan on the index column to
943
+ # verify data is sorted. This is expensive. If you are sure the
944
+ # data within the by groups is sorted, you can set this to `false`.
945
+ # Doing so incorrectly will lead to incorrect output.
924
946
  #
925
947
  # @return [LazyFrame]
926
948
  #
@@ -933,8 +955,8 @@ module Polars
933
955
  # "2020-01-03 19:45:32",
934
956
  # "2020-01-08 23:16:43"
935
957
  # ]
936
- # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
937
- # Polars.col("dt").str.strptime(:datetime)
958
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
959
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
938
960
  # )
939
961
  # df.groupby_rolling(index_column: "dt", period: "2d").agg(
940
962
  # [
@@ -942,7 +964,7 @@ module Polars
942
964
  # Polars.min("a").alias("min_a"),
943
965
  # Polars.max("a").alias("max_a")
944
966
  # ]
945
- # )
967
+ # ).collect
946
968
  # # =>
947
969
  # # shape: (6, 4)
948
970
  # # ┌─────────────────────┬───────┬───────┬───────┐
@@ -962,8 +984,10 @@ module Polars
962
984
  period:,
963
985
  offset: nil,
964
986
  closed: "right",
965
- by: nil
987
+ by: nil,
988
+ check_sorted: true
966
989
  )
990
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
967
991
  if offset.nil?
968
992
  offset = "-#{period}"
969
993
  end
@@ -973,7 +997,7 @@ module Polars
973
997
  offset = Utils._timedelta_to_pl_duration(offset)
974
998
 
975
999
  lgb = _ldf.groupby_rolling(
976
- index_column, period, offset, closed, rbexprs_by
1000
+ index_column._rbexpr, period, offset, closed, rbexprs_by, check_sorted
977
1001
  )
978
1002
  LazyGroupBy.new(lgb, self.class)
979
1003
  end
@@ -1111,21 +1135,21 @@ module Polars
1111
1135
  # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
1112
1136
  # [
1113
1137
  # Polars.col("time").count.alias("time_count"),
1114
- # Polars.col("time").list.alias("time_agg_list")
1138
+ # Polars.col("time").alias("time_agg_list")
1115
1139
  # ]
1116
1140
  # )
1117
1141
  # # =>
1118
1142
  # # shape: (4, 3)
1119
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
1120
- # # │ time ┆ time_count ┆ time_agg_list
1121
- # # │ --- ┆ --- ┆ ---
1122
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1123
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
1124
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16...
1125
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16...
1126
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16...
1127
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1128
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
1143
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
1144
+ # # │ time ┆ time_count ┆ time_agg_list
1145
+ # # │ --- ┆ --- ┆ ---
1146
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1147
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
1148
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16
1149
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16
1150
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16
1151
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1152
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
1129
1153
  #
1130
1154
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1131
1155
  # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -1192,7 +1216,7 @@ module Polars
1192
1216
  # period: "3i",
1193
1217
  # include_boundaries: true,
1194
1218
  # closed: "right"
1195
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
1219
+ # ).agg(Polars.col("A").alias("A_agg_list"))
1196
1220
  # # =>
1197
1221
  # # shape: (3, 4)
1198
1222
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -1215,12 +1239,9 @@ module Polars
1215
1239
  by: nil,
1216
1240
  start_by: "window"
1217
1241
  )
1242
+ index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1218
1243
  if offset.nil?
1219
- if period.nil?
1220
- offset = "-#{every}"
1221
- else
1222
- offset = "0ns"
1223
- end
1244
+ offset = period.nil? ? "-#{every}" : "0ns"
1224
1245
  end
1225
1246
 
1226
1247
  if period.nil?
@@ -1233,7 +1254,7 @@ module Polars
1233
1254
 
1234
1255
  rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1235
1256
  lgb = _ldf.groupby_dynamic(
1236
- index_column,
1257
+ index_column._rbexpr,
1237
1258
  every,
1238
1259
  period,
1239
1260
  offset,
@@ -1350,7 +1371,7 @@ module Polars
1350
1371
  if by.is_a?(String)
1351
1372
  by_left_ = [by]
1352
1373
  by_right_ = [by]
1353
- elsif by.is_a?(Array)
1374
+ elsif by.is_a?(::Array)
1354
1375
  by_left_ = by
1355
1376
  by_right_ = by
1356
1377
  end
@@ -1618,7 +1639,7 @@ module Polars
1618
1639
  # # │ null │
1619
1640
  # # └──────┘
1620
1641
  def with_context(other)
1621
- if !other.is_a?(Array)
1642
+ if !other.is_a?(::Array)
1622
1643
  other = [other]
1623
1644
  end
1624
1645
 
@@ -2227,7 +2248,7 @@ module Polars
2227
2248
  #
2228
2249
  # @return [LazyFrame]
2229
2250
  def unique(maintain_order: true, subset: nil, keep: "first")
2230
- if !subset.nil? && !subset.is_a?(Array)
2251
+ if !subset.nil? && !subset.is_a?(::Array)
2231
2252
  subset = [subset]
2232
2253
  end
2233
2254
  _from_rbldf(_ldf.unique(maintain_order, subset, keep))
@@ -2260,7 +2281,7 @@ module Polars
2260
2281
  # # │ 3 ┆ 8 ┆ c │
2261
2282
  # # └─────┴─────┴─────┘
2262
2283
  def drop_nulls(subset: nil)
2263
- if !subset.nil? && !subset.is_a?(Array)
2284
+ if !subset.nil? && !subset.is_a?(::Array)
2264
2285
  subset = [subset]
2265
2286
  end
2266
2287
  _from_rbldf(_ldf.drop_nulls(subset))
@@ -2422,6 +2443,38 @@ module Polars
2422
2443
  _from_rbldf(_ldf.unnest(names))
2423
2444
  end
2424
2445
 
2446
+ # TODO
2447
+ # def merge_sorted
2448
+ # end
2449
+
2450
+ # Indicate that one or multiple columns are sorted.
2451
+ #
2452
+ # @param column [Object]
2453
+ # Columns that are sorted
2454
+ # @param more_columns [Object]
2455
+ # Additional columns that are sorted, specified as positional arguments.
2456
+ # @param descending [Boolean]
2457
+ # Whether the columns are sorted in descending order.
2458
+ #
2459
+ # @return [LazyFrame]
2460
+ def set_sorted(
2461
+ column,
2462
+ *more_columns,
2463
+ descending: false
2464
+ )
2465
+ columns = Utils.selection_to_rbexpr_list(column)
2466
+ if more_columns.any?
2467
+ columns.concat(Utils.selection_to_rbexpr_list(more_columns))
2468
+ end
2469
+ with_columns(
2470
+ columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
2471
+ )
2472
+ end
2473
+
2474
+ # TODO
2475
+ # def update
2476
+ # end
2477
+
2425
2478
  private
2426
2479
 
2427
2480
  def initialize_copy(other)
@@ -14,7 +14,7 @@ module Polars
14
14
 
15
15
  if name.is_a?(DataType)
16
16
  Utils.wrap_expr(_dtype_cols([name]))
17
- elsif name.is_a?(Array)
17
+ elsif name.is_a?(::Array)
18
18
  if name.length == 0 || Utils.strlike?(name[0])
19
19
  name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
20
20
  Utils.wrap_expr(RbExpr.cols(name))
@@ -36,7 +36,7 @@ module Polars
36
36
  # @example A horizontal rank computation by taking the elements of a list
37
37
  # df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
38
38
  # df.with_column(
39
- # Polars.concat_list(["a", "b"]).arr.eval(Polars.element.rank).alias("rank")
39
+ # Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
40
40
  # )
41
41
  # # =>
42
42
  # # shape: (3, 3)
@@ -156,9 +156,8 @@ module Polars
156
156
  column.sum
157
157
  elsif Utils.strlike?(column)
158
158
  col(column.to_s).sum
159
- elsif column.is_a?(Array)
159
+ elsif column.is_a?(::Array)
160
160
  exprs = Utils.selection_to_rbexpr_list(column)
161
- # TODO
162
161
  Utils.wrap_expr(_sum_exprs(exprs))
163
162
  else
164
163
  fold(lit(0).cast(:u32), ->(a, b) { a + b }, column).alias("sum")
@@ -283,18 +282,33 @@ module Polars
283
282
  # Return an expression representing a literal value.
284
283
  #
285
284
  # @return [Expr]
286
- def lit(value)
287
- if value.is_a?(Polars::Series)
285
+ def lit(value, dtype: nil, allow_object: nil)
286
+ if value.is_a?(::Time) || value.is_a?(::DateTime)
287
+ time_unit = dtype&.time_unit || "ns"
288
+ time_zone = dtype&.time_zone
289
+ e = lit(Utils._datetime_to_pl_timestamp(value, time_unit)).cast(Datetime.new(time_unit))
290
+ if time_zone
291
+ return e.dt.replace_time_zone(time_zone.to_s)
292
+ else
293
+ return e
294
+ end
295
+ elsif value.is_a?(::Date)
296
+ return lit(::Time.utc(value.year, value.month, value.day)).cast(Date)
297
+ elsif value.is_a?(Polars::Series)
288
298
  name = value.name
289
299
  value = value._s
290
- e = Utils.wrap_expr(RbExpr.lit(value))
300
+ e = Utils.wrap_expr(RbExpr.lit(value, allow_object))
291
301
  if name == ""
292
302
  return e
293
303
  end
294
304
  return e.alias(name)
305
+ elsif (defined?(Numo::NArray) && value.is_a?(Numo::NArray)) || value.is_a?(::Array)
306
+ return lit(Series.new("", value))
307
+ elsif dtype
308
+ return Utils.wrap_expr(RbExpr.lit(value, allow_object)).cast(dtype)
295
309
  end
296
310
 
297
- Utils.wrap_expr(RbExpr.lit(value))
311
+ Utils.wrap_expr(RbExpr.lit(value, allow_object))
298
312
  end
299
313
 
300
314
  # Cumulatively sum values in a column/Series, or horizontally across list of columns/expressions.
@@ -625,23 +639,42 @@ module Polars
625
639
  # @return [Expr, Series]
626
640
  #
627
641
  # @example
628
- # df.lazy.filter(Polars.col("foo") < Polars.arange(0, 100)).collect
642
+ # Polars.arange(0, 3, eager: true)
643
+ # # =>
644
+ # # shape: (3,)
645
+ # # Series: 'arange' [i64]
646
+ # # [
647
+ # # 0
648
+ # # 1
649
+ # # 2
650
+ # # ]
651
+ #
652
+ # @example
653
+ # df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
654
+ # df.select(Polars.arange(Polars.col("a"), Polars.col("b")))
655
+ # # =>
656
+ # # shape: (2, 1)
657
+ # # ┌───────────┐
658
+ # # │ arange │
659
+ # # │ --- │
660
+ # # │ list[i64] │
661
+ # # ╞═══════════╡
662
+ # # │ [1, 2] │
663
+ # # │ [2, 3] │
664
+ # # └───────────┘
629
665
  def arange(low, high, step: 1, eager: false, dtype: nil)
630
666
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
631
667
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
632
668
  range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))
633
669
 
634
- if !dtype.nil? && dtype != "i64"
670
+ if !dtype.nil? && !["i64", Int64].include?(dtype)
635
671
  range_expr = range_expr.cast(dtype)
636
672
  end
637
673
 
638
674
  if !eager
639
675
  range_expr
640
676
  else
641
- DataFrame.new
642
- .select(range_expr)
643
- .to_series
644
- .rename("arange", in_place: true)
677
+ DataFrame.new.select(range_expr.alias("arange")).to_series
645
678
  end
646
679
  end
647
680
 
@@ -658,7 +691,7 @@ module Polars
658
691
  #
659
692
  # @return [Expr]
660
693
  def arg_sort_by(exprs, reverse: false)
661
- if !exprs.is_a?(Array)
694
+ if !exprs.is_a?(::Array)
662
695
  exprs = [exprs]
663
696
  end
664
697
  if reverse == true || reverse == false
@@ -997,19 +1030,24 @@ module Polars
997
1030
  # Only used in `eager` mode. As expression, use `alias`.
998
1031
  #
999
1032
  # @return [Expr]
1000
- def repeat(value, n, eager: false, name: nil)
1033
+ def repeat(value, n, dtype: nil, eager: false, name: nil)
1034
+ if !name.nil?
1035
+ warn "the `name` argument is deprecated. Use the `alias` method instead."
1036
+ end
1037
+
1038
+ if n.is_a?(Integer)
1039
+ n = lit(n)
1040
+ end
1041
+
1042
+ value = Utils.parse_as_expression(value, str_as_lit: true)
1043
+ expr = Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr, dtype))
1044
+ if !name.nil?
1045
+ expr = expr.alias(name)
1046
+ end
1001
1047
  if eager
1002
- if name.nil?
1003
- name = ""
1004
- end
1005
- dtype = py_type_to_dtype(type(value))
1006
- Series._repeat(name, value, n, dtype)
1007
- else
1008
- if n.is_a?(Integer)
1009
- n = lit(n)
1010
- end
1011
- Utils.wrap_expr(RbExpr.repeat(value, n._rbexpr))
1048
+ return select(expr).to_series
1012
1049
  end
1050
+ expr
1013
1051
  end
1014
1052
 
1015
1053
  # Return indices where `condition` evaluates `true`.
@@ -1077,8 +1115,11 @@ module Polars
1077
1115
  # # │ null ┆ null ┆ 3.0 ┆ 3.0 │
1078
1116
  # # │ null ┆ null ┆ null ┆ 99.9 │
1079
1117
  # # └──────┴──────┴──────┴──────┘
1080
- def coalesce(exprs)
1118
+ def coalesce(exprs, *more_exprs)
1081
1119
  exprs = Utils.selection_to_rbexpr_list(exprs)
1120
+ if more_exprs.any?
1121
+ exprs.concat(Utils.selection_to_rbexpr_list(more_exprs))
1122
+ end
1082
1123
  Utils.wrap_expr(_coalesce_exprs(exprs))
1083
1124
  end
1084
1125
 
@@ -1121,13 +1162,11 @@ module Polars
1121
1162
  end
1122
1163
 
1123
1164
  if unit == "d"
1124
- expr = column.cast(:date)
1165
+ expr = column.cast(Date)
1125
1166
  elsif unit == "s"
1126
- raise Todo
1127
- # expr = (column.cast(:i64) * 1_000_000).cast(Datetime("us"))
1167
+ expr = (column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
1128
1168
  elsif Utils::DTYPE_TEMPORAL_UNITS.include?(unit)
1129
- raise Todo
1130
- # expr = column.cast(Datetime(unit))
1169
+ expr = column.cast(Datetime.new(unit))
1131
1170
  else
1132
1171
  raise ArgumentError, "'unit' must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got '#{unit}'."
1133
1172
  end