polars-df 0.4.0-x86_64-darwin → 0.5.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +272 -191
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +2043 -1202
- data/README.md +2 -2
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +201 -50
- data/lib/polars/data_types.rb +6 -4
- data/lib/polars/date_time_expr.rb +142 -2
- data/lib/polars/expr.rb +70 -10
- data/lib/polars/lazy_frame.rb +4 -3
- data/lib/polars/lazy_functions.rb +4 -1
- data/lib/polars/list_expr.rb +68 -19
- data/lib/polars/series.rb +181 -73
- data/lib/polars/string_expr.rb +149 -43
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +41 -7
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -2
- metadata +2 -2
data/lib/polars/expr.rb
CHANGED
@@ -1308,8 +1308,6 @@ module Polars
|
|
1308
1308
|
#
|
1309
1309
|
# @param k [Integer]
|
1310
1310
|
# Number of elements to return.
|
1311
|
-
# @param reverse [Boolean]
|
1312
|
-
# Return the smallest elements.
|
1313
1311
|
#
|
1314
1312
|
# @return [Expr]
|
1315
1313
|
#
|
@@ -1322,7 +1320,7 @@ module Polars
|
|
1322
1320
|
# df.select(
|
1323
1321
|
# [
|
1324
1322
|
# Polars.col("value").top_k.alias("top_k"),
|
1325
|
-
# Polars.col("value").top_k(reverse: true).alias("bottom_k")
|
1323
|
+
# Polars.col("value").bottom_k.alias("bottom_k")
|
1326
1324
|
# ]
|
1327
1325
|
# )
|
1328
1326
|
# # =>
|
@@ -1338,8 +1336,46 @@ module Polars
|
|
1338
1336
|
# # │ 3 ┆ 4 │
|
1339
1337
|
# # │ 2 ┆ 98 │
|
1340
1338
|
# # └───────┴──────────┘
|
1341
|
-
def top_k(k: 5, reverse: false)
|
1342
|
-
wrap_expr(_rbexpr.top_k(k, reverse))
|
1339
|
+
def top_k(k: 5)
|
1340
|
+
wrap_expr(_rbexpr.top_k(k))
|
1341
|
+
end
|
1342
|
+
|
1343
|
+
# Return the `k` smallest elements.
|
1344
|
+
#
|
1345
|
+
# If 'reverse: true` the smallest elements will be given.
|
1346
|
+
#
|
1347
|
+
# @param k [Integer]
|
1348
|
+
# Number of elements to return.
|
1349
|
+
#
|
1350
|
+
# @return [Expr]
|
1351
|
+
#
|
1352
|
+
# @example
|
1353
|
+
# df = Polars::DataFrame.new(
|
1354
|
+
# {
|
1355
|
+
# "value" => [1, 98, 2, 3, 99, 4]
|
1356
|
+
# }
|
1357
|
+
# )
|
1358
|
+
# df.select(
|
1359
|
+
# [
|
1360
|
+
# Polars.col("value").top_k.alias("top_k"),
|
1361
|
+
# Polars.col("value").bottom_k.alias("bottom_k")
|
1362
|
+
# ]
|
1363
|
+
# )
|
1364
|
+
# # =>
|
1365
|
+
# # shape: (5, 2)
|
1366
|
+
# # ┌───────┬──────────┐
|
1367
|
+
# # │ top_k ┆ bottom_k │
|
1368
|
+
# # │ --- ┆ --- │
|
1369
|
+
# # │ i64 ┆ i64 │
|
1370
|
+
# # ╞═══════╪══════════╡
|
1371
|
+
# # │ 99 ┆ 1 │
|
1372
|
+
# # │ 98 ┆ 2 │
|
1373
|
+
# # │ 4 ┆ 3 │
|
1374
|
+
# # │ 3 ┆ 4 │
|
1375
|
+
# # │ 2 ┆ 98 │
|
1376
|
+
# # └───────┴──────────┘
|
1377
|
+
def bottom_k(k: 5)
|
1378
|
+
wrap_expr(_rbexpr.bottom_k(k))
|
1343
1379
|
end
|
1344
1380
|
|
1345
1381
|
# Get the index values that would sort this column.
|
@@ -2008,6 +2044,28 @@ module Polars
|
|
2008
2044
|
wrap_expr(_rbexpr.n_unique)
|
2009
2045
|
end
|
2010
2046
|
|
2047
|
+
# Approx count unique values.
|
2048
|
+
#
|
2049
|
+
# This is done using the HyperLogLog++ algorithm for cardinality estimation.
|
2050
|
+
#
|
2051
|
+
# @return [Expr]
|
2052
|
+
#
|
2053
|
+
# @example
|
2054
|
+
# df = Polars::DataFrame.new({"a" => [1, 1, 2]})
|
2055
|
+
# df.select(Polars.col("a").approx_unique)
|
2056
|
+
# # =>
|
2057
|
+
# # shape: (1, 1)
|
2058
|
+
# # ┌─────┐
|
2059
|
+
# # │ a │
|
2060
|
+
# # │ --- │
|
2061
|
+
# # │ u32 │
|
2062
|
+
# # ╞═════╡
|
2063
|
+
# # │ 2 │
|
2064
|
+
# # └─────┘
|
2065
|
+
def approx_unique
|
2066
|
+
wrap_expr(_rbexpr.approx_unique)
|
2067
|
+
end
|
2068
|
+
|
2011
2069
|
# Count null values.
|
2012
2070
|
#
|
2013
2071
|
# @return [Expr]
|
@@ -2194,7 +2252,7 @@ module Polars
|
|
2194
2252
|
# # │ 4 │
|
2195
2253
|
# # │ 6 │
|
2196
2254
|
# # │ 6 │
|
2197
|
-
# # │
|
2255
|
+
# # │ 4 │
|
2198
2256
|
# # │ 6 │
|
2199
2257
|
# # │ 6 │
|
2200
2258
|
# # │ 6 │
|
@@ -2751,6 +2809,7 @@ module Polars
|
|
2751
2809
|
end
|
2752
2810
|
wrap_expr(_rbexpr.is_in(other._rbexpr))
|
2753
2811
|
end
|
2812
|
+
alias_method :in?, :is_in
|
2754
2813
|
|
2755
2814
|
# Repeat the elements in this Series as specified in the given expression.
|
2756
2815
|
#
|
@@ -3914,8 +3973,8 @@ module Polars
|
|
3914
3973
|
# # │ 2 │
|
3915
3974
|
# # │ 5 │
|
3916
3975
|
# # └─────┘
|
3917
|
-
def rank(method: "average", reverse: false)
|
3918
|
-
wrap_expr(_rbexpr.rank(method, reverse))
|
3976
|
+
def rank(method: "average", reverse: false, seed: nil)
|
3977
|
+
wrap_expr(_rbexpr.rank(method, reverse, seed))
|
3919
3978
|
end
|
3920
3979
|
|
3921
3980
|
# Calculate the n-th discrete difference.
|
@@ -4916,9 +4975,10 @@ module Polars
|
|
4916
4975
|
# # ╞═══════════╪═══════════╡
|
4917
4976
|
# # │ [1, 2, 3] ┆ [4, 5, 6] │
|
4918
4977
|
# # └───────────┴───────────┘
|
4919
|
-
def list
|
4920
|
-
wrap_expr(_rbexpr.list)
|
4978
|
+
def implode
|
4979
|
+
wrap_expr(_rbexpr.implode)
|
4921
4980
|
end
|
4981
|
+
alias_method :list, :implode
|
4922
4982
|
|
4923
4983
|
# Shrink numeric columns to the minimal required datatype.
|
4924
4984
|
#
|
data/lib/polars/lazy_frame.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
module Polars
|
2
|
-
# Representation of a Lazy computation graph/query
|
2
|
+
# Representation of a Lazy computation graph/query against a DataFrame.
|
3
3
|
class LazyFrame
|
4
4
|
# @private
|
5
5
|
attr_accessor :_ldf
|
@@ -934,7 +934,7 @@ module Polars
|
|
934
934
|
# "2020-01-08 23:16:43"
|
935
935
|
# ]
|
936
936
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
937
|
-
# Polars.col("dt").str.strptime(
|
937
|
+
# Polars.col("dt").str.strptime(Polars::Datetime)
|
938
938
|
# )
|
939
939
|
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
940
940
|
# [
|
@@ -964,6 +964,7 @@ module Polars
|
|
964
964
|
closed: "right",
|
965
965
|
by: nil
|
966
966
|
)
|
967
|
+
index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
|
967
968
|
if offset.nil?
|
968
969
|
offset = "-#{period}"
|
969
970
|
end
|
@@ -973,7 +974,7 @@ module Polars
|
|
973
974
|
offset = Utils._timedelta_to_pl_duration(offset)
|
974
975
|
|
975
976
|
lgb = _ldf.groupby_rolling(
|
976
|
-
index_column, period, offset, closed, rbexprs_by
|
977
|
+
index_column._rbexpr, period, offset, closed, rbexprs_by
|
977
978
|
)
|
978
979
|
LazyGroupBy.new(lgb, self.class)
|
979
980
|
end
|
@@ -1077,8 +1077,11 @@ module Polars
|
|
1077
1077
|
# # │ null ┆ null ┆ 3.0 ┆ 3.0 │
|
1078
1078
|
# # │ null ┆ null ┆ null ┆ 99.9 │
|
1079
1079
|
# # └──────┴──────┴──────┴──────┘
|
1080
|
-
def coalesce(exprs)
|
1080
|
+
def coalesce(exprs, *more_exprs)
|
1081
1081
|
exprs = Utils.selection_to_rbexpr_list(exprs)
|
1082
|
+
if more_exprs.any?
|
1083
|
+
exprs.concat(Utils.selection_to_rbexpr_list(more_exprs))
|
1084
|
+
end
|
1082
1085
|
Utils.wrap_expr(_coalesce_exprs(exprs))
|
1083
1086
|
end
|
1084
1087
|
|
data/lib/polars/list_expr.rb
CHANGED
@@ -27,7 +27,7 @@ module Polars
|
|
27
27
|
# # │ 1 │
|
28
28
|
# # └─────┘
|
29
29
|
def lengths
|
30
|
-
Utils.wrap_expr(_rbexpr.
|
30
|
+
Utils.wrap_expr(_rbexpr.list_lengths)
|
31
31
|
end
|
32
32
|
|
33
33
|
# Sum all the lists in the array.
|
@@ -48,7 +48,7 @@ module Polars
|
|
48
48
|
# # │ 5 │
|
49
49
|
# # └────────┘
|
50
50
|
def sum
|
51
|
-
Utils.wrap_expr(_rbexpr.
|
51
|
+
Utils.wrap_expr(_rbexpr.list_sum)
|
52
52
|
end
|
53
53
|
|
54
54
|
# Compute the max value of the lists in the array.
|
@@ -69,7 +69,7 @@ module Polars
|
|
69
69
|
# # │ 3 │
|
70
70
|
# # └────────┘
|
71
71
|
def max
|
72
|
-
Utils.wrap_expr(_rbexpr.
|
72
|
+
Utils.wrap_expr(_rbexpr.list_max)
|
73
73
|
end
|
74
74
|
|
75
75
|
# Compute the min value of the lists in the array.
|
@@ -90,7 +90,7 @@ module Polars
|
|
90
90
|
# # │ 2 │
|
91
91
|
# # └────────┘
|
92
92
|
def min
|
93
|
-
Utils.wrap_expr(_rbexpr.
|
93
|
+
Utils.wrap_expr(_rbexpr.list_min)
|
94
94
|
end
|
95
95
|
|
96
96
|
# Compute the mean value of the lists in the array.
|
@@ -111,7 +111,7 @@ module Polars
|
|
111
111
|
# # │ 2.5 │
|
112
112
|
# # └────────┘
|
113
113
|
def mean
|
114
|
-
Utils.wrap_expr(_rbexpr.
|
114
|
+
Utils.wrap_expr(_rbexpr.list_mean)
|
115
115
|
end
|
116
116
|
|
117
117
|
# Sort the arrays in the list.
|
@@ -136,7 +136,7 @@ module Polars
|
|
136
136
|
# # │ [1, 2, 9] │
|
137
137
|
# # └───────────┘
|
138
138
|
def sort(reverse: false)
|
139
|
-
Utils.wrap_expr(_rbexpr.
|
139
|
+
Utils.wrap_expr(_rbexpr.list_sort(reverse))
|
140
140
|
end
|
141
141
|
|
142
142
|
# Reverse the arrays in the list.
|
@@ -161,7 +161,7 @@ module Polars
|
|
161
161
|
# # │ [2, 1, 9] │
|
162
162
|
# # └───────────┘
|
163
163
|
def reverse
|
164
|
-
Utils.wrap_expr(_rbexpr.
|
164
|
+
Utils.wrap_expr(_rbexpr.list_reverse)
|
165
165
|
end
|
166
166
|
|
167
167
|
# Get the unique/distinct values in the list.
|
@@ -184,8 +184,8 @@ module Polars
|
|
184
184
|
# # ╞═══════════╡
|
185
185
|
# # │ [1, 2] │
|
186
186
|
# # └───────────┘
|
187
|
-
def unique
|
188
|
-
Utils.wrap_expr(_rbexpr.
|
187
|
+
def unique(maintain_order: false)
|
188
|
+
Utils.wrap_expr(_rbexpr.list_unique(maintain_order))
|
189
189
|
end
|
190
190
|
|
191
191
|
# Concat the arrays in a Series dtype List in linear time.
|
@@ -255,7 +255,7 @@ module Polars
|
|
255
255
|
# # └──────┘
|
256
256
|
def get(index)
|
257
257
|
index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
|
258
|
-
Utils.wrap_expr(_rbexpr.
|
258
|
+
Utils.wrap_expr(_rbexpr.list_get(index))
|
259
259
|
end
|
260
260
|
|
261
261
|
# Get the value by index in the sublists.
|
@@ -265,6 +265,28 @@ module Polars
|
|
265
265
|
get(item)
|
266
266
|
end
|
267
267
|
|
268
|
+
# Take sublists by multiple indices.
|
269
|
+
#
|
270
|
+
# The indices may be defined in a single column, or by sublists in another
|
271
|
+
# column of dtype `List`.
|
272
|
+
#
|
273
|
+
# @param index [Object]
|
274
|
+
# Indices to return per sublist
|
275
|
+
# @param null_on_oob [Boolean]
|
276
|
+
# Behavior if an index is out of bounds:
|
277
|
+
# True -> set as null
|
278
|
+
# False -> raise an error
|
279
|
+
# Note that defaulting to raising an error is much cheaper
|
280
|
+
#
|
281
|
+
# @return [Expr]
|
282
|
+
def take(index, null_on_oob: false)
|
283
|
+
if index.is_a?(Array)
|
284
|
+
index = Series.new(index)
|
285
|
+
end
|
286
|
+
index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
|
287
|
+
Utils.wrap_expr(_rbexpr.list_take(index, null_on_oob))
|
288
|
+
end
|
289
|
+
|
268
290
|
# Get the first value of the sublists.
|
269
291
|
#
|
270
292
|
# @return [Expr]
|
@@ -331,7 +353,7 @@ module Polars
|
|
331
353
|
# # │ true │
|
332
354
|
# # └───────┘
|
333
355
|
def contains(item)
|
334
|
-
Utils.wrap_expr(_rbexpr.
|
356
|
+
Utils.wrap_expr(_rbexpr.list_contains(Utils.expr_to_lit_or_expr(item)._rbexpr))
|
335
357
|
end
|
336
358
|
|
337
359
|
# Join all string items in a sublist and place a separator between them.
|
@@ -357,7 +379,7 @@ module Polars
|
|
357
379
|
# # │ x y │
|
358
380
|
# # └───────┘
|
359
381
|
def join(separator)
|
360
|
-
Utils.wrap_expr(_rbexpr.
|
382
|
+
Utils.wrap_expr(_rbexpr.list_join(separator))
|
361
383
|
end
|
362
384
|
|
363
385
|
# Retrieve the index of the minimal value in every sublist.
|
@@ -382,7 +404,7 @@ module Polars
|
|
382
404
|
# # │ 1 │
|
383
405
|
# # └─────┘
|
384
406
|
def arg_min
|
385
|
-
Utils.wrap_expr(_rbexpr.
|
407
|
+
Utils.wrap_expr(_rbexpr.list_arg_min)
|
386
408
|
end
|
387
409
|
|
388
410
|
# Retrieve the index of the maximum value in every sublist.
|
@@ -407,7 +429,7 @@ module Polars
|
|
407
429
|
# # │ 0 │
|
408
430
|
# # └─────┘
|
409
431
|
def arg_max
|
410
|
-
Utils.wrap_expr(_rbexpr.
|
432
|
+
Utils.wrap_expr(_rbexpr.list_arg_max)
|
411
433
|
end
|
412
434
|
|
413
435
|
# Calculate the n-th discrete difference of every sublist.
|
@@ -430,7 +452,7 @@ module Polars
|
|
430
452
|
# # [null, -8, -1]
|
431
453
|
# # ]
|
432
454
|
def diff(n: 1, null_behavior: "ignore")
|
433
|
-
Utils.wrap_expr(_rbexpr.
|
455
|
+
Utils.wrap_expr(_rbexpr.list_diff(n, null_behavior))
|
434
456
|
end
|
435
457
|
|
436
458
|
# Shift values by the given period.
|
@@ -451,7 +473,7 @@ module Polars
|
|
451
473
|
# # [null, 10, 2]
|
452
474
|
# # ]
|
453
475
|
def shift(periods = 1)
|
454
|
-
Utils.wrap_expr(_rbexpr.
|
476
|
+
Utils.wrap_expr(_rbexpr.list_shift(periods))
|
455
477
|
end
|
456
478
|
|
457
479
|
# Slice every sublist.
|
@@ -477,7 +499,7 @@ module Polars
|
|
477
499
|
def slice(offset, length = nil)
|
478
500
|
offset = Utils.expr_to_lit_or_expr(offset, str_to_lit: false)._rbexpr
|
479
501
|
length = Utils.expr_to_lit_or_expr(length, str_to_lit: false)._rbexpr
|
480
|
-
Utils.wrap_expr(_rbexpr.
|
502
|
+
Utils.wrap_expr(_rbexpr.list_slice(offset, length))
|
481
503
|
end
|
482
504
|
|
483
505
|
# Slice the first `n` values of every sublist.
|
@@ -523,6 +545,33 @@ module Polars
|
|
523
545
|
slice(offset, n)
|
524
546
|
end
|
525
547
|
|
548
|
+
# Count how often the value produced by ``element`` occurs.
|
549
|
+
#
|
550
|
+
# @param element [Expr]
|
551
|
+
# An expression that produces a single value
|
552
|
+
#
|
553
|
+
# @return [Expr]
|
554
|
+
#
|
555
|
+
# @example
|
556
|
+
# df = Polars::DataFrame.new({"listcol" => [[0], [1], [1, 2, 3, 2], [1, 2, 1], [4, 4]]})
|
557
|
+
# df.select(Polars.col("listcol").arr.count_match(2).alias("number_of_twos"))
|
558
|
+
# # =>
|
559
|
+
# # shape: (5, 1)
|
560
|
+
# # ┌────────────────┐
|
561
|
+
# # │ number_of_twos │
|
562
|
+
# # │ --- │
|
563
|
+
# # │ u32 │
|
564
|
+
# # ╞════════════════╡
|
565
|
+
# # │ 0 │
|
566
|
+
# # │ 0 │
|
567
|
+
# # │ 2 │
|
568
|
+
# # │ 1 │
|
569
|
+
# # │ 0 │
|
570
|
+
# # └────────────────┘
|
571
|
+
def count_match(element)
|
572
|
+
Utils.wrap_expr(_rbexpr.list_count_match(Utils.expr_to_lit_or_expr(element)._rbexpr))
|
573
|
+
end
|
574
|
+
|
526
575
|
# Convert the series of type `List` to a series of type `Struct`.
|
527
576
|
#
|
528
577
|
# @param n_field_strategy ["first_non_null", "max_width"]
|
@@ -548,7 +597,7 @@ module Polars
|
|
548
597
|
# # └────────────┘
|
549
598
|
def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
|
550
599
|
raise Todo if name_generator
|
551
|
-
Utils.wrap_expr(_rbexpr.
|
600
|
+
Utils.wrap_expr(_rbexpr.list_to_struct(n_field_strategy, name_generator, 0))
|
552
601
|
end
|
553
602
|
|
554
603
|
# Run any polars expression against the lists' elements.
|
@@ -582,7 +631,7 @@ module Polars
|
|
582
631
|
# # │ 3 ┆ 2 ┆ [2.0, 1.0] │
|
583
632
|
# # └─────┴─────┴────────────┘
|
584
633
|
def eval(expr, parallel: false)
|
585
|
-
|
634
|
+
Utils.wrap_expr(_rbexpr.list_eval(expr._rbexpr, parallel))
|
586
635
|
end
|
587
636
|
end
|
588
637
|
end
|