polars-df 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +70 -9
- data/Cargo.toml +2 -0
- data/ext/polars/Cargo.toml +6 -1
- data/ext/polars/src/apply/dataframe.rs +292 -0
- data/ext/polars/src/apply/mod.rs +254 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +100 -5
- data/ext/polars/src/dataframe.rs +146 -1
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +72 -1
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lib.rs +165 -1
- data/ext/polars/src/series.rs +296 -0
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1457 -56
- data/lib/polars/dynamic_group_by.rb +49 -0
- data/lib/polars/expr.rb +258 -9
- data/lib/polars/functions.rb +192 -3
- data/lib/polars/group_by.rb +43 -3
- data/lib/polars/io.rb +19 -3
- data/lib/polars/lazy_frame.rb +792 -22
- data/lib/polars/lazy_functions.rb +561 -27
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +132 -10
- data/lib/polars/utils.rb +16 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -1
- metadata +9 -3
@@ -0,0 +1,49 @@
|
|
1
|
+
module Polars
|
2
|
+
# A dynamic grouper.
|
3
|
+
#
|
4
|
+
# This has an `.agg` method which allows you to run all polars expressions in a
|
5
|
+
# groupby context.
|
6
|
+
class DynamicGroupBy
|
7
|
+
def initialize(
|
8
|
+
df,
|
9
|
+
index_column,
|
10
|
+
every,
|
11
|
+
period,
|
12
|
+
offset,
|
13
|
+
truncate,
|
14
|
+
include_boundaries,
|
15
|
+
closed,
|
16
|
+
by
|
17
|
+
)
|
18
|
+
period = Utils._timedelta_to_pl_duration(period)
|
19
|
+
offset = Utils._timedelta_to_pl_duration(offset)
|
20
|
+
every = Utils._timedelta_to_pl_duration(every)
|
21
|
+
|
22
|
+
@df = df
|
23
|
+
@time_column = index_column
|
24
|
+
@every = every
|
25
|
+
@period = period
|
26
|
+
@offset = offset
|
27
|
+
@truncate = truncate
|
28
|
+
@include_boundaries = include_boundaries
|
29
|
+
@closed = closed
|
30
|
+
@by = by
|
31
|
+
end
|
32
|
+
|
33
|
+
def agg(aggs)
|
34
|
+
@df.lazy
|
35
|
+
.groupby_dynamic(
|
36
|
+
@time_column,
|
37
|
+
every: @every,
|
38
|
+
period: @period,
|
39
|
+
offset: @offset,
|
40
|
+
truncate: @truncate,
|
41
|
+
include_boundaries: @include_boundaries,
|
42
|
+
closed: @closed,
|
43
|
+
by: @by
|
44
|
+
)
|
45
|
+
.agg(aggs)
|
46
|
+
.collect(no_optimization: true, string_cache: false)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/polars/expr.rb
CHANGED
@@ -432,8 +432,34 @@ module Polars
|
|
432
432
|
wrap_expr(_rbexpr.suffix(suffix))
|
433
433
|
end
|
434
434
|
|
435
|
-
#
|
436
|
-
#
|
435
|
+
# Rename the output of an expression by mapping a function over the root name.
|
436
|
+
#
|
437
|
+
# @return [Expr]
|
438
|
+
#
|
439
|
+
# @example
|
440
|
+
# df = Polars::DataFrame.new(
|
441
|
+
# {
|
442
|
+
# "A" => [1, 2],
|
443
|
+
# "B" => [3, 4]
|
444
|
+
# }
|
445
|
+
# )
|
446
|
+
# df.select(
|
447
|
+
# Polars.all.reverse.map_alias { |colName| colName + "_reverse" }
|
448
|
+
# )
|
449
|
+
# # =>
|
450
|
+
# # shape: (2, 2)
|
451
|
+
# # ┌───────────┬───────────┐
|
452
|
+
# # │ A_reverse ┆ B_reverse │
|
453
|
+
# # │ --- ┆ --- │
|
454
|
+
# # │ i64 ┆ i64 │
|
455
|
+
# # ╞═══════════╪═══════════╡
|
456
|
+
# # │ 2 ┆ 4 │
|
457
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
458
|
+
# # │ 1 ┆ 3 │
|
459
|
+
# # └───────────┴───────────┘
|
460
|
+
def map_alias(&f)
|
461
|
+
Utils.wrap_expr(_rbexpr.map_alias(f))
|
462
|
+
end
|
437
463
|
|
438
464
|
# Negate a boolean expression.
|
439
465
|
#
|
@@ -2575,14 +2601,98 @@ module Polars
|
|
2575
2601
|
# # ╞══════╪════════╡
|
2576
2602
|
# # │ 1 ┆ 0 │
|
2577
2603
|
# # └──────┴────────┘
|
2578
|
-
# def map(return_dtype: nil, agg_list: false, &
|
2604
|
+
# def map(return_dtype: nil, agg_list: false, &f)
|
2579
2605
|
# if !return_dtype.nil?
|
2580
2606
|
# return_dtype = Utils.rb_type_to_dtype(return_dtype)
|
2581
2607
|
# end
|
2582
|
-
# wrap_expr(_rbexpr.map(return_dtype, agg_list
|
2608
|
+
# wrap_expr(_rbexpr.map(f, return_dtype, agg_list))
|
2583
2609
|
# end
|
2584
2610
|
|
2585
|
-
#
|
2611
|
+
# Apply a custom/user-defined function (UDF) in a GroupBy or Projection context.
|
2612
|
+
#
|
2613
|
+
# Depending on the context it has the following behavior:
|
2614
|
+
#
|
2615
|
+
# * Selection
|
2616
|
+
# Expects `f` to be a block that takes a single value and returns a single value.
|
2617
|
+
# Applies a Ruby function over each individual value in the column.
|
2618
|
+
# * GroupBy
|
2619
|
+
# Expects `f` to be a block that takes a Series and returns a Series.
|
2620
|
+
# Applies a Ruby function over each group.
|
2621
|
+
#
|
2622
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
2623
|
+
# slower and more memory intensive than implementing the same logic using
|
2624
|
+
# the native expression API because:
|
2625
|
+
#
|
2626
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
2627
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
2628
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
2629
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
2630
|
+
#
|
2631
|
+
# Wherever possible you should strongly prefer the native expression API
|
2632
|
+
# to achieve the best performance.
|
2633
|
+
#
|
2634
|
+
# @param return_dtype [Symbol]
|
2635
|
+
# Dtype of the output Series.
|
2636
|
+
# If not set, polars will assume that
|
2637
|
+
# the dtype remains unchanged.
|
2638
|
+
#
|
2639
|
+
# @return [Expr]
|
2640
|
+
#
|
2641
|
+
# @example
|
2642
|
+
# df = Polars::DataFrame.new(
|
2643
|
+
# {
|
2644
|
+
# "a" => [1, 2, 3, 1],
|
2645
|
+
# "b" => ["a", "b", "c", "c"]
|
2646
|
+
# }
|
2647
|
+
# )
|
2648
|
+
#
|
2649
|
+
# @example In a selection context, the function is applied by row.
|
2650
|
+
# df.with_column(
|
2651
|
+
# Polars.col("a").apply { |x| x * 2 }.alias("a_times_2")
|
2652
|
+
# )
|
2653
|
+
# # =>
|
2654
|
+
# # shape: (4, 3)
|
2655
|
+
# # ┌─────┬─────┬───────────┐
|
2656
|
+
# # │ a ┆ b ┆ a_times_2 │
|
2657
|
+
# # │ --- ┆ --- ┆ --- │
|
2658
|
+
# # │ i64 ┆ str ┆ i64 │
|
2659
|
+
# # ╞═════╪═════╪═══════════╡
|
2660
|
+
# # │ 1 ┆ a ┆ 2 │
|
2661
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2662
|
+
# # │ 2 ┆ b ┆ 4 │
|
2663
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2664
|
+
# # │ 3 ┆ c ┆ 6 │
|
2665
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2666
|
+
# # │ 1 ┆ c ┆ 2 │
|
2667
|
+
# # └─────┴─────┴───────────┘
|
2668
|
+
#
|
2669
|
+
# @example In a GroupBy context the function is applied by group:
|
2670
|
+
# df.lazy
|
2671
|
+
# .groupby("b", maintain_order: true)
|
2672
|
+
# .agg(
|
2673
|
+
# [
|
2674
|
+
# Polars.col("a").apply { |x| x.sum }
|
2675
|
+
# ]
|
2676
|
+
# )
|
2677
|
+
# .collect
|
2678
|
+
# # =>
|
2679
|
+
# # shape: (3, 2)
|
2680
|
+
# # ┌─────┬─────┐
|
2681
|
+
# # │ b ┆ a │
|
2682
|
+
# # │ --- ┆ --- │
|
2683
|
+
# # │ str ┆ i64 │
|
2684
|
+
# # ╞═════╪═════╡
|
2685
|
+
# # │ a ┆ 1 │
|
2686
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2687
|
+
# # │ b ┆ 2 │
|
2688
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2689
|
+
# # │ c ┆ 4 │
|
2690
|
+
# # └─────┴─────┘
|
2691
|
+
# def apply(return_dtype: nil, &f)
|
2692
|
+
# wrap_f = lambda do |x|
|
2693
|
+
# x.apply(return_dtype: return_dtype, &f)
|
2694
|
+
# end
|
2695
|
+
# map(agg_list: true, return_dtype: return_dtype, &wrap_f)
|
2586
2696
|
# end
|
2587
2697
|
|
2588
2698
|
# Explode a list or utf8 Series. This means that every item is expanded to a new
|
@@ -2898,8 +3008,49 @@ module Polars
|
|
2898
3008
|
end
|
2899
3009
|
end
|
2900
3010
|
|
2901
|
-
#
|
2902
|
-
#
|
3011
|
+
# Hash the elements in the selection.
|
3012
|
+
#
|
3013
|
+
# The hash value is of type `:u64`.
|
3014
|
+
#
|
3015
|
+
# @param seed [Integer]
|
3016
|
+
# Random seed parameter. Defaults to 0.
|
3017
|
+
# @param seed_1 [Integer]
|
3018
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3019
|
+
# @param seed_2 [Integer]
|
3020
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3021
|
+
# @param seed_3 [Integer]
|
3022
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3023
|
+
#
|
3024
|
+
# @return [Expr]
|
3025
|
+
#
|
3026
|
+
# @example
|
3027
|
+
# df = Polars::DataFrame.new(
|
3028
|
+
# {
|
3029
|
+
# "a" => [1, 2, nil],
|
3030
|
+
# "b" => ["x", nil, "z"]
|
3031
|
+
# }
|
3032
|
+
# )
|
3033
|
+
# df.with_column(Polars.all._hash(10, 20, 30, 40))
|
3034
|
+
# # =>
|
3035
|
+
# # shape: (3, 2)
|
3036
|
+
# # ┌──────────────────────┬──────────────────────┐
|
3037
|
+
# # │ a ┆ b │
|
3038
|
+
# # │ --- ┆ --- │
|
3039
|
+
# # │ u64 ┆ u64 │
|
3040
|
+
# # ╞══════════════════════╪══════════════════════╡
|
3041
|
+
# # │ 4629889412789719550 ┆ 6959506404929392568 │
|
3042
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
3043
|
+
# # │ 16386608652769605760 ┆ 11638928888656214026 │
|
3044
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
3045
|
+
# # │ 11638928888656214026 ┆ 11040941213715918520 │
|
3046
|
+
# # └──────────────────────┴──────────────────────┘
|
3047
|
+
def _hash(seed = 0, seed_1 = nil, seed_2 = nil, seed_3 = nil)
|
3048
|
+
k0 = seed
|
3049
|
+
k1 = seed_1.nil? ? seed : seed_1
|
3050
|
+
k2 = seed_2.nil? ? seed : seed_2
|
3051
|
+
k3 = seed_3.nil? ? seed : seed_3
|
3052
|
+
wrap_expr(_rbexpr._hash(k0, k1, k2, k3))
|
3053
|
+
end
|
2903
3054
|
|
2904
3055
|
# Reinterpret the underlying bits as a signed/unsigned integer.
|
2905
3056
|
#
|
@@ -2937,7 +3088,40 @@ module Polars
|
|
2937
3088
|
wrap_expr(_rbexpr.reinterpret(signed))
|
2938
3089
|
end
|
2939
3090
|
|
2940
|
-
#
|
3091
|
+
# Print the value that this expression evaluates to and pass on the value.
|
3092
|
+
#
|
3093
|
+
# @return [Expr]
|
3094
|
+
#
|
3095
|
+
# @example
|
3096
|
+
# df = Polars::DataFrame.new({"foo" => [1, 1, 2]})
|
3097
|
+
# df.select(Polars.col("foo").cumsum._inspect("value is: %s").alias("bar"))
|
3098
|
+
# # =>
|
3099
|
+
# # value is: shape: (3,)
|
3100
|
+
# # Series: 'foo' [i64]
|
3101
|
+
# # [
|
3102
|
+
# # 1
|
3103
|
+
# # 2
|
3104
|
+
# # 4
|
3105
|
+
# # ]
|
3106
|
+
# # shape: (3, 1)
|
3107
|
+
# # ┌─────┐
|
3108
|
+
# # │ bar │
|
3109
|
+
# # │ --- │
|
3110
|
+
# # │ i64 │
|
3111
|
+
# # ╞═════╡
|
3112
|
+
# # │ 1 │
|
3113
|
+
# # ├╌╌╌╌╌┤
|
3114
|
+
# # │ 2 │
|
3115
|
+
# # ├╌╌╌╌╌┤
|
3116
|
+
# # │ 4 │
|
3117
|
+
# # └─────┘
|
3118
|
+
# def _inspect(fmt = "%s")
|
3119
|
+
# inspect = lambda do |s|
|
3120
|
+
# puts(fmt % [s])
|
3121
|
+
# s
|
3122
|
+
# end
|
3123
|
+
|
3124
|
+
# map(return_dtype: nil, agg_list: true, &inspect)
|
2941
3125
|
# end
|
2942
3126
|
|
2943
3127
|
# Fill nulls with linear interpolation over missing values.
|
@@ -3721,7 +3905,72 @@ module Polars
|
|
3721
3905
|
)
|
3722
3906
|
end
|
3723
3907
|
|
3724
|
-
#
|
3908
|
+
# Apply a custom rolling window function.
|
3909
|
+
#
|
3910
|
+
# Prefer the specific rolling window functions over this one, as they are faster.
|
3911
|
+
#
|
3912
|
+
# Prefer:
|
3913
|
+
# * rolling_min
|
3914
|
+
# * rolling_max
|
3915
|
+
# * rolling_mean
|
3916
|
+
# * rolling_sum
|
3917
|
+
#
|
3918
|
+
# @param window_size [Integer]
|
3919
|
+
# The length of the window.
|
3920
|
+
# @param weights [Object]
|
3921
|
+
# An optional slice with the same length as the window that will be multiplied
|
3922
|
+
# elementwise with the values in the window.
|
3923
|
+
# @param min_periods [Integer]
|
3924
|
+
# The number of values in the window that should be non-null before computing
|
3925
|
+
# a result. If nil, it will be set equal to window size.
|
3926
|
+
# @param center [Boolean]
|
3927
|
+
# Set the labels at the center of the window.
|
3928
|
+
#
|
3929
|
+
# @return [Expr]
|
3930
|
+
#
|
3931
|
+
# @example
|
3932
|
+
# df = Polars::DataFrame.new(
|
3933
|
+
# {
|
3934
|
+
# "A" => [1.0, 2.0, 9.0, 2.0, 13.0]
|
3935
|
+
# }
|
3936
|
+
# )
|
3937
|
+
# df.select(
|
3938
|
+
# [
|
3939
|
+
# Polars.col("A").rolling_apply(window_size: 3) { |s| s.std }
|
3940
|
+
# ]
|
3941
|
+
# )
|
3942
|
+
# # =>
|
3943
|
+
# # shape: (5, 1)
|
3944
|
+
# # ┌──────────┐
|
3945
|
+
# # │ A │
|
3946
|
+
# # │ --- │
|
3947
|
+
# # │ f64 │
|
3948
|
+
# # ╞══════════╡
|
3949
|
+
# # │ null │
|
3950
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3951
|
+
# # │ null │
|
3952
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3953
|
+
# # │ 4.358899 │
|
3954
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3955
|
+
# # │ 4.041452 │
|
3956
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3957
|
+
# # │ 5.567764 │
|
3958
|
+
# # └──────────┘
|
3959
|
+
# def rolling_apply(
|
3960
|
+
# window_size:,
|
3961
|
+
# weights: nil,
|
3962
|
+
# min_periods: nil,
|
3963
|
+
# center: false,
|
3964
|
+
# &function
|
3965
|
+
# )
|
3966
|
+
# if min_periods.nil?
|
3967
|
+
# min_periods = window_size
|
3968
|
+
# end
|
3969
|
+
# wrap_expr(
|
3970
|
+
# _rbexpr.rolling_apply(
|
3971
|
+
# function, window_size, weights, min_periods, center
|
3972
|
+
# )
|
3973
|
+
# )
|
3725
3974
|
# end
|
3726
3975
|
|
3727
3976
|
# Compute a rolling skew.
|
data/lib/polars/functions.rb
CHANGED
@@ -199,12 +199,201 @@ module Polars
|
|
199
199
|
dt_range
|
200
200
|
end
|
201
201
|
|
202
|
-
#
|
203
|
-
#
|
202
|
+
# Bin values into discrete values.
|
203
|
+
#
|
204
|
+
# @param s [Series]
|
205
|
+
# Series to bin.
|
206
|
+
# @param bins [Array]
|
207
|
+
# Bins to create.
|
208
|
+
# @param labels [Array]
|
209
|
+
# Labels to assign to the bins. If given, the length of labels must be
|
210
|
+
# `bins.length + 1`.
|
211
|
+
# @param break_point_label [String]
|
212
|
+
# Name given to the breakpoint column.
|
213
|
+
# @param category_label [String]
|
214
|
+
# Name given to the category column.
|
215
|
+
#
|
216
|
+
# @return [DataFrame]
|
217
|
+
#
|
218
|
+
# @note
|
219
|
+
# This functionality is experimental and may change without it being considered a
|
220
|
+
# breaking change.
|
221
|
+
#
|
222
|
+
# @example
|
223
|
+
# a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
|
224
|
+
# Polars.cut(a, [-1, 1])
|
225
|
+
# # =>
|
226
|
+
# # shape: (12, 3)
|
227
|
+
# # ┌──────┬─────────────┬──────────────┐
|
228
|
+
# # │ a ┆ break_point ┆ category │
|
229
|
+
# # │ --- ┆ --- ┆ --- │
|
230
|
+
# # │ f64 ┆ f64 ┆ cat │
|
231
|
+
# # ╞══════╪═════════════╪══════════════╡
|
232
|
+
# # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
233
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
234
|
+
# # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
235
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
236
|
+
# # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
237
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
238
|
+
# # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
239
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
240
|
+
# # │ ... ┆ ... ┆ ... │
|
241
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
242
|
+
# # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
|
243
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
244
|
+
# # │ 1.5 ┆ inf ┆ (1.0, inf] │
|
245
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
246
|
+
# # │ 2.0 ┆ inf ┆ (1.0, inf] │
|
247
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
248
|
+
# # │ 2.5 ┆ inf ┆ (1.0, inf] │
|
249
|
+
# # └──────┴─────────────┴──────────────┘
|
250
|
+
# def cut(
|
251
|
+
# s,
|
252
|
+
# bins,
|
253
|
+
# labels: nil,
|
254
|
+
# break_point_label: "break_point",
|
255
|
+
# category_label: "category"
|
256
|
+
# )
|
257
|
+
# var_nm = s.name
|
204
258
|
|
205
|
-
#
|
259
|
+
# cuts_df = DataFrame.new(
|
260
|
+
# [
|
261
|
+
# Series.new(
|
262
|
+
# break_point_label, bins, dtype: :f64
|
263
|
+
# ).extend_constant(Float::INFINITY, 1)
|
264
|
+
# ]
|
265
|
+
# )
|
266
|
+
|
267
|
+
# if labels
|
268
|
+
# if labels.length != bins.length + 1
|
269
|
+
# raise ArgumentError, "expected more labels"
|
270
|
+
# end
|
271
|
+
# cuts_df = cuts_df.with_column(Series.new(category_label, labels))
|
272
|
+
# else
|
273
|
+
# cuts_df = cuts_df.with_column(
|
274
|
+
# Polars.format(
|
275
|
+
# "({}, {}]",
|
276
|
+
# Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
|
277
|
+
# Polars.col(break_point_label)
|
278
|
+
# ).alias(category_label)
|
279
|
+
# )
|
280
|
+
# end
|
281
|
+
|
282
|
+
# cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
|
283
|
+
|
284
|
+
# s.cast(:f64)
|
285
|
+
# .sort
|
286
|
+
# .to_frame
|
287
|
+
# .join_asof(
|
288
|
+
# cuts_df,
|
289
|
+
# left_on: var_nm,
|
290
|
+
# right_on: break_point_label,
|
291
|
+
# strategy: "forward"
|
292
|
+
# )
|
206
293
|
# end
|
207
294
|
|
295
|
+
# Align a sequence of frames using the unique values from one or more columns as a key.
|
296
|
+
#
|
297
|
+
# Frames that do not contain the given key values have rows injected (with nulls
|
298
|
+
# filling the non-key columns), and each resulting frame is sorted by the key.
|
299
|
+
#
|
300
|
+
# The original column order of input frames is not changed unless `select` is
|
301
|
+
# specified (in which case the final column order is determined from that).
|
302
|
+
#
|
303
|
+
# Note that this does not result in a joined frame - you receive the same number
|
304
|
+
# of frames back that you passed in, but each is now aligned by key and has
|
305
|
+
# the same number of rows.
|
306
|
+
#
|
307
|
+
# @param frames [Array]
|
308
|
+
# Sequence of DataFrames or LazyFrames.
|
309
|
+
# @param on [Object]
|
310
|
+
# One or more columns whose unique values will be used to align the frames.
|
311
|
+
# @param select [Object]
|
312
|
+
# Optional post-alignment column select to constrain and/or order
|
313
|
+
# the columns returned from the newly aligned frames.
|
314
|
+
# @param reverse [Object]
|
315
|
+
# Sort the alignment column values in descending order; can be a single
|
316
|
+
# boolean or a list of booleans associated with each column in `on`.
|
317
|
+
#
|
318
|
+
# @return [Object]
|
319
|
+
#
|
320
|
+
# @example
|
321
|
+
# df1 = Polars::DataFrame.new(
|
322
|
+
# {
|
323
|
+
# "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
|
324
|
+
# "x" => [3.5, 4.0, 1.0],
|
325
|
+
# "y" => [10.0, 2.5, 1.5]
|
326
|
+
# }
|
327
|
+
# )
|
328
|
+
# df2 = Polars::DataFrame.new(
|
329
|
+
# {
|
330
|
+
# "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
|
331
|
+
# "x" => [8.0, 1.0, 3.5],
|
332
|
+
# "y" => [1.5, 12.0, 5.0]
|
333
|
+
# }
|
334
|
+
# )
|
335
|
+
# df3 = Polars::DataFrame.new(
|
336
|
+
# {
|
337
|
+
# "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
|
338
|
+
# "x" => [2.0, 5.0],
|
339
|
+
# "y" => [2.5, 2.0]
|
340
|
+
# }
|
341
|
+
# )
|
342
|
+
# af1, af2, af3 = Polars.align_frames(
|
343
|
+
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
344
|
+
# )
|
345
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
|
346
|
+
# # =>
|
347
|
+
# # shape: (3, 1)
|
348
|
+
# # ┌───────┐
|
349
|
+
# # │ dot │
|
350
|
+
# # │ --- │
|
351
|
+
# # │ f64 │
|
352
|
+
# # ╞═══════╡
|
353
|
+
# # │ 0.0 │
|
354
|
+
# # ├╌╌╌╌╌╌╌┤
|
355
|
+
# # │ 167.5 │
|
356
|
+
# # ├╌╌╌╌╌╌╌┤
|
357
|
+
# # │ 47.0 │
|
358
|
+
# # └───────┘
|
359
|
+
def align_frames(
|
360
|
+
*frames,
|
361
|
+
on:,
|
362
|
+
select: nil,
|
363
|
+
reverse: false
|
364
|
+
)
|
365
|
+
if frames.empty?
|
366
|
+
return []
|
367
|
+
elsif frames.map(&:class).uniq.length != 1
|
368
|
+
raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
|
369
|
+
end
|
370
|
+
|
371
|
+
# establish the superset of all "on" column values, sort, and cache
|
372
|
+
eager = frames[0].is_a?(DataFrame)
|
373
|
+
alignment_frame = (
|
374
|
+
concat(frames.map { |df| df.lazy.select(on) })
|
375
|
+
.unique(maintain_order: false)
|
376
|
+
.sort(on, reverse: reverse)
|
377
|
+
)
|
378
|
+
alignment_frame = (
|
379
|
+
eager ? alignment_frame.collect.lazy : alignment_frame.cache
|
380
|
+
)
|
381
|
+
# finally, align all frames
|
382
|
+
aligned_frames =
|
383
|
+
frames.map do |df|
|
384
|
+
alignment_frame.join(
|
385
|
+
df.lazy,
|
386
|
+
on: alignment_frame.columns,
|
387
|
+
how: "left"
|
388
|
+
).select(df.columns)
|
389
|
+
end
|
390
|
+
if !select.nil?
|
391
|
+
aligned_frames = aligned_frames.map { |df| df.select(select) }
|
392
|
+
end
|
393
|
+
|
394
|
+
eager ? aligned_frames.map(&:collect) : aligned_frames
|
395
|
+
end
|
396
|
+
|
208
397
|
# Return a new Series of given length and type, filled with ones.
|
209
398
|
#
|
210
399
|
# @param n [Integer]
|
data/lib/polars/group_by.rb
CHANGED
@@ -12,7 +12,48 @@ module Polars
|
|
12
12
|
self.maintain_order = maintain_order
|
13
13
|
end
|
14
14
|
|
15
|
-
#
|
15
|
+
# Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
|
16
|
+
#
|
17
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
18
|
+
# slower and more memory intensive than implementing the same logic using
|
19
|
+
# the native expression API because:
|
20
|
+
|
21
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
22
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
23
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
24
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
25
|
+
#
|
26
|
+
# Wherever possible you should strongly prefer the native expression API
|
27
|
+
# to achieve the best performance.
|
28
|
+
#
|
29
|
+
# @return [DataFrame]
|
30
|
+
#
|
31
|
+
# @example
|
32
|
+
# df = Polars::DataFrame.new(
|
33
|
+
# {
|
34
|
+
# "id" => [0, 1, 2, 3, 4],
|
35
|
+
# "color" => ["red", "green", "green", "red", "red"],
|
36
|
+
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
37
|
+
# }
|
38
|
+
# )
|
39
|
+
# df.groupby("color").apply { |group_df| group_df.sample(2) }
|
40
|
+
# # =>
|
41
|
+
# # shape: (4, 3)
|
42
|
+
# # ┌─────┬───────┬──────────┐
|
43
|
+
# # │ id ┆ color ┆ shape │
|
44
|
+
# # │ --- ┆ --- ┆ --- │
|
45
|
+
# # │ i64 ┆ str ┆ str │
|
46
|
+
# # ╞═════╪═══════╪══════════╡
|
47
|
+
# # │ 1 ┆ green ┆ triangle │
|
48
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
49
|
+
# # │ 2 ┆ green ┆ square │
|
50
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
51
|
+
# # │ 4 ┆ red ┆ square │
|
52
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
53
|
+
# # │ 3 ┆ red ┆ triangle │
|
54
|
+
# # └─────┴───────┴──────────┘
|
55
|
+
# def apply(&f)
|
56
|
+
# _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
|
16
57
|
# end
|
17
58
|
|
18
59
|
# Use multiple aggregations on columns.
|
@@ -182,8 +223,7 @@ module Polars
|
|
182
223
|
_dataframe_class._from_rbdf(df._df)
|
183
224
|
end
|
184
225
|
|
185
|
-
#
|
186
|
-
# end
|
226
|
+
# pivot is deprecated
|
187
227
|
|
188
228
|
# Aggregate the first values in the group.
|
189
229
|
#
|
data/lib/polars/io.rb
CHANGED
@@ -59,7 +59,7 @@ module Polars
|
|
59
59
|
# Lossy means that invalid utf8 values are replaced with `�`
|
60
60
|
# characters. When using other encodings than `utf8` or
|
61
61
|
# `utf8-lossy`, the input is first decoded in memory with
|
62
|
-
#
|
62
|
+
# Ruby.
|
63
63
|
# @param low_memory [Boolean]
|
64
64
|
# Reduce memory usage at expense of performance.
|
65
65
|
# @param rechunk [Boolean]
|
@@ -451,8 +451,24 @@ module Polars
|
|
451
451
|
)
|
452
452
|
end
|
453
453
|
|
454
|
-
#
|
455
|
-
#
|
454
|
+
# Read into a DataFrame from Apache Avro format.
|
455
|
+
#
|
456
|
+
# @param file [Object]
|
457
|
+
# Path to a file or a file-like object.
|
458
|
+
# @param columns [Object]
|
459
|
+
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
460
|
+
# of column names.
|
461
|
+
# @param n_rows [Integer]
|
462
|
+
# Stop reading from the Apache Avro file after reading `n_rows` rows.
|
463
|
+
#
|
464
|
+
# @return [DataFrame]
|
465
|
+
def read_avro(file, columns: nil, n_rows: nil)
|
466
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
467
|
+
file = Utils.format_path(file)
|
468
|
+
end
|
469
|
+
|
470
|
+
DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
|
471
|
+
end
|
456
472
|
|
457
473
|
# Read into a DataFrame from Arrow IPC (Feather v2) file.
|
458
474
|
#
|