polars-df 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +70 -9
- data/Cargo.toml +2 -0
- data/ext/polars/Cargo.toml +6 -1
- data/ext/polars/src/apply/dataframe.rs +292 -0
- data/ext/polars/src/apply/mod.rs +254 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +100 -5
- data/ext/polars/src/dataframe.rs +146 -1
- data/ext/polars/src/error.rs +8 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +72 -1
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lib.rs +165 -1
- data/ext/polars/src/series.rs +296 -0
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1457 -56
- data/lib/polars/dynamic_group_by.rb +49 -0
- data/lib/polars/expr.rb +258 -9
- data/lib/polars/functions.rb +192 -3
- data/lib/polars/group_by.rb +43 -3
- data/lib/polars/io.rb +19 -3
- data/lib/polars/lazy_frame.rb +792 -22
- data/lib/polars/lazy_functions.rb +561 -27
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +132 -10
- data/lib/polars/utils.rb +16 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -1
- metadata +9 -3
@@ -0,0 +1,49 @@
|
|
1
|
+
module Polars
|
2
|
+
# A dynamic grouper.
|
3
|
+
#
|
4
|
+
# This has an `.agg` method which allows you to run all polars expressions in a
|
5
|
+
# groupby context.
|
6
|
+
class DynamicGroupBy
|
7
|
+
def initialize(
|
8
|
+
df,
|
9
|
+
index_column,
|
10
|
+
every,
|
11
|
+
period,
|
12
|
+
offset,
|
13
|
+
truncate,
|
14
|
+
include_boundaries,
|
15
|
+
closed,
|
16
|
+
by
|
17
|
+
)
|
18
|
+
period = Utils._timedelta_to_pl_duration(period)
|
19
|
+
offset = Utils._timedelta_to_pl_duration(offset)
|
20
|
+
every = Utils._timedelta_to_pl_duration(every)
|
21
|
+
|
22
|
+
@df = df
|
23
|
+
@time_column = index_column
|
24
|
+
@every = every
|
25
|
+
@period = period
|
26
|
+
@offset = offset
|
27
|
+
@truncate = truncate
|
28
|
+
@include_boundaries = include_boundaries
|
29
|
+
@closed = closed
|
30
|
+
@by = by
|
31
|
+
end
|
32
|
+
|
33
|
+
def agg(aggs)
|
34
|
+
@df.lazy
|
35
|
+
.groupby_dynamic(
|
36
|
+
@time_column,
|
37
|
+
every: @every,
|
38
|
+
period: @period,
|
39
|
+
offset: @offset,
|
40
|
+
truncate: @truncate,
|
41
|
+
include_boundaries: @include_boundaries,
|
42
|
+
closed: @closed,
|
43
|
+
by: @by
|
44
|
+
)
|
45
|
+
.agg(aggs)
|
46
|
+
.collect(no_optimization: true, string_cache: false)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/polars/expr.rb
CHANGED
@@ -432,8 +432,34 @@ module Polars
|
|
432
432
|
wrap_expr(_rbexpr.suffix(suffix))
|
433
433
|
end
|
434
434
|
|
435
|
-
#
|
436
|
-
#
|
435
|
+
# Rename the output of an expression by mapping a function over the root name.
|
436
|
+
#
|
437
|
+
# @return [Expr]
|
438
|
+
#
|
439
|
+
# @example
|
440
|
+
# df = Polars::DataFrame.new(
|
441
|
+
# {
|
442
|
+
# "A" => [1, 2],
|
443
|
+
# "B" => [3, 4]
|
444
|
+
# }
|
445
|
+
# )
|
446
|
+
# df.select(
|
447
|
+
# Polars.all.reverse.map_alias { |colName| colName + "_reverse" }
|
448
|
+
# )
|
449
|
+
# # =>
|
450
|
+
# # shape: (2, 2)
|
451
|
+
# # ┌───────────┬───────────┐
|
452
|
+
# # │ A_reverse ┆ B_reverse │
|
453
|
+
# # │ --- ┆ --- │
|
454
|
+
# # │ i64 ┆ i64 │
|
455
|
+
# # ╞═══════════╪═══════════╡
|
456
|
+
# # │ 2 ┆ 4 │
|
457
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
458
|
+
# # │ 1 ┆ 3 │
|
459
|
+
# # └───────────┴───────────┘
|
460
|
+
def map_alias(&f)
|
461
|
+
Utils.wrap_expr(_rbexpr.map_alias(f))
|
462
|
+
end
|
437
463
|
|
438
464
|
# Negate a boolean expression.
|
439
465
|
#
|
@@ -2575,14 +2601,98 @@ module Polars
|
|
2575
2601
|
# # ╞══════╪════════╡
|
2576
2602
|
# # │ 1 ┆ 0 │
|
2577
2603
|
# # └──────┴────────┘
|
2578
|
-
# def map(return_dtype: nil, agg_list: false, &
|
2604
|
+
# def map(return_dtype: nil, agg_list: false, &f)
|
2579
2605
|
# if !return_dtype.nil?
|
2580
2606
|
# return_dtype = Utils.rb_type_to_dtype(return_dtype)
|
2581
2607
|
# end
|
2582
|
-
# wrap_expr(_rbexpr.map(return_dtype, agg_list
|
2608
|
+
# wrap_expr(_rbexpr.map(f, return_dtype, agg_list))
|
2583
2609
|
# end
|
2584
2610
|
|
2585
|
-
#
|
2611
|
+
# Apply a custom/user-defined function (UDF) in a GroupBy or Projection context.
|
2612
|
+
#
|
2613
|
+
# Depending on the context it has the following behavior:
|
2614
|
+
#
|
2615
|
+
# * Selection
|
2616
|
+
# Expects `f` to be of type Callable[[Any], Any].
|
2617
|
+
# Applies a Ruby function over each individual value in the column.
|
2618
|
+
# * GroupBy
|
2619
|
+
# Expects `f` to be of type Callable[[Series], Series].
|
2620
|
+
# Applies a Ruby function over each group.
|
2621
|
+
#
|
2622
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
2623
|
+
# slower and more memory intensive than implementing the same logic using
|
2624
|
+
# the native expression API because:
|
2625
|
+
#
|
2626
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
2627
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
2628
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
2629
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
2630
|
+
#
|
2631
|
+
# Wherever possible you should strongly prefer the native expression API
|
2632
|
+
# to achieve the best performance.
|
2633
|
+
#
|
2634
|
+
# @param return_dtype [Symbol]
|
2635
|
+
# Dtype of the output Series.
|
2636
|
+
# If not set, polars will assume that
|
2637
|
+
# the dtype remains unchanged.
|
2638
|
+
#
|
2639
|
+
# @return [Expr]
|
2640
|
+
#
|
2641
|
+
# @example
|
2642
|
+
# df = Polars::DataFrame.new(
|
2643
|
+
# {
|
2644
|
+
# "a" => [1, 2, 3, 1],
|
2645
|
+
# "b" => ["a", "b", "c", "c"]
|
2646
|
+
# }
|
2647
|
+
# )
|
2648
|
+
#
|
2649
|
+
# @example In a selection context, the function is applied by row.
|
2650
|
+
# df.with_column(
|
2651
|
+
# Polars.col("a").apply { |x| x * 2 }.alias("a_times_2")
|
2652
|
+
# )
|
2653
|
+
# # =>
|
2654
|
+
# # shape: (4, 3)
|
2655
|
+
# # ┌─────┬─────┬───────────┐
|
2656
|
+
# # │ a ┆ b ┆ a_times_2 │
|
2657
|
+
# # │ --- ┆ --- ┆ --- │
|
2658
|
+
# # │ i64 ┆ str ┆ i64 │
|
2659
|
+
# # ╞═════╪═════╪═══════════╡
|
2660
|
+
# # │ 1 ┆ a ┆ 2 │
|
2661
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2662
|
+
# # │ 2 ┆ b ┆ 4 │
|
2663
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2664
|
+
# # │ 3 ┆ c ┆ 6 │
|
2665
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2666
|
+
# # │ 1 ┆ c ┆ 2 │
|
2667
|
+
# # └─────┴─────┴───────────┘
|
2668
|
+
#
|
2669
|
+
# @example In a GroupBy context the function is applied by group:
|
2670
|
+
# df.lazy
|
2671
|
+
# .groupby("b", maintain_order: true)
|
2672
|
+
# .agg(
|
2673
|
+
# [
|
2674
|
+
# Polars.col("a").apply { |x| x.sum }
|
2675
|
+
# ]
|
2676
|
+
# )
|
2677
|
+
# .collect
|
2678
|
+
# # =>
|
2679
|
+
# # shape: (3, 2)
|
2680
|
+
# # ┌─────┬─────┐
|
2681
|
+
# # │ b ┆ a │
|
2682
|
+
# # │ --- ┆ --- │
|
2683
|
+
# # │ str ┆ i64 │
|
2684
|
+
# # ╞═════╪═════╡
|
2685
|
+
# # │ a ┆ 1 │
|
2686
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2687
|
+
# # │ b ┆ 2 │
|
2688
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2689
|
+
# # │ c ┆ 4 │
|
2690
|
+
# # └─────┴─────┘
|
2691
|
+
# def apply(return_dtype: nil, &f)
|
2692
|
+
# wrap_f = lambda do |x|
|
2693
|
+
# x.apply(return_dtype: return_dtype, &f)
|
2694
|
+
# end
|
2695
|
+
# map(agg_list: true, return_dtype: return_dtype, &wrap_f)
|
2586
2696
|
# end
|
2587
2697
|
|
2588
2698
|
# Explode a list or utf8 Series. This means that every item is expanded to a new
|
@@ -2898,8 +3008,49 @@ module Polars
|
|
2898
3008
|
end
|
2899
3009
|
end
|
2900
3010
|
|
2901
|
-
#
|
2902
|
-
#
|
3011
|
+
# Hash the elements in the selection.
|
3012
|
+
#
|
3013
|
+
# The hash value is of type `:u64`.
|
3014
|
+
#
|
3015
|
+
# @param seed [Integer]
|
3016
|
+
# Random seed parameter. Defaults to 0.
|
3017
|
+
# @param seed_1 [Integer]
|
3018
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3019
|
+
# @param seed_2 [Integer]
|
3020
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3021
|
+
# @param seed_3 [Integer]
|
3022
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
3023
|
+
#
|
3024
|
+
# @return [Expr]
|
3025
|
+
#
|
3026
|
+
# @example
|
3027
|
+
# df = Polars::DataFrame.new(
|
3028
|
+
# {
|
3029
|
+
# "a" => [1, 2, nil],
|
3030
|
+
# "b" => ["x", nil, "z"]
|
3031
|
+
# }
|
3032
|
+
# )
|
3033
|
+
# df.with_column(Polars.all._hash(10, 20, 30, 40))
|
3034
|
+
# # =>
|
3035
|
+
# # shape: (3, 2)
|
3036
|
+
# # ┌──────────────────────┬──────────────────────┐
|
3037
|
+
# # │ a ┆ b │
|
3038
|
+
# # │ --- ┆ --- │
|
3039
|
+
# # │ u64 ┆ u64 │
|
3040
|
+
# # ╞══════════════════════╪══════════════════════╡
|
3041
|
+
# # │ 4629889412789719550 ┆ 6959506404929392568 │
|
3042
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
3043
|
+
# # │ 16386608652769605760 ┆ 11638928888656214026 │
|
3044
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
3045
|
+
# # │ 11638928888656214026 ┆ 11040941213715918520 │
|
3046
|
+
# # └──────────────────────┴──────────────────────┘
|
3047
|
+
def _hash(seed = 0, seed_1 = nil, seed_2 = nil, seed_3 = nil)
|
3048
|
+
k0 = seed
|
3049
|
+
k1 = seed_1.nil? ? seed : seed_1
|
3050
|
+
k2 = seed_2.nil? ? seed : seed_2
|
3051
|
+
k3 = seed_3.nil? ? seed : seed_3
|
3052
|
+
wrap_expr(_rbexpr._hash(k0, k1, k2, k3))
|
3053
|
+
end
|
2903
3054
|
|
2904
3055
|
# Reinterpret the underlying bits as a signed/unsigned integer.
|
2905
3056
|
#
|
@@ -2937,7 +3088,40 @@ module Polars
|
|
2937
3088
|
wrap_expr(_rbexpr.reinterpret(signed))
|
2938
3089
|
end
|
2939
3090
|
|
2940
|
-
#
|
3091
|
+
# Print the value that this expression evaluates to and pass on the value.
|
3092
|
+
#
|
3093
|
+
# @return [Expr]
|
3094
|
+
#
|
3095
|
+
# @example
|
3096
|
+
# df = Polars::DataFrame.new({"foo" => [1, 1, 2]})
|
3097
|
+
# df.select(Polars.col("foo").cumsum._inspect("value is: %s").alias("bar"))
|
3098
|
+
# # =>
|
3099
|
+
# # value is: shape: (3,)
|
3100
|
+
# # Series: 'foo' [i64]
|
3101
|
+
# # [
|
3102
|
+
# # 1
|
3103
|
+
# # 2
|
3104
|
+
# # 4
|
3105
|
+
# # ]
|
3106
|
+
# # shape: (3, 1)
|
3107
|
+
# # ┌─────┐
|
3108
|
+
# # │ bar │
|
3109
|
+
# # │ --- │
|
3110
|
+
# # │ i64 │
|
3111
|
+
# # ╞═════╡
|
3112
|
+
# # │ 1 │
|
3113
|
+
# # ├╌╌╌╌╌┤
|
3114
|
+
# # │ 2 │
|
3115
|
+
# # ├╌╌╌╌╌┤
|
3116
|
+
# # │ 4 │
|
3117
|
+
# # └─────┘
|
3118
|
+
# def _inspect(fmt = "%s")
|
3119
|
+
# inspect = lambda do |s|
|
3120
|
+
# puts(fmt % [s])
|
3121
|
+
# s
|
3122
|
+
# end
|
3123
|
+
|
3124
|
+
# map(return_dtype: nil, agg_list: true, &inspect)
|
2941
3125
|
# end
|
2942
3126
|
|
2943
3127
|
# Fill nulls with linear interpolation over missing values.
|
@@ -3721,7 +3905,72 @@ module Polars
|
|
3721
3905
|
)
|
3722
3906
|
end
|
3723
3907
|
|
3724
|
-
#
|
3908
|
+
# Apply a custom rolling window function.
|
3909
|
+
#
|
3910
|
+
# Prefer the specific rolling window functions over this one, as they are faster.
|
3911
|
+
#
|
3912
|
+
# Prefer:
|
3913
|
+
# * rolling_min
|
3914
|
+
# * rolling_max
|
3915
|
+
# * rolling_mean
|
3916
|
+
# * rolling_sum
|
3917
|
+
#
|
3918
|
+
# @param window_size [Integer]
|
3919
|
+
# The length of the window.
|
3920
|
+
# @param weights [Object]
|
3921
|
+
# An optional slice with the same length as the window that will be multiplied
|
3922
|
+
# elementwise with the values in the window.
|
3923
|
+
# @param min_periods [Integer]
|
3924
|
+
# The number of values in the window that should be non-null before computing
|
3925
|
+
# a result. If nil, it will be set equal to window size.
|
3926
|
+
# @param center [Boolean]
|
3927
|
+
# Set the labels at the center of the window.
|
3928
|
+
#
|
3929
|
+
# @return [Expr]
|
3930
|
+
#
|
3931
|
+
# @example
|
3932
|
+
# df = Polars::DataFrame.new(
|
3933
|
+
# {
|
3934
|
+
# "A" => [1.0, 2.0, 9.0, 2.0, 13.0]
|
3935
|
+
# }
|
3936
|
+
# )
|
3937
|
+
# df.select(
|
3938
|
+
# [
|
3939
|
+
# Polars.col("A").rolling_apply(window_size: 3) { |s| s.std }
|
3940
|
+
# ]
|
3941
|
+
# )
|
3942
|
+
# # =>
|
3943
|
+
# # shape: (5, 1)
|
3944
|
+
# # ┌──────────┐
|
3945
|
+
# # │ A │
|
3946
|
+
# # │ --- │
|
3947
|
+
# # │ f64 │
|
3948
|
+
# # ╞══════════╡
|
3949
|
+
# # │ null │
|
3950
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3951
|
+
# # │ null │
|
3952
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3953
|
+
# # │ 4.358899 │
|
3954
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3955
|
+
# # │ 4.041452 │
|
3956
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┤
|
3957
|
+
# # │ 5.567764 │
|
3958
|
+
# # └──────────┘
|
3959
|
+
# def rolling_apply(
|
3960
|
+
# window_size:,
|
3961
|
+
# weights: nil,
|
3962
|
+
# min_periods: nil,
|
3963
|
+
# center: false,
|
3964
|
+
# &function
|
3965
|
+
# )
|
3966
|
+
# if min_periods.nil?
|
3967
|
+
# min_periods = window_size
|
3968
|
+
# end
|
3969
|
+
# wrap_expr(
|
3970
|
+
# _rbexpr.rolling_apply(
|
3971
|
+
# function, window_size, weights, min_periods, center
|
3972
|
+
# )
|
3973
|
+
# )
|
3725
3974
|
# end
|
3726
3975
|
|
3727
3976
|
# Compute a rolling skew.
|
data/lib/polars/functions.rb
CHANGED
@@ -199,12 +199,201 @@ module Polars
|
|
199
199
|
dt_range
|
200
200
|
end
|
201
201
|
|
202
|
-
#
|
203
|
-
#
|
202
|
+
# Bin values into discrete values.
|
203
|
+
#
|
204
|
+
# @param s [Series]
|
205
|
+
# Series to bin.
|
206
|
+
# @param bins [Array]
|
207
|
+
# Bins to create.
|
208
|
+
# @param labels [Array]
|
209
|
+
# Labels to assign to the bins. If given the length of labels must be
|
210
|
+
# `bins.length + 1`.
|
211
|
+
# @param break_point_label [String]
|
212
|
+
# Name given to the breakpoint column.
|
213
|
+
# @param category_label [String]
|
214
|
+
# Name given to the category column.
|
215
|
+
#
|
216
|
+
# @return [DataFrame]
|
217
|
+
#
|
218
|
+
# @note
|
219
|
+
# This functionality is experimental and may change without it being considered a
|
220
|
+
# breaking change.
|
221
|
+
#
|
222
|
+
# @example
|
223
|
+
# a = Polars::Series.new("a", 12.times.map { |i| (-30 + i * 5) / 10.0 })
|
224
|
+
# Polars.cut(a, [-1, 1])
|
225
|
+
# # =>
|
226
|
+
# # shape: (12, 3)
|
227
|
+
# # ┌──────┬─────────────┬──────────────┐
|
228
|
+
# # │ a ┆ break_point ┆ category │
|
229
|
+
# # │ --- ┆ --- ┆ --- │
|
230
|
+
# # │ f64 ┆ f64 ┆ cat │
|
231
|
+
# # ╞══════╪═════════════╪══════════════╡
|
232
|
+
# # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
233
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
234
|
+
# # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
235
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
236
|
+
# # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
237
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
238
|
+
# # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
239
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
240
|
+
# # │ ... ┆ ... ┆ ... │
|
241
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
242
|
+
# # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
|
243
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
244
|
+
# # │ 1.5 ┆ inf ┆ (1.0, inf] │
|
245
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
246
|
+
# # │ 2.0 ┆ inf ┆ (1.0, inf] │
|
247
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
248
|
+
# # │ 2.5 ┆ inf ┆ (1.0, inf] │
|
249
|
+
# # └──────┴─────────────┴──────────────┘
|
250
|
+
# def cut(
|
251
|
+
# s,
|
252
|
+
# bins,
|
253
|
+
# labels: nil,
|
254
|
+
# break_point_label: "break_point",
|
255
|
+
# category_label: "category"
|
256
|
+
# )
|
257
|
+
# var_nm = s.name
|
204
258
|
|
205
|
-
#
|
259
|
+
# cuts_df = DataFrame.new(
|
260
|
+
# [
|
261
|
+
# Series.new(
|
262
|
+
# break_point_label, bins, dtype: :f64
|
263
|
+
# ).extend_constant(Float::INFINITY, 1)
|
264
|
+
# ]
|
265
|
+
# )
|
266
|
+
|
267
|
+
# if labels
|
268
|
+
# if labels.length != bins.length + 1
|
269
|
+
# raise ArgumentError, "expected more labels"
|
270
|
+
# end
|
271
|
+
# cuts_df = cuts_df.with_column(Series.new(category_label, labels))
|
272
|
+
# else
|
273
|
+
# cuts_df = cuts_df.with_column(
|
274
|
+
# Polars.format(
|
275
|
+
# "({}, {}]",
|
276
|
+
# Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
|
277
|
+
# Polars.col(break_point_label)
|
278
|
+
# ).alias(category_label)
|
279
|
+
# )
|
280
|
+
# end
|
281
|
+
|
282
|
+
# cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
|
283
|
+
|
284
|
+
# s.cast(:f64)
|
285
|
+
# .sort
|
286
|
+
# .to_frame
|
287
|
+
# .join_asof(
|
288
|
+
# cuts_df,
|
289
|
+
# left_on: var_nm,
|
290
|
+
# right_on: break_point_label,
|
291
|
+
# strategy: "forward"
|
292
|
+
# )
|
206
293
|
# end
|
207
294
|
|
295
|
+
# Align a sequence of frames using the unique values from one or more columns as a key.
|
296
|
+
#
|
297
|
+
# Frames that do not contain the given key values have rows injected (with nulls
|
298
|
+
# filling the non-key columns), and each resulting frame is sorted by the key.
|
299
|
+
#
|
300
|
+
# The original column order of input frames is not changed unless `select` is
|
301
|
+
# specified (in which case the final column order is determined from that).
|
302
|
+
#
|
303
|
+
# Note that this does not result in a joined frame - you receive the same number
|
304
|
+
# of frames back that you passed in, but each is now aligned by key and has
|
305
|
+
# the same number of rows.
|
306
|
+
#
|
307
|
+
# @param frames [Array]
|
308
|
+
# Sequence of DataFrames or LazyFrames.
|
309
|
+
# @param on [Object]
|
310
|
+
# One or more columns whose unique values will be used to align the frames.
|
311
|
+
# @param select [Object]
|
312
|
+
# Optional post-alignment column select to constrain and/or order
|
313
|
+
# the columns returned from the newly aligned frames.
|
314
|
+
# @param reverse [Object]
|
315
|
+
# Sort the alignment column values in descending order; can be a single
|
316
|
+
# boolean or a list of booleans associated with each column in `on`.
|
317
|
+
#
|
318
|
+
# @return [Object]
|
319
|
+
#
|
320
|
+
# @example
|
321
|
+
# df1 = Polars::DataFrame.new(
|
322
|
+
# {
|
323
|
+
# "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
|
324
|
+
# "x" => [3.5, 4.0, 1.0],
|
325
|
+
# "y" => [10.0, 2.5, 1.5]
|
326
|
+
# }
|
327
|
+
# )
|
328
|
+
# df2 = Polars::DataFrame.new(
|
329
|
+
# {
|
330
|
+
# "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
|
331
|
+
# "x" => [8.0, 1.0, 3.5],
|
332
|
+
# "y" => [1.5, 12.0, 5.0]
|
333
|
+
# }
|
334
|
+
# )
|
335
|
+
# df3 = Polars::DataFrame.new(
|
336
|
+
# {
|
337
|
+
# "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
|
338
|
+
# "x" => [2.0, 5.0],
|
339
|
+
# "y" => [2.5, 2.0]
|
340
|
+
# }
|
341
|
+
# )
|
342
|
+
# af1, af2, af3 = Polars.align_frames(
|
343
|
+
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
344
|
+
# )
|
345
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
|
346
|
+
# # =>
|
347
|
+
# # shape: (3, 1)
|
348
|
+
# # ┌───────┐
|
349
|
+
# # │ dot │
|
350
|
+
# # │ --- │
|
351
|
+
# # │ f64 │
|
352
|
+
# # ╞═══════╡
|
353
|
+
# # │ 0.0 │
|
354
|
+
# # ├╌╌╌╌╌╌╌┤
|
355
|
+
# # │ 167.5 │
|
356
|
+
# # ├╌╌╌╌╌╌╌┤
|
357
|
+
# # │ 47.0 │
|
358
|
+
# # └───────┘
|
359
|
+
def align_frames(
|
360
|
+
*frames,
|
361
|
+
on:,
|
362
|
+
select: nil,
|
363
|
+
reverse: false
|
364
|
+
)
|
365
|
+
if frames.empty?
|
366
|
+
return []
|
367
|
+
elsif frames.map(&:class).uniq.length != 1
|
368
|
+
raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
|
369
|
+
end
|
370
|
+
|
371
|
+
# establish the superset of all "on" column values, sort, and cache
|
372
|
+
eager = frames[0].is_a?(DataFrame)
|
373
|
+
alignment_frame = (
|
374
|
+
concat(frames.map { |df| df.lazy.select(on) })
|
375
|
+
.unique(maintain_order: false)
|
376
|
+
.sort(on, reverse: reverse)
|
377
|
+
)
|
378
|
+
alignment_frame = (
|
379
|
+
eager ? alignment_frame.collect.lazy : alignment_frame.cache
|
380
|
+
)
|
381
|
+
# finally, align all frames
|
382
|
+
aligned_frames =
|
383
|
+
frames.map do |df|
|
384
|
+
alignment_frame.join(
|
385
|
+
df.lazy,
|
386
|
+
on: alignment_frame.columns,
|
387
|
+
how: "left"
|
388
|
+
).select(df.columns)
|
389
|
+
end
|
390
|
+
if !select.nil?
|
391
|
+
aligned_frames = aligned_frames.map { |df| df.select(select) }
|
392
|
+
end
|
393
|
+
|
394
|
+
eager ? aligned_frames.map(&:collect) : aligned_frames
|
395
|
+
end
|
396
|
+
|
208
397
|
# Return a new Series of given length and type, filled with ones.
|
209
398
|
#
|
210
399
|
# @param n [Integer]
|
data/lib/polars/group_by.rb
CHANGED
@@ -12,7 +12,48 @@ module Polars
|
|
12
12
|
self.maintain_order = maintain_order
|
13
13
|
end
|
14
14
|
|
15
|
-
#
|
15
|
+
# Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
|
16
|
+
#
|
17
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
18
|
+
# slower and more memory intensive than implementing the same logic using
|
19
|
+
# the native expression API because:
|
20
|
+
|
21
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
22
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
23
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
24
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
25
|
+
#
|
26
|
+
# Wherever possible you should strongly prefer the native expression API
|
27
|
+
# to achieve the best performance.
|
28
|
+
#
|
29
|
+
# @return [DataFrame]
|
30
|
+
#
|
31
|
+
# @example
|
32
|
+
# df = Polars::DataFrame.new(
|
33
|
+
# {
|
34
|
+
# "id" => [0, 1, 2, 3, 4],
|
35
|
+
# "color" => ["red", "green", "green", "red", "red"],
|
36
|
+
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
37
|
+
# }
|
38
|
+
# )
|
39
|
+
# df.groupby("color").apply { |group_df| group_df.sample(2) }
|
40
|
+
# # =>
|
41
|
+
# # shape: (4, 3)
|
42
|
+
# # ┌─────┬───────┬──────────┐
|
43
|
+
# # │ id ┆ color ┆ shape │
|
44
|
+
# # │ --- ┆ --- ┆ --- │
|
45
|
+
# # │ i64 ┆ str ┆ str │
|
46
|
+
# # ╞═════╪═══════╪══════════╡
|
47
|
+
# # │ 1 ┆ green ┆ triangle │
|
48
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
49
|
+
# # │ 2 ┆ green ┆ square │
|
50
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
51
|
+
# # │ 4 ┆ red ┆ square │
|
52
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
53
|
+
# # │ 3 ┆ red ┆ triangle │
|
54
|
+
# # └─────┴───────┴──────────┘
|
55
|
+
# def apply(&f)
|
56
|
+
# _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
|
16
57
|
# end
|
17
58
|
|
18
59
|
# Use multiple aggregations on columns.
|
@@ -182,8 +223,7 @@ module Polars
|
|
182
223
|
_dataframe_class._from_rbdf(df._df)
|
183
224
|
end
|
184
225
|
|
185
|
-
#
|
186
|
-
# end
|
226
|
+
# pivot is deprecated
|
187
227
|
|
188
228
|
# Aggregate the first values in the group.
|
189
229
|
#
|
data/lib/polars/io.rb
CHANGED
@@ -59,7 +59,7 @@ module Polars
|
|
59
59
|
# Lossy means that invalid utf8 values are replaced with `�`
|
60
60
|
# characters. When using other encodings than `utf8` or
|
61
61
|
# `utf8-lossy`, the input is first decoded in memory with
|
62
|
-
#
|
62
|
+
# Ruby.
|
63
63
|
# @param low_memory [Boolean]
|
64
64
|
# Reduce memory usage at expense of performance.
|
65
65
|
# @param rechunk [Boolean]
|
@@ -451,8 +451,24 @@ module Polars
|
|
451
451
|
)
|
452
452
|
end
|
453
453
|
|
454
|
-
#
|
455
|
-
#
|
454
|
+
# Read into a DataFrame from Apache Avro format.
|
455
|
+
#
|
456
|
+
# @param file [Object]
|
457
|
+
# Path to a file or a file-like object.
|
458
|
+
# @param columns [Object]
|
459
|
+
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
460
|
+
# of column names.
|
461
|
+
# @param n_rows [Integer]
|
462
|
+
# Stop reading from Apache Avro file after reading `n_rows`.
|
463
|
+
#
|
464
|
+
# @return [DataFrame]
|
465
|
+
def read_avro(file, columns: nil, n_rows: nil)
|
466
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
467
|
+
file = Utils.format_path(file)
|
468
|
+
end
|
469
|
+
|
470
|
+
DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
|
471
|
+
end
|
456
472
|
|
457
473
|
# Read into a DataFrame from Arrow IPC (Feather v2) file.
|
458
474
|
#
|