polars-df 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
+ module Polars
+ # A dynamic grouper.
+ #
+ # This has an `.agg` method which allows you to run all polars expressions in a
+ # groupby context.
+ class DynamicGroupBy
+ def initialize(
+ df,
+ index_column,
+ every,
+ period,
+ offset,
+ truncate,
+ include_boundaries,
+ closed,
+ by
+ )
+ period = Utils._timedelta_to_pl_duration(period)
+ offset = Utils._timedelta_to_pl_duration(offset)
+ every = Utils._timedelta_to_pl_duration(every)
+
+ @df = df
+ @time_column = index_column
+ @every = every
+ @period = period
+ @offset = offset
+ @truncate = truncate
+ @include_boundaries = include_boundaries
+ @closed = closed
+ @by = by
+ end
+
+ def agg(aggs)
+ @df.lazy
+ .groupby_dynamic(
+ @time_column,
+ every: @every,
+ period: @period,
+ offset: @offset,
+ truncate: @truncate,
+ include_boundaries: @include_boundaries,
+ closed: @closed,
+ by: @by
+ )
+ .agg(aggs)
+ .collect(no_optimization: true, string_cache: false)
+ end
+ end
+ end
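This internal grouper is not normally constructed directly. A minimal usage sketch, assuming it is reached through a `DataFrame#groupby_dynamic` entry point that forwards these arguments (the frame `df` and its "time"/"value" columns are hypothetical):

    # assumes df has a sorted datetime column "time" and a numeric column "value";
    # the grouper buckets rows into 1-hour windows, then .agg runs the expressions and collects eagerly
    df.groupby_dynamic("time", every: "1h").agg(Polars.col("value").sum.alias("value_sum"))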
data/lib/polars/expr.rb CHANGED
@@ -432,8 +432,34 @@ module Polars
  wrap_expr(_rbexpr.suffix(suffix))
  end

- # def map_alias
- # end
+ # Rename the output of an expression by mapping a function over the root name.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "A" => [1, 2],
+ # "B" => [3, 4]
+ # }
+ # )
+ # df.select(
+ # Polars.all.reverse.map_alias { |colName| colName + "_reverse" }
+ # )
+ # # =>
+ # # shape: (2, 2)
+ # # ┌───────────┬───────────┐
+ # # │ A_reverse ┆ B_reverse │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═══════════╪═══════════╡
+ # # │ 2 ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 1 ┆ 3 │
+ # # └───────────┴───────────┘
+ def map_alias(&f)
+ Utils.wrap_expr(_rbexpr.map_alias(f))
+ end

  # Negate a boolean expression.
  #
@@ -2575,14 +2601,98 @@ module Polars
  # # ╞══════╪════════╡
  # # │ 1 ┆ 0 │
  # # └──────┴────────┘
- # def map(return_dtype: nil, agg_list: false, &block)
+ # def map(return_dtype: nil, agg_list: false, &f)
  # if !return_dtype.nil?
  # return_dtype = Utils.rb_type_to_dtype(return_dtype)
  # end
- # wrap_expr(_rbexpr.map(return_dtype, agg_list, &block))
+ # wrap_expr(_rbexpr.map(f, return_dtype, agg_list))
  # end

- # def apply
+ # Apply a custom/user-defined function (UDF) in a GroupBy or Projection context.
+ #
+ # Depending on the context it has the following behavior:
+ #
+ # * Selection
+ # Expects `f` to be of type Callable[[Any], Any].
+ # Applies a Ruby function over each individual value in the column.
+ # * GroupBy
+ # Expects `f` to be of type Callable[[Series], Series].
+ # Applies a Ruby function over each group.
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+ #
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelised (UDFs cannot).
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @param return_dtype [Symbol]
+ # Dtype of the output Series.
+ # If not set, polars will assume that
+ # the dtype remains unchanged.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 2, 3, 1],
+ # "b" => ["a", "b", "c", "c"]
+ # }
+ # )
+ #
+ # @example In a selection context, the function is applied by row.
+ # df.with_column(
+ # Polars.col("a").apply { |x| x * 2 }.alias("a_times_2")
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────┬─────┬───────────┐
+ # # │ a ┆ b ┆ a_times_2 │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ str ┆ i64 │
+ # # ╞═════╪═════╪═══════════╡
+ # # │ 1 ┆ a ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ b ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ c ┆ 6 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 1 ┆ c ┆ 2 │
+ # # └─────┴─────┴───────────┘
+ #
+ # @example In a GroupBy context the function is applied by group:
+ # df.lazy
+ # .groupby("b", maintain_order: true)
+ # .agg(
+ # [
+ # Polars.col("a").apply { |x| x.sum }
+ # ]
+ # )
+ # .collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ b ┆ a │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ a ┆ 1 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ c ┆ 4 │
+ # # └─────┴─────┘
+ # def apply(return_dtype: nil, &f)
+ # wrap_f = lambda do |x|
+ # x.apply(return_dtype: return_dtype, &f)
+ # end
+ # map(agg_list: true, return_dtype: return_dtype, &wrap_f)
  # end

  # Explode a list or utf8 Series. This means that every item is expanded to a new
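The note above recommends the native expression API over Ruby UDFs wherever possible. For reference, a sketch of the native equivalent of the `a_times_2` example from the doc comment (same hypothetical `df`), which keeps the work inside the Rust expression engine:

    # native expression: no Ruby block, so it can be optimised and parallelised
    df.with_column((Polars.col("a") * 2).alias("a_times_2"))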
@@ -2898,8 +3008,49 @@ module Polars
  end
  end

- # def _hash
- # end
+ # Hash the elements in the selection.
+ #
+ # The hash value is of type `:u64`.
+ #
+ # @param seed [Integer]
+ # Random seed parameter. Defaults to 0.
+ # @param seed_1 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_2 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_3 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 2, nil],
+ # "b" => ["x", nil, "z"]
+ # }
+ # )
+ # df.with_column(Polars.all._hash(10, 20, 30, 40))
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────────────────────┬──────────────────────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ u64 ┆ u64 │
+ # # ╞══════════════════════╪══════════════════════╡
+ # # │ 4629889412789719550 ┆ 6959506404929392568 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 16386608652769605760 ┆ 11638928888656214026 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 11638928888656214026 ┆ 11040941213715918520 │
+ # # └──────────────────────┴──────────────────────┘
+ def _hash(seed = 0, seed_1 = nil, seed_2 = nil, seed_3 = nil)
+ k0 = seed
+ k1 = seed_1.nil? ? seed : seed_1
+ k2 = seed_2.nil? ? seed : seed_2
+ k3 = seed_3.nil? ? seed : seed_3
+ wrap_expr(_rbexpr._hash(k0, k1, k2, k3))
+ end

  # Reinterpret the underlying bits as a signed/unsigned integer.
  #
@@ -2937,7 +3088,40 @@ module Polars
  wrap_expr(_rbexpr.reinterpret(signed))
  end

- # def _inspect
+ # Print the value that this expression evaluates to and pass on the value.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 1, 2]})
+ # df.select(Polars.col("foo").cumsum._inspect("value is: %s").alias("bar"))
+ # # =>
+ # # value is: shape: (3,)
+ # # Series: 'foo' [i64]
+ # # [
+ # # 1
+ # # 2
+ # # 4
+ # # ]
+ # # shape: (3, 1)
+ # # ┌─────┐
+ # # │ bar │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════╡
+ # # │ 1 │
+ # # ├╌╌╌╌╌┤
+ # # │ 2 │
+ # # ├╌╌╌╌╌┤
+ # # │ 4 │
+ # # └─────┘
+ # def _inspect(fmt = "%s")
+ # inspect = lambda do |s|
+ # puts(fmt % [s])
+ # s
+ # end
+
+ # map(return_dtype: nil, agg_list: true, &inspect)
  # end

  # Fill nulls with linear interpolation over missing values.
@@ -3721,7 +3905,72 @@ module Polars
  )
  end

- # def rolling_apply
+ # Apply a custom rolling window function.
+ #
+ # Prefer the specific rolling window functions over this one, as they are faster.
+ #
+ # Prefer:
+ # * rolling_min
+ # * rolling_max
+ # * rolling_mean
+ # * rolling_sum
+ #
+ # @param window_size [Integer]
+ # The length of the window.
+ # @param weights [Object]
+ # An optional slice with the same length as the window that will be multiplied
+ # elementwise with the values in the window.
+ # @param min_periods [Integer]
+ # The number of values in the window that should be non-null before computing
+ # a result. If nil, it will be set equal to window size.
+ # @param center [Boolean]
+ # Set the labels at the center of the window
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "A" => [1.0, 2.0, 9.0, 2.0, 13.0]
+ # }
+ # )
+ # df.select(
+ # [
+ # Polars.col("A").rolling_apply(window_size: 3) { |s| s.std }
+ # ]
+ # )
+ # # =>
+ # # shape: (5, 1)
+ # # ┌──────────┐
+ # # │ A │
+ # # │ --- │
+ # # │ f64 │
+ # # ╞══════════╡
+ # # │ null │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ null │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4.358899 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4.041452 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 5.567764 │
+ # # └──────────┘
+ # def rolling_apply(
+ # window_size:,
+ # weights: nil,
+ # min_periods: nil,
+ # center: false,
+ # &function
+ # )
+ # if min_periods.nil?
+ # min_periods = window_size
+ # end
+ # wrap_expr(
+ # _rbexpr.rolling_apply(
+ # function, window_size, weights, min_periods, center
+ # )
+ # )
  # end

  # Compute a rolling skew.
@@ -199,12 +199,201 @@ module Polars
  dt_range
  end

- # def cut
- # end
+ # Bin values into discrete values.
+ #
+ # @param s [Series]
+ # Series to bin.
+ # @param bins [Array]
+ # Bins to create.
+ # @param labels [Array]
+ # Labels to assign to the bins. If given the length of labels must be
+ # len(bins) + 1.
+ # @param break_point_label [String]
+ # Name given to the breakpoint column.
+ # @param category_label [String]
+ # Name given to the category column.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This functionality is experimental and may change without it being considered a
+ # breaking change.
+ #
+ # @example
+ # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
+ # Polars.cut(a, [-1, 1])
+ # # =>
+ # # shape: (12, 3)
+ # # ┌──────┬─────────────┬──────────────┐
+ # # │ a ┆ break_point ┆ category │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ f64 ┆ f64 ┆ cat │
+ # # ╞══════╪═════════════╪══════════════╡
+ # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ ... ┆ ... ┆ ... │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 1.5 ┆ inf ┆ (1.0, inf] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2.0 ┆ inf ┆ (1.0, inf] │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2.5 ┆ inf ┆ (1.0, inf] │
+ # # └──────┴─────────────┴──────────────┘
+ # def cut(
+ # s,
+ # bins,
+ # labels: nil,
+ # break_point_label: "break_point",
+ # category_label: "category"
+ # )
+ # var_nm = s.name

- # def align_frames
+ # cuts_df = DataFrame.new(
+ # [
+ # Series.new(
+ # break_point_label, bins, dtype: :f64
+ # ).extend_constant(Float::INFINITY, 1)
+ # ]
+ # )
+
+ # if labels
+ # if labels.length != bins.length + 1
+ # raise ArgumentError, "expected more labels"
+ # end
+ # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
+ # else
+ # cuts_df = cuts_df.with_column(
+ # Polars.format(
+ # "({}, {}]",
+ # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
+ # Polars.col(break_point_label)
+ # ).alias(category_label)
+ # )
+ # end
+
+ # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
+
+ # s.cast(:f64)
+ # .sort
+ # .to_frame
+ # .join_asof(
+ # cuts_df,
+ # left_on: var_nm,
+ # right_on: break_point_label,
+ # strategy: "forward"
+ # )
  # end

+ # Align a sequence of frames using the unique values from one or more columns as a key.
+ #
+ # Frames that do not contain the given key values have rows injected (with nulls
+ # filling the non-key columns), and each resulting frame is sorted by the key.
+ #
+ # The original column order of input frames is not changed unless ``select`` is
+ # specified (in which case the final column order is determined from that).
+ #
+ # Note that this does not result in a joined frame - you receive the same number
+ # of frames back that you passed in, but each is now aligned by key and has
+ # the same number of rows.
+ #
+ # @param frames [Array]
+ # Sequence of DataFrames or LazyFrames.
+ # @param on [Object]
+ # One or more columns whose unique values will be used to align the frames.
+ # @param select [Object]
+ # Optional post-alignment column select to constrain and/or order
+ # the columns returned from the newly aligned frames.
+ # @param reverse [Object]
+ # Sort the alignment column values in descending order; can be a single
+ # boolean or a list of booleans associated with each column in `on`.
+ #
+ # @return [Object]
+ #
+ # @example
+ # df1 = Polars::DataFrame.new(
+ # {
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
+ # "x" => [3.5, 4.0, 1.0],
+ # "y" => [10.0, 2.5, 1.5]
+ # }
+ # )
+ # df2 = Polars::DataFrame.new(
+ # {
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
+ # "x" => [8.0, 1.0, 3.5],
+ # "y" => [1.5, 12.0, 5.0]
+ # }
+ # )
+ # df3 = Polars::DataFrame.new(
+ # {
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
+ # "x" => [2.0, 5.0],
+ # "y" => [2.5, 2.0]
+ # }
+ # )
+ # af1, af2, af3 = Polars.align_frames(
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
+ # )
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
+ # # =>
+ # # shape: (3, 1)
+ # # ┌───────┐
+ # # │ dot │
+ # # │ --- │
+ # # │ f64 │
+ # # ╞═══════╡
+ # # │ 0.0 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 167.5 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 47.0 │
+ # # └───────┘
+ def align_frames(
+ *frames,
+ on:,
+ select: nil,
+ reverse: false
+ )
+ if frames.empty?
+ return []
+ elsif frames.map(&:class).uniq.length != 1
+ raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
+ end
+
+ # establish the superset of all "on" column values, sort, and cache
+ eager = frames[0].is_a?(DataFrame)
+ alignment_frame = (
+ concat(frames.map { |df| df.lazy.select(on) })
+ .unique(maintain_order: false)
+ .sort(on, reverse: reverse)
+ )
+ alignment_frame = (
+ eager ? alignment_frame.collect.lazy : alignment_frame.cache
+ )
+ # finally, align all frames
+ aligned_frames =
+ frames.map do |df|
+ alignment_frame.join(
+ df.lazy,
+ on: alignment_frame.columns,
+ how: "left"
+ ).select(df.columns)
+ end
+ if !select.nil?
+ aligned_frames = aligned_frames.map { |df| df.select(select) }
+ end
+
+ eager ? aligned_frames.map(&:collect) : aligned_frames
+ end
+
  # Return a new Series of given length and type, filled with ones.
  #
  # @param n [Integer]
@@ -12,7 +12,48 @@ module Polars
  self.maintain_order = maintain_order
  end

- # def apply
+ # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelised (UDFs cannot).
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "id" => [0, 1, 2, 3, 4],
+ # "color" => ["red", "green", "green", "red", "red"],
+ # "shape" => ["square", "triangle", "square", "triangle", "square"]
+ # }
+ # )
+ # df.groupby("color").apply { |group_df| group_df.sample(2) }
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────┬───────┬──────────┐
+ # # │ id ┆ color ┆ shape │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ str ┆ str │
+ # # ╞═════╪═══════╪══════════╡
+ # # │ 1 ┆ green ┆ triangle │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ green ┆ square │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ red ┆ square │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ red ┆ triangle │
+ # # └─────┴───────┴──────────┘
+ # def apply(&f)
+ # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
  # end

  # Use multiple aggregations on columns.
@@ -182,8 +223,7 @@ module Polars
  _dataframe_class._from_rbdf(df._df)
  end

- # def pivot
- # end
+ # pivot is deprecated

  # Aggregate the first values in the group.
  #
data/lib/polars/io.rb CHANGED
@@ -59,7 +59,7 @@ module Polars
  # Lossy means that invalid utf8 values are replaced with `�`
  # characters. When using other encodings than `utf8` or
  # `utf8-lossy`, the input is first decoded im memory with
- # python.
+ # Ruby.
  # @param low_memory [Boolean]
  # Reduce memory usage at expense of performance.
  # @param rechunk [Boolean]
@@ -451,8 +451,24 @@ module Polars
  )
  end

- # def read_avro
- # end
+ # Read into a DataFrame from Apache Avro format.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from Apache Avro file after reading ``n_rows``.
+ #
+ # @return [DataFrame]
+ def read_avro(file, columns: nil, n_rows: nil)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
+ end

  # Read into a DataFrame from Arrow IPC (Feather v2) file.
  #
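A brief usage sketch for the new Avro reader, assuming it is exposed like the other readers as `Polars.read_avro` (the file path and column names below are hypothetical):

    # read an entire Avro file, then a constrained read of selected columns and rows
    df = Polars.read_avro("data.avro")
    df = Polars.read_avro("data.avro", columns: ["a", "b"], n_rows: 100)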