polars-df 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,49 @@
1
module Polars
  # A dynamic grouper.
  #
  # This has an `.agg` method which allows you to run all polars expressions in a
  # groupby context.
  class DynamicGroupBy
    # Remember the frame and the window settings for a later call to `agg`.
    #
    # `period`, `offset` and `every` are passed through
    # `Utils._timedelta_to_pl_duration` before being stored; every other
    # argument is stored exactly as given.
    def initialize(df, index_column, every, period, offset, truncate, include_boundaries, closed, by)
      # Conversions run in the same order as before: period, offset, every.
      @period = Utils._timedelta_to_pl_duration(period)
      @offset = Utils._timedelta_to_pl_duration(offset)
      @every = Utils._timedelta_to_pl_duration(every)

      @df = df
      @time_column = index_column
      @truncate = truncate
      @include_boundaries = include_boundaries
      @closed = closed
      @by = by
    end

    # Run the given aggregations over the dynamic groups.
    #
    # Goes through the lazy API (`lazy` -> `groupby_dynamic` -> `agg`) and
    # collects the result with `no_optimization: true, string_cache: false`.
    #
    # @param aggs [Object]
    #   Passed unchanged to the lazy group-by's `agg`.
    #
    # @return [Object]
    #   Whatever `collect` returns (presumably a DataFrame; confirm against LazyFrame#collect).
    def agg(aggs)
      window_options = {
        every: @every,
        period: @period,
        offset: @offset,
        truncate: @truncate,
        include_boundaries: @include_boundaries,
        closed: @closed,
        by: @by
      }

      grouped = @df.lazy.groupby_dynamic(@time_column, **window_options)
      grouped.agg(aggs).collect(no_optimization: true, string_cache: false)
    end
  end
end
data/lib/polars/expr.rb CHANGED
@@ -432,8 +432,34 @@ module Polars
432
432
  wrap_expr(_rbexpr.suffix(suffix))
433
433
  end
434
434
 
435
- # def map_alias
436
- # end
435
+ # Rename the output of an expression by mapping a function over the root name.
436
+ #
437
+ # @return [Expr]
438
+ #
439
+ # @example
440
+ # df = Polars::DataFrame.new(
441
+ # {
442
+ # "A" => [1, 2],
443
+ # "B" => [3, 4]
444
+ # }
445
+ # )
446
+ # df.select(
447
+ # Polars.all.reverse.map_alias { |colName| colName + "_reverse" }
448
+ # )
449
+ # # =>
450
+ # # shape: (2, 2)
451
+ # # ┌───────────┬───────────┐
452
+ # # │ A_reverse ┆ B_reverse │
453
+ # # │ --- ┆ --- │
454
+ # # │ i64 ┆ i64 │
455
+ # # ╞═══════════╪═══════════╡
456
+ # # │ 2 ┆ 4 │
457
+ # # ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
458
+ # # │ 1 ┆ 3 │
459
+ # # └───────────┴───────────┘
460
def map_alias(&f)
  # The block is captured as a Proc and handed to the underlying expression,
  # which calls it to compute the new name. The result is wrapped with the same
  # `wrap_expr` helper the neighbouring methods use, rather than reaching
  # into `Utils` directly.
  wrap_expr(_rbexpr.map_alias(f))
end
437
463
 
438
464
  # Negate a boolean expression.
439
465
  #
@@ -2575,14 +2601,98 @@ module Polars
2575
2601
  # # ╞══════╪════════╡
2576
2602
  # # │ 1 ┆ 0 │
2577
2603
  # # └──────┴────────┘
2578
- # def map(return_dtype: nil, agg_list: false, &block)
2604
+ # def map(return_dtype: nil, agg_list: false, &f)
2579
2605
  # if !return_dtype.nil?
2580
2606
  # return_dtype = Utils.rb_type_to_dtype(return_dtype)
2581
2607
  # end
2582
- # wrap_expr(_rbexpr.map(return_dtype, agg_list, &block))
2608
+ # wrap_expr(_rbexpr.map(f, return_dtype, agg_list))
2583
2609
  # end
2584
2610
 
2585
- # def apply
2611
+ # Apply a custom/user-defined function (UDF) in a GroupBy or Projection context.
2612
+ #
2613
+ # Depending on the context it has the following behavior:
2614
+ #
2615
+ # * Selection
2616
+ # Expects `f` to be of type Callable[[Any], Any].
2617
+ # Applies a Ruby function over each individual value in the column.
2618
+ # * GroupBy
2619
+ # Expects `f` to be of type Callable[[Series], Series].
2620
+ # Applies a Ruby function over each group.
2621
+ #
2622
+ # Implementing logic using a Ruby function is almost always _significantly_
2623
+ # slower and more memory intensive than implementing the same logic using
2624
+ # the native expression API because:
2625
+ #
2626
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
2627
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
2628
+ # - Polars-native expressions can be parallelised (UDFs cannot).
2629
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
2630
+ #
2631
+ # Wherever possible you should strongly prefer the native expression API
2632
+ # to achieve the best performance.
2633
+ #
2634
+ # @param return_dtype [Symbol]
2635
+ # Dtype of the output Series.
2636
+ # If not set, polars will assume that
2637
+ # the dtype remains unchanged.
2638
+ #
2639
+ # @return [Expr]
2640
+ #
2641
+ # @example
2642
+ # df = Polars::DataFrame.new(
2643
+ # {
2644
+ # "a" => [1, 2, 3, 1],
2645
+ # "b" => ["a", "b", "c", "c"]
2646
+ # }
2647
+ # )
2648
+ #
2649
+ # @example In a selection context, the function is applied by row.
2650
+ # df.with_column(
2651
+ # Polars.col("a").apply { |x| x * 2 }.alias("a_times_2")
2652
+ # )
2653
+ # # =>
2654
+ # # shape: (4, 3)
2655
+ # # ┌─────┬─────┬───────────┐
2656
+ # # │ a ┆ b ┆ a_times_2 │
2657
+ # # │ --- ┆ --- ┆ --- │
2658
+ # # │ i64 ┆ str ┆ i64 │
2659
+ # # ╞═════╪═════╪═══════════╡
2660
+ # # │ 1 ┆ a ┆ 2 │
2661
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
2662
+ # # │ 2 ┆ b ┆ 4 │
2663
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
2664
+ # # │ 3 ┆ c ┆ 6 │
2665
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
2666
+ # # │ 1 ┆ c ┆ 2 │
2667
+ # # └─────┴─────┴───────────┘
2668
+ #
2669
+ # @example In a GroupBy context the function is applied by group:
2670
+ # df.lazy
2671
+ # .groupby("b", maintain_order: true)
2672
+ # .agg(
2673
+ # [
2674
+ # Polars.col("a").apply { |x| x.sum }
2675
+ # ]
2676
+ # )
2677
+ # .collect
2678
+ # # =>
2679
+ # # shape: (3, 2)
2680
+ # # ┌─────┬─────┐
2681
+ # # │ b ┆ a │
2682
+ # # │ --- ┆ --- │
2683
+ # # │ str ┆ i64 │
2684
+ # # ╞═════╪═════╡
2685
+ # # │ a ┆ 1 │
2686
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2687
+ # # │ b ┆ 2 │
2688
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
2689
+ # # │ c ┆ 4 │
2690
+ # # └─────┴─────┘
2691
+ # def apply(return_dtype: nil, &f)
2692
+ # wrap_f = lambda do |x|
2693
+ # x.apply(return_dtype: return_dtype, &f)
2694
+ # end
2695
+ # map(agg_list: true, return_dtype: return_dtype, &wrap_f)
2586
2696
  # end
2587
2697
 
2588
2698
  # Explode a list or utf8 Series. This means that every item is expanded to a new
@@ -2898,8 +3008,49 @@ module Polars
2898
3008
  end
2899
3009
  end
2900
3010
 
2901
- # def _hash
2902
- # end
3011
+ # Hash the elements in the selection.
3012
+ #
3013
+ # The hash value is of type `:u64`.
3014
+ #
3015
+ # @param seed [Integer]
3016
+ # Random seed parameter. Defaults to 0.
3017
+ # @param seed_1 [Integer]
3018
+ # Random seed parameter. Defaults to `seed` if not set.
3019
+ # @param seed_2 [Integer]
3020
+ # Random seed parameter. Defaults to `seed` if not set.
3021
+ # @param seed_3 [Integer]
3022
+ # Random seed parameter. Defaults to `seed` if not set.
3023
+ #
3024
+ # @return [Expr]
3025
+ #
3026
+ # @example
3027
+ # df = Polars::DataFrame.new(
3028
+ # {
3029
+ # "a" => [1, 2, nil],
3030
+ # "b" => ["x", nil, "z"]
3031
+ # }
3032
+ # )
3033
+ # df.with_column(Polars.all._hash(10, 20, 30, 40))
3034
+ # # =>
3035
+ # # shape: (3, 2)
3036
+ # # ┌──────────────────────┬──────────────────────┐
3037
+ # # │ a ┆ b │
3038
+ # # │ --- ┆ --- │
3039
+ # # │ u64 ┆ u64 │
3040
+ # # ╞══════════════════════╪══════════════════════╡
3041
+ # # │ 4629889412789719550 ┆ 6959506404929392568 │
3042
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
3043
+ # # │ 16386608652769605760 ┆ 11638928888656214026 │
3044
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
3045
+ # # │ 11638928888656214026 ┆ 11040941213715918520 │
3046
+ # # └──────────────────────┴──────────────────────┘
3047
def _hash(seed = 0, seed_1 = nil, seed_2 = nil, seed_3 = nil)
  # Any secondary seed left as nil falls back to the primary `seed`.
  fallback_seeds = [seed_1, seed_2, seed_3].map do |candidate|
    candidate.nil? ? seed : candidate
  end
  wrap_expr(_rbexpr._hash(seed, *fallback_seeds))
end
2903
3054
 
2904
3055
  # Reinterpret the underlying bits as a signed/unsigned integer.
2905
3056
  #
@@ -2937,7 +3088,40 @@ module Polars
2937
3088
  wrap_expr(_rbexpr.reinterpret(signed))
2938
3089
  end
2939
3090
 
2940
- # def _inspect
3091
+ # Print the value that this expression evaluates to and pass on the value.
3092
+ #
3093
+ # @return [Expr]
3094
+ #
3095
+ # @example
3096
+ # df = Polars::DataFrame.new({"foo" => [1, 1, 2]})
3097
+ # df.select(Polars.col("foo").cumsum._inspect("value is: %s").alias("bar"))
3098
+ # # =>
3099
+ # # value is: shape: (3,)
3100
+ # # Series: 'foo' [i64]
3101
+ # # [
3102
+ # # 1
3103
+ # # 2
3104
+ # # 4
3105
+ # # ]
3106
+ # # shape: (3, 1)
3107
+ # # ┌─────┐
3108
+ # # │ bar │
3109
+ # # │ --- │
3110
+ # # │ i64 │
3111
+ # # ╞═════╡
3112
+ # # │ 1 │
3113
+ # # ├╌╌╌╌╌┤
3114
+ # # │ 2 │
3115
+ # # ├╌╌╌╌╌┤
3116
+ # # │ 4 │
3117
+ # # └─────┘
3118
+ # def _inspect(fmt = "%s")
3119
+ # inspect = lambda do |s|
3120
+ # puts(fmt % [s])
3121
+ # s
3122
+ # end
3123
+
3124
+ # map(return_dtype: nil, agg_list: true, &inspect)
2941
3125
  # end
2942
3126
 
2943
3127
  # Fill nulls with linear interpolation over missing values.
@@ -3721,7 +3905,72 @@ module Polars
3721
3905
  )
3722
3906
  end
3723
3907
 
3724
- # def rolling_apply
3908
+ # Apply a custom rolling window function.
3909
+ #
3910
+ # Prefer the specific rolling window functions over this one, as they are faster.
3911
+ #
3912
+ # Prefer:
3913
+ # * rolling_min
3914
+ # * rolling_max
3915
+ # * rolling_mean
3916
+ # * rolling_sum
3917
+ #
3918
+ # @param window_size [Integer]
3919
+ # The length of the window.
3920
+ # @param weights [Object]
3921
+ # An optional slice with the same length as the window that will be multiplied
3922
+ # elementwise with the values in the window.
3923
+ # @param min_periods [Integer]
3924
+ # The number of values in the window that should be non-null before computing
3925
+ # a result. If nil, it will be set equal to window size.
3926
+ # @param center [Boolean]
3927
+ # Set the labels at the center of the window
3928
+ #
3929
+ # @return [Expr]
3930
+ #
3931
+ # @example
3932
+ # df = Polars::DataFrame.new(
3933
+ # {
3934
+ # "A" => [1.0, 2.0, 9.0, 2.0, 13.0]
3935
+ # }
3936
+ # )
3937
+ # df.select(
3938
+ # [
3939
+ # Polars.col("A").rolling_apply(window_size: 3) { |s| s.std }
3940
+ # ]
3941
+ # )
3942
+ # # =>
3943
+ # # shape: (5, 1)
3944
+ # # ┌──────────┐
3945
+ # # │ A │
3946
+ # # │ --- │
3947
+ # # │ f64 │
3948
+ # # ╞══════════╡
3949
+ # # │ null │
3950
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
3951
+ # # │ null │
3952
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
3953
+ # # │ 4.358899 │
3954
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
3955
+ # # │ 4.041452 │
3956
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
3957
+ # # │ 5.567764 │
3958
+ # # └──────────┘
3959
+ # def rolling_apply(
3960
+ # window_size:,
3961
+ # weights: nil,
3962
+ # min_periods: nil,
3963
+ # center: false,
3964
+ # &function
3965
+ # )
3966
+ # if min_periods.nil?
3967
+ # min_periods = window_size
3968
+ # end
3969
+ # wrap_expr(
3970
+ # _rbexpr.rolling_apply(
3971
+ # function, window_size, weights, min_periods, center
3972
+ # )
3973
+ # )
3725
3974
  # end
3726
3975
 
3727
3976
  # Compute a rolling skew.
@@ -199,12 +199,201 @@ module Polars
199
199
  dt_range
200
200
  end
201
201
 
202
- # def cut
203
- # end
202
+ # Bin values into discrete values.
203
+ #
204
+ # @param s [Series]
205
+ # Series to bin.
206
+ # @param bins [Array]
207
+ # Bins to create.
208
+ # @param labels [Array]
209
+ # Labels to assign to the bins. If given the length of labels must be
210
+ # `bins.length + 1`.
211
+ # @param break_point_label [String]
212
+ # Name given to the breakpoint column.
213
+ # @param category_label [String]
214
+ # Name given to the category column.
215
+ #
216
+ # @return [DataFrame]
217
+ #
218
+ # @note
219
+ # This functionality is experimental and may change without it being considered a
220
+ # breaking change.
221
+ #
222
+ # @example
223
+ # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
224
+ # Polars.cut(a, [-1, 1])
225
+ # # =>
226
+ # # shape: (12, 3)
227
+ # # ┌──────┬─────────────┬──────────────┐
228
+ # # │ a ┆ break_point ┆ category │
229
+ # # │ --- ┆ --- ┆ --- │
230
+ # # │ f64 ┆ f64 ┆ cat │
231
+ # # ╞══════╪═════════════╪══════════════╡
232
+ # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
233
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
234
+ # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
235
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
236
+ # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
237
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
238
+ # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
239
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
240
+ # # │ ... ┆ ... ┆ ... │
241
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
242
+ # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
243
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
244
+ # # │ 1.5 ┆ inf ┆ (1.0, inf] │
245
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
246
+ # # │ 2.0 ┆ inf ┆ (1.0, inf] │
247
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
248
+ # # │ 2.5 ┆ inf ┆ (1.0, inf] │
249
+ # # └──────┴─────────────┴──────────────┘
250
+ # def cut(
251
+ # s,
252
+ # bins,
253
+ # labels: nil,
254
+ # break_point_label: "break_point",
255
+ # category_label: "category"
256
+ # )
257
+ # var_nm = s.name
204
258
 
205
- # def align_frames
259
+ # cuts_df = DataFrame.new(
260
+ # [
261
+ # Series.new(
262
+ # break_point_label, bins, dtype: :f64
263
+ # ).extend_constant(Float::INFINITY, 1)
264
+ # ]
265
+ # )
266
+
267
+ # if labels
268
+ # if labels.length != bins.length + 1
269
+ # raise ArgumentError, "expected more labels"
270
+ # end
271
+ # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
272
+ # else
273
+ # cuts_df = cuts_df.with_column(
274
+ # Polars.format(
275
+ # "({}, {}]",
276
+ # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
277
+ # Polars.col(break_point_label)
278
+ # ).alias(category_label)
279
+ # )
280
+ # end
281
+
282
+ # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
283
+
284
+ # s.cast(:f64)
285
+ # .sort
286
+ # .to_frame
287
+ # .join_asof(
288
+ # cuts_df,
289
+ # left_on: var_nm,
290
+ # right_on: break_point_label,
291
+ # strategy: "forward"
292
+ # )
206
293
  # end
207
294
 
295
+ # Align a sequence of frames using the unique values from one or more columns as a key.
296
+ #
297
+ # Frames that do not contain the given key values have rows injected (with nulls
298
+ # filling the non-key columns), and each resulting frame is sorted by the key.
299
+ #
300
+ # The original column order of input frames is not changed unless ``select`` is
301
+ # specified (in which case the final column order is determined from that).
302
+ #
303
+ # Note that this does not result in a joined frame - you receive the same number
304
+ # of frames back that you passed in, but each is now aligned by key and has
305
+ # the same number of rows.
306
+ #
307
+ # @param frames [Array]
308
+ # Sequence of DataFrames or LazyFrames.
309
+ # @param on [Object]
310
+ # One or more columns whose unique values will be used to align the frames.
311
+ # @param select [Object]
312
+ # Optional post-alignment column select to constrain and/or order
313
+ # the columns returned from the newly aligned frames.
314
+ # @param reverse [Object]
315
+ # Sort the alignment column values in descending order; can be a single
316
+ # boolean or a list of booleans associated with each column in `on`.
317
+ #
318
+ # @return [Object]
319
+ #
320
+ # @example
321
+ # df1 = Polars::DataFrame.new(
322
+ # {
323
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
324
+ # "x" => [3.5, 4.0, 1.0],
325
+ # "y" => [10.0, 2.5, 1.5]
326
+ # }
327
+ # )
328
+ # df2 = Polars::DataFrame.new(
329
+ # {
330
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
331
+ # "x" => [8.0, 1.0, 3.5],
332
+ # "y" => [1.5, 12.0, 5.0]
333
+ # }
334
+ # )
335
+ # df3 = Polars::DataFrame.new(
336
+ # {
337
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
338
+ # "x" => [2.0, 5.0],
339
+ # "y" => [2.5, 2.0]
340
+ # }
341
+ # )
342
+ # af1, af2, af3 = Polars.align_frames(
343
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
344
+ # )
345
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
346
+ # # =>
347
+ # # shape: (3, 1)
348
+ # # ┌───────┐
349
+ # # │ dot │
350
+ # # │ --- │
351
+ # # │ f64 │
352
+ # # ╞═══════╡
353
+ # # │ 0.0 │
354
+ # # ├╌╌╌╌╌╌╌┤
355
+ # # │ 167.5 │
356
+ # # ├╌╌╌╌╌╌╌┤
357
+ # # │ 47.0 │
358
+ # # └───────┘
359
def align_frames(*frames, on:, select: nil, reverse: false)
  return [] if frames.empty?

  if frames.map(&:class).uniq.length != 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  # Build the superset of all "on" values across the inputs, de-duplicated and sorted.
  eager = frames[0].is_a?(DataFrame)
  key_frame = concat(frames.map { |frame| frame.lazy.select(on) })
  key_frame = key_frame.unique(maintain_order: false).sort(on, reverse: reverse)

  # Eager input: collect the key frame once and go back to lazy.
  # Lazy input: call `cache` on it instead.
  key_frame = eager ? key_frame.collect.lazy : key_frame.cache

  # Left-join every input onto the shared keys, keeping its original column order.
  aligned = frames.map do |frame|
    key_frame
      .join(frame.lazy, on: key_frame.columns, how: "left")
      .select(frame.columns)
  end

  # Optional post-alignment projection, applied after all joins are built.
  aligned = aligned.map { |frame| frame.select(select) } unless select.nil?

  eager ? aligned.map(&:collect) : aligned
end
+
208
397
  # Return a new Series of given length and type, filled with ones.
209
398
  #
210
399
  # @param n [Integer]
@@ -12,7 +12,48 @@ module Polars
12
12
  self.maintain_order = maintain_order
13
13
  end
14
14
 
15
- # def apply
15
+ # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
16
+ #
17
+ # Implementing logic using a Ruby function is almost always _significantly_
18
+ # slower and more memory intensive than implementing the same logic using
19
+ # the native expression API because:
20
+ #
21
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
22
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
23
+ # - Polars-native expressions can be parallelised (UDFs cannot).
24
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
25
+ #
26
+ # Wherever possible you should strongly prefer the native expression API
27
+ # to achieve the best performance.
28
+ #
29
+ # @return [DataFrame]
30
+ #
31
+ # @example
32
+ # df = Polars::DataFrame.new(
33
+ # {
34
+ # "id" => [0, 1, 2, 3, 4],
35
+ # "color" => ["red", "green", "green", "red", "red"],
36
+ # "shape" => ["square", "triangle", "square", "triangle", "square"]
37
+ # }
38
+ # )
39
+ # df.groupby("color").apply { |group_df| group_df.sample(2) }
40
+ # # =>
41
+ # # shape: (4, 3)
42
+ # # ┌─────┬───────┬──────────┐
43
+ # # │ id ┆ color ┆ shape │
44
+ # # │ --- ┆ --- ┆ --- │
45
+ # # │ i64 ┆ str ┆ str │
46
+ # # ╞═════╪═══════╪══════════╡
47
+ # # │ 1 ┆ green ┆ triangle │
48
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
49
+ # # │ 2 ┆ green ┆ square │
50
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
51
+ # # │ 4 ┆ red ┆ square │
52
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
53
+ # # │ 3 ┆ red ┆ triangle │
54
+ # # └─────┴───────┴──────────┘
55
+ # def apply(&f)
56
+ # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
16
57
  # end
17
58
 
18
59
  # Use multiple aggregations on columns.
@@ -182,8 +223,7 @@ module Polars
182
223
  _dataframe_class._from_rbdf(df._df)
183
224
  end
184
225
 
185
- # def pivot
186
- # end
226
+ # pivot is deprecated
187
227
 
188
228
  # Aggregate the first values in the group.
189
229
  #
data/lib/polars/io.rb CHANGED
@@ -59,7 +59,7 @@ module Polars
59
59
  # Lossy means that invalid utf8 values are replaced with `�`
60
60
  # characters. When using other encodings than `utf8` or
61
61
  # `utf8-lossy`, the input is first decoded in memory with
62
- # python.
62
+ # Ruby.
63
63
  # @param low_memory [Boolean]
64
64
  # Reduce memory usage at expense of performance.
65
65
  # @param rechunk [Boolean]
@@ -451,8 +451,24 @@ module Polars
451
451
  )
452
452
  end
453
453
 
454
- # def read_avro
455
- # end
454
+ # Read into a DataFrame from Apache Avro format.
455
+ #
456
+ # @param file [Object]
457
+ # Path to a file or a file-like object.
458
+ # @param columns [Object]
459
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
460
+ # of column names.
461
+ # @param n_rows [Integer]
462
+ # Stop reading from Apache Avro file after reading ``n_rows``.
463
+ #
464
+ # @return [DataFrame]
465
def read_avro(file, columns: nil, n_rows: nil)
  # Only String/Pathname inputs are run through Utils.format_path; anything
  # else (e.g. a file-like object) is passed on untouched. The `defined?`
  # guard avoids a NameError when Pathname has not been loaded.
  path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if path_like

  DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
end
456
472
 
457
473
  # Read into a DataFrame from Arrow IPC (Feather v2) file.
458
474
  #