polars-df 0.13.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
@@ -0,0 +1,613 @@
1
+ module Polars
2
+ # Starts a new GroupBy operation.
3
+ class GroupBy
4
+ # @private
5
def initialize(df, by, maintain_order: false)
  # Remember the source frame, the grouping key(s) and the ordering flag;
  # the other methods of this class read them back from these ivars.
  @df, @by, @maintain_order = df, by, maintain_order
end
10
+
11
+ # Allows iteration over the groups of the group by operation.
12
+ #
13
+ # @return [Object]
14
+ #
15
+ # @example
16
+ # df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
17
+ # df.group_by("foo", maintain_order: true).each.to_h
18
+ # # =>
19
+ # # {"a"=>shape: (2, 2)
20
+ # # ┌─────┬─────┐
21
+ # # │ foo ┆ bar │
22
+ # # │ --- ┆ --- │
23
+ # # │ str ┆ i64 │
24
+ # # ╞═════╪═════╡
25
+ # # │ a ┆ 1 │
26
+ # # │ a ┆ 2 │
27
+ # # └─────┴─────┘, "b"=>shape: (1, 2)
28
+ # # ┌─────┬─────┐
29
+ # # │ foo ┆ bar │
30
+ # # │ --- ┆ --- │
31
+ # # │ str ┆ i64 │
32
+ # # ╞═════╪═════╡
33
+ # # │ b ┆ 3 │
34
+ # # └─────┴─────┘}
35
def each
  return to_enum(:each) unless block_given?

  # Attach a row-index column, group on @by, and gather the row indices
  # that belong to each group into one list per group.
  index_col = "__POLARS_GB_GROUP_INDICES"
  indexed = @df.lazy.with_row_index(name: index_col)
  grouped = indexed.group_by(@by, maintain_order: @maintain_order)
  groups_df = grouped.agg(Polars.col(index_col)).collect(no_optimization: true)

  key_frame = groups_df.select(Polars.all.exclude(index_col))

  # When grouping by a single column, group name is a single value
  # When grouping by multiple columns, group name is a tuple of values
  single_key = @by.is_a?(::String) || @by.is_a?(Expr)
  names = single_key ? key_frame.to_series.each : key_frame.iter_rows

  row_indices = groups_df.select(index_col).to_series

  # Walk the groups in order, pairing each name with the rows of @df
  # selected by that group's index list.
  row_indices.length.times do |position|
    yield names.next, @df[row_indices[position]]
  end

  # Keep returning nil when a block is given, as the original loop did.
  nil
end
67
+
68
+ # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
69
+ #
70
+ # Implementing logic using a Ruby function is almost always _significantly_
71
+ # slower and more memory intensive than implementing the same logic using
72
+ # the native expression API because:
73
+ #
74
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
75
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
76
+ # - Polars-native expressions can be parallelised (UDFs cannot).
77
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
78
+ #
79
+ # Wherever possible you should strongly prefer the native expression API
80
+ # to achieve the best performance.
81
+ #
82
+ # @return [DataFrame]
83
+ #
84
+ # @example
85
+ # df = Polars::DataFrame.new(
86
+ # {
87
+ # "id" => [0, 1, 2, 3, 4],
88
+ # "color" => ["red", "green", "green", "red", "red"],
89
+ # "shape" => ["square", "triangle", "square", "triangle", "square"]
90
+ # }
91
+ # )
92
+ # df.group_by("color").apply { |group_df| group_df.sample(2) }
93
+ # # =>
94
+ # # shape: (4, 3)
95
+ # # ┌─────┬───────┬──────────┐
96
+ # # │ id ┆ color ┆ shape │
97
+ # # │ --- ┆ --- ┆ --- │
98
+ # # │ i64 ┆ str ┆ str │
99
+ # # ╞═════╪═══════╪══════════╡
100
+ # # │ 1 ┆ green ┆ triangle │
101
+ # # │ 2 ┆ green ┆ square │
102
+ # # │ 4 ┆ red ┆ square │
103
+ # # │ 3 ┆ red ┆ triangle │
104
+ # # └─────┴───────┴──────────┘
105
+ # def apply(&f)
106
+ # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
107
+ # end
108
+
109
+ # Compute aggregations for each group of a group by operation.
110
+ #
111
+ # @param aggs [Array]
112
+ # Aggregations to compute for each group of the group by operation,
113
+ # specified as positional arguments.
114
+ # Accepts expression input. Strings are parsed as column names.
115
+ # @param named_aggs [Hash]
116
+ # Additional aggregations, specified as keyword arguments.
117
+ # The resulting columns will be renamed to the keyword used.
118
+ #
119
+ # @return [DataFrame]
120
+ #
121
+ # @example Compute the aggregation of the columns for each group.
122
+ # df = Polars::DataFrame.new(
123
+ # {
124
+ # "a" => ["a", "b", "a", "b", "c"],
125
+ # "b" => [1, 2, 1, 3, 3],
126
+ # "c" => [5, 4, 3, 2, 1]
127
+ # }
128
+ # )
129
+ # df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
130
+ # # =>
131
+ # # shape: (3, 3)
132
+ # # ┌─────┬───────────┬───────────┐
133
+ # # │ a ┆ b ┆ c │
134
+ # # │ --- ┆ --- ┆ --- │
135
+ # # │ str ┆ list[i64] ┆ list[i64] │
136
+ # # ╞═════╪═══════════╪═══════════╡
137
+ # # │ a ┆ [1, 1] ┆ [5, 3] │
138
+ # # │ b ┆ [2, 3] ┆ [4, 2] │
139
+ # # │ c ┆ [3] ┆ [1] │
140
+ # # └─────┴───────────┴───────────┘
141
+ #
142
+ # @example Compute the sum of a column for each group.
143
+ # df.group_by("a").agg(Polars.col("b").sum)
144
+ # # =>
145
+ # # shape: (3, 2)
146
+ # # ┌─────┬─────┐
147
+ # # │ a ┆ b │
148
+ # # │ --- ┆ --- │
149
+ # # │ str ┆ i64 │
150
+ # # ╞═════╪═════╡
151
+ # # │ a ┆ 2 │
152
+ # # │ b ┆ 5 │
153
+ # # │ c ┆ 3 │
154
+ # # └─────┴─────┘
155
+ #
156
+ # @example Compute multiple aggregates at once by passing a list of expressions.
157
+ # df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
158
+ # # =>
159
+ # # shape: (3, 3)
160
+ # # ┌─────┬─────┬─────┐
161
+ # # │ a ┆ b ┆ c │
162
+ # # │ --- ┆ --- ┆ --- │
163
+ # # │ str ┆ i64 ┆ f64 │
164
+ # # ╞═════╪═════╪═════╡
165
+ # # │ c ┆ 3 ┆ 1.0 │
166
+ # # │ a ┆ 2 ┆ 4.0 │
167
+ # # │ b ┆ 5 ┆ 3.0 │
168
+ # # └─────┴─────┴─────┘
169
+ #
170
+ # @example Or use positional arguments to compute multiple aggregations in the same way.
171
+ # df.group_by("a").agg(
172
+ # Polars.sum("b").name.suffix("_sum"),
173
+ # (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
174
+ # )
175
+ # # =>
176
+ # # shape: (3, 3)
177
+ # # ┌─────┬───────┬────────────────┐
178
+ # # │ a ┆ b_sum ┆ c_mean_squared │
179
+ # # │ --- ┆ --- ┆ --- │
180
+ # # │ str ┆ i64 ┆ f64 │
181
+ # # ╞═════╪═══════╪════════════════╡
182
+ # # │ a ┆ 2 ┆ 17.0 │
183
+ # # │ c ┆ 3 ┆ 1.0 │
184
+ # # │ b ┆ 5 ┆ 10.0 │
185
+ # # └─────┴───────┴────────────────┘
186
+ #
187
+ # @example Use keyword arguments to easily name your expression inputs.
188
+ # df.group_by("a").agg(
189
+ # b_sum: Polars.sum("b"),
190
+ # c_mean_squared: (Polars.col("c") ** 2).mean
191
+ # )
192
+ # # =>
193
+ # # shape: (3, 3)
194
+ # # ┌─────┬───────┬────────────────┐
195
+ # # │ a ┆ b_sum ┆ c_mean_squared │
196
+ # # │ --- ┆ --- ┆ --- │
197
+ # # │ str ┆ i64 ┆ f64 │
198
+ # # ╞═════╪═══════╪════════════════╡
199
+ # # │ a ┆ 2 ┆ 17.0 │
200
+ # # │ c ┆ 3 ┆ 1.0 │
201
+ # # │ b ┆ 5 ┆ 10.0 │
202
+ # # └─────┴───────┴────────────────┘
203
def agg(*aggs, **named_aggs)
  # Delegate to the lazy group-by and materialize the result immediately.
  lazy_groups = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  lazy_groups.agg(*aggs, **named_aggs).collect(no_optimization: true)
end
209
+
210
+ # Get the first `n` rows of each group.
211
+ #
212
+ # @param n [Integer]
213
+ # Number of rows to return.
214
+ #
215
+ # @return [DataFrame]
216
+ #
217
+ # @example
218
+ # df = Polars::DataFrame.new(
219
+ # {
220
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
221
+ # "nrs" => [1, 2, 3, 4, 5, 6]
222
+ # }
223
+ # )
224
+ # # =>
225
+ # # shape: (6, 2)
226
+ # # ┌─────────┬─────┐
227
+ # # │ letters ┆ nrs │
228
+ # # │ --- ┆ --- │
229
+ # # │ str ┆ i64 │
230
+ # # ╞═════════╪═════╡
231
+ # # │ c ┆ 1 │
232
+ # # │ c ┆ 2 │
233
+ # # │ a ┆ 3 │
234
+ # # │ c ┆ 4 │
235
+ # # │ a ┆ 5 │
236
+ # # │ b ┆ 6 │
237
+ # # └─────────┴─────┘
238
+ #
239
+ # @example
240
+ # df.group_by("letters").head(2).sort("letters")
241
+ # # =>
242
+ # # shape: (5, 2)
243
+ # # ┌─────────┬─────┐
244
+ # # │ letters ┆ nrs │
245
+ # # │ --- ┆ --- │
246
+ # # │ str ┆ i64 │
247
+ # # ╞═════════╪═════╡
248
+ # # │ a ┆ 3 │
249
+ # # │ a ┆ 5 │
250
+ # # │ b ┆ 6 │
251
+ # # │ c ┆ 1 │
252
+ # # │ c ┆ 2 │
253
+ # # └─────────┴─────┘
254
def head(n = 5)
  # Delegate to the lazy group-by and materialize the result immediately.
  lazy_groups = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  lazy_groups.head(n).collect(no_optimization: true)
end
260
+
261
+ # Get the last `n` rows of each group.
262
+ #
263
+ # @param n [Integer]
264
+ # Number of rows to return.
265
+ #
266
+ # @return [DataFrame]
267
+ #
268
+ # @example
269
+ # df = Polars::DataFrame.new(
270
+ # {
271
+ # "letters" => ["c", "c", "a", "c", "a", "b"],
272
+ # "nrs" => [1, 2, 3, 4, 5, 6]
273
+ # }
274
+ # )
275
+ # # =>
276
+ # # shape: (6, 2)
277
+ # # ┌─────────┬─────┐
278
+ # # │ letters ┆ nrs │
279
+ # # │ --- ┆ --- │
280
+ # # │ str ┆ i64 │
281
+ # # ╞═════════╪═════╡
282
+ # # │ c ┆ 1 │
283
+ # # │ c ┆ 2 │
284
+ # # │ a ┆ 3 │
285
+ # # │ c ┆ 4 │
286
+ # # │ a ┆ 5 │
287
+ # # │ b ┆ 6 │
288
+ # # └─────────┴─────┘
289
+ #
290
+ # @example
291
+ # df.group_by("letters").tail(2).sort("letters")
292
+ # # =>
293
+ # # shape: (5, 2)
294
+ # # ┌─────────┬─────┐
295
+ # # │ letters ┆ nrs │
296
+ # # │ --- ┆ --- │
297
+ # # │ str ┆ i64 │
298
+ # # ╞═════════╪═════╡
299
+ # # │ a ┆ 3 │
300
+ # # │ a ┆ 5 │
301
+ # # │ b ┆ 6 │
302
+ # # │ c ┆ 2 │
303
+ # # │ c ┆ 4 │
304
+ # # └─────────┴─────┘
305
def tail(n = 5)
  # Delegate to the lazy group-by and materialize the result immediately.
  lazy_groups = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  lazy_groups.tail(n).collect(no_optimization: true)
end
311
+
312
+ # Aggregate the first values in the group.
313
+ #
314
+ # @return [DataFrame]
315
+ #
316
+ # @example
317
+ # df = Polars::DataFrame.new(
318
+ # {
319
+ # "a" => [1, 2, 2, 3, 4, 5],
320
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
321
+ # "c" => [true, true, true, false, false, true],
322
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
323
+ # }
324
+ # )
325
+ # df.group_by("d", maintain_order: true).first
326
+ # # =>
327
+ # # shape: (3, 4)
328
+ # # ┌────────┬─────┬──────┬───────┐
329
+ # # │ d ┆ a ┆ b ┆ c │
330
+ # # │ --- ┆ --- ┆ --- ┆ --- │
331
+ # # │ str ┆ i64 ┆ f64 ┆ bool │
332
+ # # ╞════════╪═════╪══════╪═══════╡
333
+ # # │ Apple ┆ 1 ┆ 0.5 ┆ true │
334
+ # # │ Orange ┆ 2 ┆ 0.5 ┆ true │
335
+ # # │ Banana ┆ 4 ┆ 13.0 ┆ false │
336
+ # # └────────┴─────┴──────┴───────┘
337
def first
  # Apply `first` to every column selected by Polars.all, per group.
  first_of_each = Polars.all.first
  agg(first_of_each)
end
340
+
341
+ # Aggregate the last values in the group.
342
+ #
343
+ # @return [DataFrame]
344
+ #
345
+ # @example
346
+ # df = Polars::DataFrame.new(
347
+ # {
348
+ # "a" => [1, 2, 2, 3, 4, 5],
349
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
350
+ # "c" => [true, true, true, false, false, true],
351
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
352
+ # }
353
+ # )
354
+ # df.group_by("d", maintain_order: true).last
355
+ # # =>
356
+ # # shape: (3, 4)
357
+ # # ┌────────┬─────┬──────┬───────┐
358
+ # # │ d ┆ a ┆ b ┆ c │
359
+ # # │ --- ┆ --- ┆ --- ┆ --- │
360
+ # # │ str ┆ i64 ┆ f64 ┆ bool │
361
+ # # ╞════════╪═════╪══════╪═══════╡
362
+ # # │ Apple ┆ 3 ┆ 10.0 ┆ false │
363
+ # # │ Orange ┆ 2 ┆ 0.5 ┆ true │
364
+ # # │ Banana ┆ 5 ┆ 14.0 ┆ true │
365
+ # # └────────┴─────┴──────┴───────┘
366
def last
  # Apply `last` to every column selected by Polars.all, per group.
  last_of_each = Polars.all.last
  agg(last_of_each)
end
369
+
370
+ # Reduce the groups to the sum.
371
+ #
372
+ # @return [DataFrame]
373
+ #
374
+ # @example
375
+ # df = Polars::DataFrame.new(
376
+ # {
377
+ # "a" => [1, 2, 2, 3, 4, 5],
378
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
379
+ # "c" => [true, true, true, false, false, true],
380
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
381
+ # }
382
+ # )
383
+ # df.group_by("d", maintain_order: true).sum
384
+ # # =>
385
+ # # shape: (3, 4)
386
+ # # ┌────────┬─────┬──────┬─────┐
387
+ # # │ d ┆ a ┆ b ┆ c │
388
+ # # │ --- ┆ --- ┆ --- ┆ --- │
389
+ # # │ str ┆ i64 ┆ f64 ┆ u32 │
390
+ # # ╞════════╪═════╪══════╪═════╡
391
+ # # │ Apple ┆ 6 ┆ 14.5 ┆ 2 │
392
+ # # │ Orange ┆ 2 ┆ 0.5 ┆ 1 │
393
+ # # │ Banana ┆ 9 ┆ 27.0 ┆ 1 │
394
+ # # └────────┴─────┴──────┴─────┘
395
def sum
  # Apply `sum` to every column selected by Polars.all, per group.
  sum_of_each = Polars.all.sum
  agg(sum_of_each)
end
398
+
399
+ # Reduce the groups to the minimal value.
400
+ #
401
+ # @return [DataFrame]
402
+ #
403
+ # @example
404
+ # df = Polars::DataFrame.new(
405
+ # {
406
+ # "a" => [1, 2, 2, 3, 4, 5],
407
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
408
+ # "c" => [true, true, true, false, false, true],
409
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
410
+ # }
411
+ # )
412
+ # df.group_by("d", maintain_order: true).min
413
+ # # =>
414
+ # # shape: (3, 4)
415
+ # # ┌────────┬─────┬──────┬───────┐
416
+ # # │ d ┆ a ┆ b ┆ c │
417
+ # # │ --- ┆ --- ┆ --- ┆ --- │
418
+ # # │ str ┆ i64 ┆ f64 ┆ bool │
419
+ # # ╞════════╪═════╪══════╪═══════╡
420
+ # # │ Apple ┆ 1 ┆ 0.5 ┆ false │
421
+ # # │ Orange ┆ 2 ┆ 0.5 ┆ true │
422
+ # # │ Banana ┆ 4 ┆ 13.0 ┆ false │
423
+ # # └────────┴─────┴──────┴───────┘
424
def min
  # Apply `min` to every column selected by Polars.all, per group.
  min_of_each = Polars.all.min
  agg(min_of_each)
end
427
+
428
+ # Reduce the groups to the maximal value.
429
+ #
430
+ # @return [DataFrame]
431
+ #
432
+ # @example
433
+ # df = Polars::DataFrame.new(
434
+ # {
435
+ # "a" => [1, 2, 2, 3, 4, 5],
436
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
437
+ # "c" => [true, true, true, false, false, true],
438
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
439
+ # }
440
+ # )
441
+ # df.group_by("d", maintain_order: true).max
442
+ # # =>
443
+ # # shape: (3, 4)
444
+ # # ┌────────┬─────┬──────┬──────┐
445
+ # # │ d ┆ a ┆ b ┆ c │
446
+ # # │ --- ┆ --- ┆ --- ┆ --- │
447
+ # # │ str ┆ i64 ┆ f64 ┆ bool │
448
+ # # ╞════════╪═════╪══════╪══════╡
449
+ # # │ Apple ┆ 3 ┆ 10.0 ┆ true │
450
+ # # │ Orange ┆ 2 ┆ 0.5 ┆ true │
451
+ # # │ Banana ┆ 5 ┆ 14.0 ┆ true │
452
+ # # └────────┴─────┴──────┴──────┘
453
def max
  # Apply `max` to every column selected by Polars.all, per group.
  max_of_each = Polars.all.max
  agg(max_of_each)
end
456
+
457
+ # Count the number of values in each group.
458
+ #
459
+ # @return [DataFrame]
460
+ #
461
+ # @example
462
+ # df = Polars::DataFrame.new(
463
+ # {
464
+ # "a" => [1, 2, 2, 3, 4, 5],
465
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
466
+ # "c" => [true, true, true, false, false, true],
467
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
468
+ # }
469
+ # )
470
+ # df.group_by("d", maintain_order: true).count
471
+ # # =>
472
+ # # shape: (3, 2)
473
+ # # ┌────────┬───────┐
474
+ # # │ d ┆ count │
475
+ # # │ --- ┆ --- │
476
+ # # │ str ┆ u32 │
477
+ # # ╞════════╪═══════╡
478
+ # # │ Apple ┆ 3 │
479
+ # # │ Orange ┆ 1 │
480
+ # # │ Banana ┆ 2 │
481
+ # # └────────┴───────┘
482
def count
  # Per-group length, exposed under the column name "count".
  group_size = Polars.len.alias("count")
  agg(group_size)
end
485
+
486
+ # Reduce the groups to the mean values.
487
+ #
488
+ # @return [DataFrame]
489
+ #
490
+ # @example
491
+ # df = Polars::DataFrame.new(
492
+ # {
493
+ # "a" => [1, 2, 2, 3, 4, 5],
494
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
495
+ # "c" => [true, true, true, false, false, true],
496
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
497
+ # }
498
+ # )
499
+ # df.group_by("d", maintain_order: true).mean
500
+ # # =>
501
+ # # shape: (3, 4)
502
+ # # ┌────────┬─────┬──────────┬──────────┐
503
+ # # │ d ┆ a ┆ b ┆ c │
504
+ # # │ --- ┆ --- ┆ --- ┆ --- │
505
+ # # │ str ┆ f64 ┆ f64 ┆ f64 │
506
+ # # ╞════════╪═════╪══════════╪══════════╡
507
+ # # │ Apple ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
508
+ # # │ Orange ┆ 2.0 ┆ 0.5 ┆ 1.0 │
509
+ # # │ Banana ┆ 4.5 ┆ 13.5 ┆ 0.5 │
510
+ # # └────────┴─────┴──────────┴──────────┘
511
def mean
  # Apply `mean` to every column selected by Polars.all, per group.
  mean_of_each = Polars.all.mean
  agg(mean_of_each)
end
514
+
515
+ # Count the unique values per group.
516
+ #
517
+ # @return [DataFrame]
518
+ #
519
+ # @example
520
+ # df = Polars::DataFrame.new(
521
+ # {
522
+ # "a" => [1, 2, 1, 3, 4, 5],
523
+ # "b" => [0.5, 0.5, 0.5, 10, 13, 14],
524
+ # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
525
+ # }
526
+ # )
527
+ # df.group_by("d", maintain_order: true).n_unique
528
+ # # =>
529
+ # # shape: (2, 3)
530
+ # # ┌────────┬─────┬─────┐
531
+ # # │ d ┆ a ┆ b │
532
+ # # │ --- ┆ --- ┆ --- │
533
+ # # │ str ┆ u32 ┆ u32 │
534
+ # # ╞════════╪═════╪═════╡
535
+ # # │ Apple ┆ 2 ┆ 2 │
536
+ # # │ Banana ┆ 3 ┆ 3 │
537
+ # # └────────┴─────┴─────┘
538
def n_unique
  # Apply `n_unique` to every column selected by Polars.all, per group.
  distinct_of_each = Polars.all.n_unique
  agg(distinct_of_each)
end
541
+
542
+ # Compute the quantile per group.
543
+ #
544
+ # @param quantile [Float]
545
+ # Quantile between 0.0 and 1.0.
546
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
547
+ # Interpolation method.
548
+ #
549
+ # @return [DataFrame]
550
+ #
551
+ # @example
552
+ # df = Polars::DataFrame.new(
553
+ # {
554
+ # "a" => [1, 2, 2, 3, 4, 5],
555
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
556
+ # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
557
+ # }
558
+ # )
559
+ # df.group_by("d", maintain_order: true).quantile(1)
560
+ # # =>
561
+ # # shape: (3, 3)
562
+ # # ┌────────┬─────┬──────┐
563
+ # # │ d ┆ a ┆ b │
564
+ # # │ --- ┆ --- ┆ --- │
565
+ # # │ str ┆ f64 ┆ f64 │
566
+ # # ╞════════╪═════╪══════╡
567
+ # # │ Apple ┆ 3.0 ┆ 10.0 │
568
+ # # │ Orange ┆ 2.0 ┆ 0.5 │
569
+ # # │ Banana ┆ 5.0 ┆ 14.0 │
570
+ # # └────────┴─────┴──────┘
571
def quantile(quantile, interpolation: "nearest")
  # Apply the requested quantile to every column selected by Polars.all,
  # forwarding the interpolation method unchanged.
  quantile_of_each = Polars.all.quantile(quantile, interpolation: interpolation)
  agg(quantile_of_each)
end
574
+
575
+ # Return the median per group.
576
+ #
577
+ # @return [DataFrame]
578
+ #
579
+ # @example
580
+ # df = Polars::DataFrame.new(
581
+ # {
582
+ # "a" => [1, 2, 2, 3, 4, 5],
583
+ # "b" => [0.5, 0.5, 4, 10, 13, 14],
584
+ # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
585
+ # }
586
+ # )
587
+ # df.group_by("d", maintain_order: true).median
588
+ # # =>
589
+ # # shape: (2, 3)
590
+ # # ┌────────┬─────┬──────┐
591
+ # # │ d ┆ a ┆ b │
592
+ # # │ --- ┆ --- ┆ --- │
593
+ # # │ str ┆ f64 ┆ f64 │
594
+ # # ╞════════╪═════╪══════╡
595
+ # # │ Apple ┆ 2.0 ┆ 4.0 │
596
+ # # │ Banana ┆ 4.0 ┆ 13.0 │
597
+ # # └────────┴─────┴──────┘
598
def median
  # Apply `median` to every column selected by Polars.all, per group.
  median_of_each = Polars.all.median
  agg(median_of_each)
end
601
+
602
+ # Plot data.
603
+ #
604
+ # @return [Vega::LiteChart]
605
def plot(*args, **options)
  # Only a single grouping key can be forwarded as the plot's `group`.
  several_keys = @by.is_a?(::Array) && @by.size > 1
  raise ArgumentError, "Multiple groups not supported" if several_keys

  # `group` is supplied by this method, so callers may not pass it.
  # same message as Ruby
  if options.key?(:group)
    raise ArgumentError, "unknown keyword: :group"
  end

  @df.plot(*args, **options, group: @by)
end
612
+ end
613
+ end
@@ -0,0 +1,24 @@
1
+ module Polars
2
+ module IO
3
+ # Read into a DataFrame from Apache Avro format.
4
+ #
5
+ # @param source [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param columns [Object]
8
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
9
+ # of column names.
10
+ # @param n_rows [Integer]
11
+ # Stop reading from Apache Avro file after reading ``n_rows``.
12
+ #
13
+ # @return [DataFrame]
14
def read_avro(source, columns: nil, n_rows: nil)
  # Path-like sources are normalized first; anything else is passed through.
  source = Utils.normalize_filepath(source) if Utils.pathlike?(source)

  # Split `columns` into the projection and column-name arguments that the
  # native reader takes separately.
  projection, column_names = Utils.handle_projection_columns(columns)

  Utils.wrap_df(RbDataFrame.read_avro(source, column_names, projection, n_rows))
end
23
+ end
24
+ end