polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,613 @@
|
|
1
|
+
module Polars
|
2
|
+
# Starts a new GroupBy operation.
|
3
|
+
class GroupBy
|
4
|
+
# @private
|
5
|
+
def initialize(df, by, maintain_order: false)
  # Remember the frame and the grouping key(s); the other methods read these
  # instance variables every time they build a query.
  @df, @by = df, by
  @maintain_order = maintain_order
end
|
10
|
+
|
11
|
+
# Allows iteration over the groups of the group by operation.
|
12
|
+
#
|
13
|
+
# @return [Enumerator, nil] an Enumerator when called without a block, otherwise nil
|
14
|
+
#
|
15
|
+
# @example
|
16
|
+
# df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
|
17
|
+
# df.group_by("foo", maintain_order: true).each.to_h
|
18
|
+
# # =>
|
19
|
+
# # {"a"=>shape: (2, 2)
|
20
|
+
# # ┌─────┬─────┐
|
21
|
+
# # │ foo ┆ bar │
|
22
|
+
# # │ --- ┆ --- │
|
23
|
+
# # │ str ┆ i64 │
|
24
|
+
# # ╞═════╪═════╡
|
25
|
+
# # │ a ┆ 1 │
|
26
|
+
# # │ a ┆ 2 │
|
27
|
+
# # └─────┴─────┘, "b"=>shape: (1, 2)
|
28
|
+
# # ┌─────┬─────┐
|
29
|
+
# # │ foo ┆ bar │
|
30
|
+
# # │ --- ┆ --- │
|
31
|
+
# # │ str ┆ i64 │
|
32
|
+
# # ╞═════╪═════╡
|
33
|
+
# # │ b ┆ 3 │
|
34
|
+
# # └─────┴─────┘}
|
35
|
+
def each
  return to_enum(:each) unless block_given?

  index_col = "__POLARS_GB_GROUP_INDICES"

  # Tag each row with a row index, then collect those indices per group.
  lazy_groups =
    @df.lazy
      .with_row_index(name: index_col)
      .group_by(@by, maintain_order: @maintain_order)
      .agg(Polars.col(index_col))
  groups_df = lazy_groups.collect(no_optimization: true)

  # Everything except the helper column is the group key.
  keys_df = groups_df.select(Polars.all.exclude(index_col))

  # When grouping by a single column, the group name is a single value.
  # When grouping by multiple columns, the group name is a tuple of values.
  single_key = @by.is_a?(::String) || @by.is_a?(Expr)
  names = single_key ? keys_df.to_series.each : keys_df.iter_rows

  row_indices = groups_df.select(index_col).to_series

  # Step through the names and the per-group row indices together; the name
  # is fetched before the sub-frame is built.
  row_indices.length.times do |position|
    yield names.next, @df[row_indices[position]]
  end

  # The original `while` loop evaluated to nil; keep that return value.
  nil
end
|
67
|
+
|
68
|
+
# Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
|
69
|
+
#
|
70
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
71
|
+
# slower and more memory intensive than implementing the same logic using
|
72
|
+
# the native expression API because:
|
73
|
+
|
74
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
75
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
76
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
77
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
78
|
+
#
|
79
|
+
# Wherever possible you should strongly prefer the native expression API
|
80
|
+
# to achieve the best performance.
|
81
|
+
#
|
82
|
+
# @return [DataFrame]
|
83
|
+
#
|
84
|
+
# @example
|
85
|
+
# df = Polars::DataFrame.new(
|
86
|
+
# {
|
87
|
+
# "id" => [0, 1, 2, 3, 4],
|
88
|
+
# "color" => ["red", "green", "green", "red", "red"],
|
89
|
+
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
90
|
+
# }
|
91
|
+
# )
|
92
|
+
# df.group_by("color").apply { |group_df| group_df.sample(2) }
|
93
|
+
# # =>
|
94
|
+
# # shape: (4, 3)
|
95
|
+
# # ┌─────┬───────┬──────────┐
|
96
|
+
# # │ id ┆ color ┆ shape │
|
97
|
+
# # │ --- ┆ --- ┆ --- │
|
98
|
+
# # │ i64 ┆ str ┆ str │
|
99
|
+
# # ╞═════╪═══════╪══════════╡
|
100
|
+
# # │ 1 ┆ green ┆ triangle │
|
101
|
+
# # │ 2 ┆ green ┆ square │
|
102
|
+
# # │ 4 ┆ red ┆ square │
|
103
|
+
# # │ 3 ┆ red ┆ triangle │
|
104
|
+
# # └─────┴───────┴──────────┘
|
105
|
+
# def apply(&f)
|
106
|
+
# _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
|
107
|
+
# end
|
108
|
+
|
109
|
+
# Compute aggregations for each group of a group by operation.
|
110
|
+
#
|
111
|
+
# @param aggs [Array]
|
112
|
+
# Aggregations to compute for each group of the group by operation,
|
113
|
+
# specified as positional arguments.
|
114
|
+
# Accepts expression input. Strings are parsed as column names.
|
115
|
+
# @param named_aggs [Hash]
|
116
|
+
# Additional aggregations, specified as keyword arguments.
|
117
|
+
# The resulting columns will be renamed to the keyword used.
|
118
|
+
#
|
119
|
+
# @return [DataFrame]
|
120
|
+
#
|
121
|
+
# @example Compute the aggregation of the columns for each group.
|
122
|
+
# df = Polars::DataFrame.new(
|
123
|
+
# {
|
124
|
+
# "a" => ["a", "b", "a", "b", "c"],
|
125
|
+
# "b" => [1, 2, 1, 3, 3],
|
126
|
+
# "c" => [5, 4, 3, 2, 1]
|
127
|
+
# }
|
128
|
+
# )
|
129
|
+
# df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
|
130
|
+
# # =>
|
131
|
+
# # shape: (3, 3)
|
132
|
+
# # ┌─────┬───────────┬───────────┐
|
133
|
+
# # │ a ┆ b ┆ c │
|
134
|
+
# # │ --- ┆ --- ┆ --- │
|
135
|
+
# # │ str ┆ list[i64] ┆ list[i64] │
|
136
|
+
# # ╞═════╪═══════════╪═══════════╡
|
137
|
+
# # │ a ┆ [1, 1] ┆ [5, 3] │
|
138
|
+
# # │ b ┆ [2, 3] ┆ [4, 2] │
|
139
|
+
# # │ c ┆ [3] ┆ [1] │
|
140
|
+
# # └─────┴───────────┴───────────┘
|
141
|
+
#
|
142
|
+
# @example Compute the sum of a column for each group.
|
143
|
+
# df.group_by("a").agg(Polars.col("b").sum)
|
144
|
+
# # =>
|
145
|
+
# # shape: (3, 2)
|
146
|
+
# # ┌─────┬─────┐
|
147
|
+
# # │ a ┆ b │
|
148
|
+
# # │ --- ┆ --- │
|
149
|
+
# # │ str ┆ i64 │
|
150
|
+
# # ╞═════╪═════╡
|
151
|
+
# # │ a ┆ 2 │
|
152
|
+
# # │ b ┆ 5 │
|
153
|
+
# # │ c ┆ 3 │
|
154
|
+
# # └─────┴─────┘
|
155
|
+
#
|
156
|
+
# @example Compute multiple aggregates at once by passing a list of expressions.
|
157
|
+
# df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
|
158
|
+
# # =>
|
159
|
+
# # shape: (3, 3)
|
160
|
+
# # ┌─────┬─────┬─────┐
|
161
|
+
# # │ a ┆ b ┆ c │
|
162
|
+
# # │ --- ┆ --- ┆ --- │
|
163
|
+
# # │ str ┆ i64 ┆ f64 │
|
164
|
+
# # ╞═════╪═════╪═════╡
|
165
|
+
# # │ c ┆ 3 ┆ 1.0 │
|
166
|
+
# # │ a ┆ 2 ┆ 4.0 │
|
167
|
+
# # │ b ┆ 5 ┆ 3.0 │
|
168
|
+
# # └─────┴─────┴─────┘
|
169
|
+
#
|
170
|
+
# @example Or use positional arguments to compute multiple aggregations in the same way.
|
171
|
+
# df.group_by("a").agg(
|
172
|
+
# Polars.sum("b").name.suffix("_sum"),
|
173
|
+
# (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
|
174
|
+
# )
|
175
|
+
# # =>
|
176
|
+
# # shape: (3, 3)
|
177
|
+
# # ┌─────┬───────┬────────────────┐
|
178
|
+
# # │ a ┆ b_sum ┆ c_mean_squared │
|
179
|
+
# # │ --- ┆ --- ┆ --- │
|
180
|
+
# # │ str ┆ i64 ┆ f64 │
|
181
|
+
# # ╞═════╪═══════╪════════════════╡
|
182
|
+
# # │ a ┆ 2 ┆ 17.0 │
|
183
|
+
# # │ c ┆ 3 ┆ 1.0 │
|
184
|
+
# # │ b ┆ 5 ┆ 10.0 │
|
185
|
+
# # └─────┴───────┴────────────────┘
|
186
|
+
#
|
187
|
+
# @example Use keyword arguments to easily name your expression inputs.
|
188
|
+
# df.group_by("a").agg(
|
189
|
+
# b_sum: Polars.sum("b"),
|
190
|
+
# c_mean_squared: (Polars.col("c") ** 2).mean
|
191
|
+
# )
|
192
|
+
# # =>
|
193
|
+
# # shape: (3, 3)
|
194
|
+
# # ┌─────┬───────┬────────────────┐
|
195
|
+
# # │ a ┆ b_sum ┆ c_mean_squared │
|
196
|
+
# # │ --- ┆ --- ┆ --- │
|
197
|
+
# # │ str ┆ i64 ┆ f64 │
|
198
|
+
# # ╞═════╪═══════╪════════════════╡
|
199
|
+
# # │ a ┆ 2 ┆ 17.0 │
|
200
|
+
# # │ c ┆ 3 ┆ 1.0 │
|
201
|
+
# # │ b ┆ 5 ┆ 10.0 │
|
202
|
+
# # └─────┴───────┴────────────────┘
|
203
|
+
def agg(*aggs, **named_aggs)
  # Build the grouped query on the lazy frame, apply the aggregations, then
  # collect the result with optimizations switched off.
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  aggregated = grouped.agg(*aggs, **named_aggs)
  aggregated.collect(no_optimization: true)
end
|
209
|
+
|
210
|
+
# Get the first `n` rows of each group.
|
211
|
+
#
|
212
|
+
# @param n [Integer]
|
213
|
+
# Number of rows to return.
|
214
|
+
#
|
215
|
+
# @return [DataFrame]
|
216
|
+
#
|
217
|
+
# @example
|
218
|
+
# df = Polars::DataFrame.new(
|
219
|
+
# {
|
220
|
+
# "letters" => ["c", "c", "a", "c", "a", "b"],
|
221
|
+
# "nrs" => [1, 2, 3, 4, 5, 6]
|
222
|
+
# }
|
223
|
+
# )
|
224
|
+
# # =>
|
225
|
+
# # shape: (6, 2)
|
226
|
+
# # ┌─────────┬─────┐
|
227
|
+
# # │ letters ┆ nrs │
|
228
|
+
# # │ --- ┆ --- │
|
229
|
+
# # │ str ┆ i64 │
|
230
|
+
# # ╞═════════╪═════╡
|
231
|
+
# # │ c ┆ 1 │
|
232
|
+
# # │ c ┆ 2 │
|
233
|
+
# # │ a ┆ 3 │
|
234
|
+
# # │ c ┆ 4 │
|
235
|
+
# # │ a ┆ 5 │
|
236
|
+
# # │ b ┆ 6 │
|
237
|
+
# # └─────────┴─────┘
|
238
|
+
#
|
239
|
+
# @example
|
240
|
+
# df.group_by("letters").head(2).sort("letters")
|
241
|
+
# # =>
|
242
|
+
# # shape: (5, 2)
|
243
|
+
# # ┌─────────┬─────┐
|
244
|
+
# # │ letters ┆ nrs │
|
245
|
+
# # │ --- ┆ --- │
|
246
|
+
# # │ str ┆ i64 │
|
247
|
+
# # ╞═════════╪═════╡
|
248
|
+
# # │ a ┆ 3 │
|
249
|
+
# # │ a ┆ 5 │
|
250
|
+
# # │ b ┆ 6 │
|
251
|
+
# # │ c ┆ 1 │
|
252
|
+
# # │ c ┆ 2 │
|
253
|
+
# # └─────────┴─────┘
|
254
|
+
def head(n = 5)
  # Group on the lazy frame, take the head of each group, then collect with
  # optimizations switched off.
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  leading_rows = grouped.head(n)
  leading_rows.collect(no_optimization: true)
end
|
260
|
+
|
261
|
+
# Get the last `n` rows of each group.
|
262
|
+
#
|
263
|
+
# @param n [Integer]
|
264
|
+
# Number of rows to return.
|
265
|
+
#
|
266
|
+
# @return [DataFrame]
|
267
|
+
#
|
268
|
+
# @example
|
269
|
+
# df = Polars::DataFrame.new(
|
270
|
+
# {
|
271
|
+
# "letters" => ["c", "c", "a", "c", "a", "b"],
|
272
|
+
# "nrs" => [1, 2, 3, 4, 5, 6]
|
273
|
+
# }
|
274
|
+
# )
|
275
|
+
# # =>
|
276
|
+
# # shape: (6, 2)
|
277
|
+
# # ┌─────────┬─────┐
|
278
|
+
# # │ letters ┆ nrs │
|
279
|
+
# # │ --- ┆ --- │
|
280
|
+
# # │ str ┆ i64 │
|
281
|
+
# # ╞═════════╪═════╡
|
282
|
+
# # │ c ┆ 1 │
|
283
|
+
# # │ c ┆ 2 │
|
284
|
+
# # │ a ┆ 3 │
|
285
|
+
# # │ c ┆ 4 │
|
286
|
+
# # │ a ┆ 5 │
|
287
|
+
# # │ b ┆ 6 │
|
288
|
+
# # └─────────┴─────┘
|
289
|
+
#
|
290
|
+
# @example
|
291
|
+
# df.group_by("letters").tail(2).sort("letters")
|
292
|
+
# # =>
|
293
|
+
# # shape: (5, 2)
|
294
|
+
# # ┌─────────┬─────┐
|
295
|
+
# # │ letters ┆ nrs │
|
296
|
+
# # │ --- ┆ --- │
|
297
|
+
# # │ str ┆ i64 │
|
298
|
+
# # ╞═════════╪═════╡
|
299
|
+
# # │ a ┆ 3 │
|
300
|
+
# # │ a ┆ 5 │
|
301
|
+
# # │ b ┆ 6 │
|
302
|
+
# # │ c ┆ 2 │
|
303
|
+
# # │ c ┆ 4 │
|
304
|
+
# # └─────────┴─────┘
|
305
|
+
def tail(n = 5)
  # Group on the lazy frame, take the tail of each group, then collect with
  # optimizations switched off.
  grouped = @df.lazy.group_by(@by, maintain_order: @maintain_order)
  trailing_rows = grouped.tail(n)
  trailing_rows.collect(no_optimization: true)
end
|
311
|
+
|
312
|
+
# Aggregate the first values in the group.
|
313
|
+
#
|
314
|
+
# @return [DataFrame]
|
315
|
+
#
|
316
|
+
# @example
|
317
|
+
# df = Polars::DataFrame.new(
|
318
|
+
# {
|
319
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
320
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
321
|
+
# "c" => [true, true, true, false, false, true],
|
322
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
323
|
+
# }
|
324
|
+
# )
|
325
|
+
# df.group_by("d", maintain_order: true).first
|
326
|
+
# # =>
|
327
|
+
# # shape: (3, 4)
|
328
|
+
# # ┌────────┬─────┬──────┬───────┐
|
329
|
+
# # │ d ┆ a ┆ b ┆ c │
|
330
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
331
|
+
# # │ str ┆ i64 ┆ f64 ┆ bool │
|
332
|
+
# # ╞════════╪═════╪══════╪═══════╡
|
333
|
+
# # │ Apple ┆ 1 ┆ 0.5 ┆ true │
|
334
|
+
# # │ Orange ┆ 2 ┆ 0.5 ┆ true │
|
335
|
+
# # │ Banana ┆ 4 ┆ 13.0 ┆ false │
|
336
|
+
# # └────────┴─────┴──────┴───────┘
|
337
|
+
def first
  # Apply `first` to `Polars.all` and hand the expression to #agg.
  first_values = Polars.all.first
  agg(first_values)
end
|
340
|
+
|
341
|
+
# Aggregate the last values in the group.
|
342
|
+
#
|
343
|
+
# @return [DataFrame]
|
344
|
+
#
|
345
|
+
# @example
|
346
|
+
# df = Polars::DataFrame.new(
|
347
|
+
# {
|
348
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
349
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
350
|
+
# "c" => [true, true, true, false, false, true],
|
351
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
352
|
+
# }
|
353
|
+
# )
|
354
|
+
# df.group_by("d", maintain_order: true).last
|
355
|
+
# # =>
|
356
|
+
# # shape: (3, 4)
|
357
|
+
# # ┌────────┬─────┬──────┬───────┐
|
358
|
+
# # │ d ┆ a ┆ b ┆ c │
|
359
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
360
|
+
# # │ str ┆ i64 ┆ f64 ┆ bool │
|
361
|
+
# # ╞════════╪═════╪══════╪═══════╡
|
362
|
+
# # │ Apple ┆ 3 ┆ 10.0 ┆ false │
|
363
|
+
# # │ Orange ┆ 2 ┆ 0.5 ┆ true │
|
364
|
+
# # │ Banana ┆ 5 ┆ 14.0 ┆ true │
|
365
|
+
# # └────────┴─────┴──────┴───────┘
|
366
|
+
def last
  # Apply `last` to `Polars.all` and hand the expression to #agg.
  last_values = Polars.all.last
  agg(last_values)
end
|
369
|
+
|
370
|
+
# Reduce the groups to the sum.
|
371
|
+
#
|
372
|
+
# @return [DataFrame]
|
373
|
+
#
|
374
|
+
# @example
|
375
|
+
# df = Polars::DataFrame.new(
|
376
|
+
# {
|
377
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
378
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
379
|
+
# "c" => [true, true, true, false, false, true],
|
380
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
381
|
+
# }
|
382
|
+
# )
|
383
|
+
# df.group_by("d", maintain_order: true).sum
|
384
|
+
# # =>
|
385
|
+
# # shape: (3, 4)
|
386
|
+
# # ┌────────┬─────┬──────┬─────┐
|
387
|
+
# # │ d ┆ a ┆ b ┆ c │
|
388
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
389
|
+
# # │ str ┆ i64 ┆ f64 ┆ u32 │
|
390
|
+
# # ╞════════╪═════╪══════╪═════╡
|
391
|
+
# # │ Apple ┆ 6 ┆ 14.5 ┆ 2 │
|
392
|
+
# # │ Orange ┆ 2 ┆ 0.5 ┆ 1 │
|
393
|
+
# # │ Banana ┆ 9 ┆ 27.0 ┆ 1 │
|
394
|
+
# # └────────┴─────┴──────┴─────┘
|
395
|
+
def sum
  # Apply `sum` to `Polars.all` and hand the expression to #agg.
  totals = Polars.all.sum
  agg(totals)
end
|
398
|
+
|
399
|
+
# Reduce the groups to the minimal value.
|
400
|
+
#
|
401
|
+
# @return [DataFrame]
|
402
|
+
#
|
403
|
+
# @example
|
404
|
+
# df = Polars::DataFrame.new(
|
405
|
+
# {
|
406
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
407
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
408
|
+
# "c" => [true, true, true, false, false, true],
|
409
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
|
410
|
+
# }
|
411
|
+
# )
|
412
|
+
# df.group_by("d", maintain_order: true).min
|
413
|
+
# # =>
|
414
|
+
# # shape: (3, 4)
|
415
|
+
# # ┌────────┬─────┬──────┬───────┐
|
416
|
+
# # │ d ┆ a ┆ b ┆ c │
|
417
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
418
|
+
# # │ str ┆ i64 ┆ f64 ┆ bool │
|
419
|
+
# # ╞════════╪═════╪══════╪═══════╡
|
420
|
+
# # │ Apple ┆ 1 ┆ 0.5 ┆ false │
|
421
|
+
# # │ Orange ┆ 2 ┆ 0.5 ┆ true │
|
422
|
+
# # │ Banana ┆ 4 ┆ 13.0 ┆ false │
|
423
|
+
# # └────────┴─────┴──────┴───────┘
|
424
|
+
def min
  # Apply `min` to `Polars.all` and hand the expression to #agg.
  minimums = Polars.all.min
  agg(minimums)
end
|
427
|
+
|
428
|
+
# Reduce the groups to the maximal value.
|
429
|
+
#
|
430
|
+
# @return [DataFrame]
|
431
|
+
#
|
432
|
+
# @example
|
433
|
+
# df = Polars::DataFrame.new(
|
434
|
+
# {
|
435
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
436
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
437
|
+
# "c" => [true, true, true, false, false, true],
|
438
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
439
|
+
# }
|
440
|
+
# )
|
441
|
+
# df.group_by("d", maintain_order: true).max
|
442
|
+
# # =>
|
443
|
+
# # shape: (3, 4)
|
444
|
+
# # ┌────────┬─────┬──────┬──────┐
|
445
|
+
# # │ d ┆ a ┆ b ┆ c │
|
446
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
447
|
+
# # │ str ┆ i64 ┆ f64 ┆ bool │
|
448
|
+
# # ╞════════╪═════╪══════╪══════╡
|
449
|
+
# # │ Apple ┆ 3 ┆ 10.0 ┆ true │
|
450
|
+
# # │ Orange ┆ 2 ┆ 0.5 ┆ true │
|
451
|
+
# # │ Banana ┆ 5 ┆ 14.0 ┆ true │
|
452
|
+
# # └────────┴─────┴──────┴──────┘
|
453
|
+
def max
  # Apply `max` to `Polars.all` and hand the expression to #agg.
  maximums = Polars.all.max
  agg(maximums)
end
|
456
|
+
|
457
|
+
# Count the number of values in each group.
|
458
|
+
#
|
459
|
+
# @return [DataFrame]
|
460
|
+
#
|
461
|
+
# @example
|
462
|
+
# df = Polars::DataFrame.new(
|
463
|
+
# {
|
464
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
465
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
466
|
+
# "c" => [true, true, true, false, false, true],
|
467
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
468
|
+
# }
|
469
|
+
# )
|
470
|
+
# df.group_by("d", maintain_order: true).count
|
471
|
+
# # =>
|
472
|
+
# # shape: (3, 2)
|
473
|
+
# # ┌────────┬───────┐
|
474
|
+
# # │ d ┆ count │
|
475
|
+
# # │ --- ┆ --- │
|
476
|
+
# # │ str ┆ u32 │
|
477
|
+
# # ╞════════╪═══════╡
|
478
|
+
# # │ Apple ┆ 3 │
|
479
|
+
# # │ Orange ┆ 1 │
|
480
|
+
# # │ Banana ┆ 2 │
|
481
|
+
# # └────────┴───────┘
|
482
|
+
def count
  # `Polars.len` is aliased so the aggregated column is named "count".
  group_size = Polars.len.alias("count")
  agg(group_size)
end
|
485
|
+
|
486
|
+
# Reduce the groups to the mean values.
|
487
|
+
#
|
488
|
+
# @return [DataFrame]
|
489
|
+
#
|
490
|
+
# @example
|
491
|
+
# df = Polars::DataFrame.new(
|
492
|
+
# {
|
493
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
494
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
495
|
+
# "c" => [true, true, true, false, false, true],
|
496
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
497
|
+
# }
|
498
|
+
# )
|
499
|
+
# df.group_by("d", maintain_order: true).mean
|
500
|
+
# # =>
|
501
|
+
# # shape: (3, 4)
|
502
|
+
# # ┌────────┬─────┬──────────┬──────────┐
|
503
|
+
# # │ d ┆ a ┆ b ┆ c │
|
504
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
505
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 │
|
506
|
+
# # ╞════════╪═════╪══════════╪══════════╡
|
507
|
+
# # │ Apple ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
|
508
|
+
# # │ Orange ┆ 2.0 ┆ 0.5 ┆ 1.0 │
|
509
|
+
# # │ Banana ┆ 4.5 ┆ 13.5 ┆ 0.5 │
|
510
|
+
# # └────────┴─────┴──────────┴──────────┘
|
511
|
+
def mean
  # Apply `mean` to `Polars.all` and hand the expression to #agg.
  averages = Polars.all.mean
  agg(averages)
end
|
514
|
+
|
515
|
+
# Count the unique values per group.
|
516
|
+
#
|
517
|
+
# @return [DataFrame]
|
518
|
+
#
|
519
|
+
# @example
|
520
|
+
# df = Polars::DataFrame.new(
|
521
|
+
# {
|
522
|
+
# "a" => [1, 2, 1, 3, 4, 5],
|
523
|
+
# "b" => [0.5, 0.5, 0.5, 10, 13, 14],
|
524
|
+
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
525
|
+
# }
|
526
|
+
# )
|
527
|
+
# df.group_by("d", maintain_order: true).n_unique
|
528
|
+
# # =>
|
529
|
+
# # shape: (2, 3)
|
530
|
+
# # ┌────────┬─────┬─────┐
|
531
|
+
# # │ d ┆ a ┆ b │
|
532
|
+
# # │ --- ┆ --- ┆ --- │
|
533
|
+
# # │ str ┆ u32 ┆ u32 │
|
534
|
+
# # ╞════════╪═════╪═════╡
|
535
|
+
# # │ Apple ┆ 2 ┆ 2 │
|
536
|
+
# # │ Banana ┆ 3 ┆ 3 │
|
537
|
+
# # └────────┴─────┴─────┘
|
538
|
+
def n_unique
  # Apply `n_unique` to `Polars.all` and hand the expression to #agg.
  distinct_counts = Polars.all.n_unique
  agg(distinct_counts)
end
|
541
|
+
|
542
|
+
# Compute the quantile per group.
|
543
|
+
#
|
544
|
+
# @param quantile [Float]
|
545
|
+
# Quantile between 0.0 and 1.0.
|
546
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
547
|
+
# Interpolation method.
|
548
|
+
#
|
549
|
+
# @return [DataFrame]
|
550
|
+
#
|
551
|
+
# @example
|
552
|
+
# df = Polars::DataFrame.new(
|
553
|
+
# {
|
554
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
555
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
556
|
+
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
557
|
+
# }
|
558
|
+
# )
|
559
|
+
# df.group_by("d", maintain_order: true).quantile(1)
|
560
|
+
# # =>
|
561
|
+
# # shape: (3, 3)
|
562
|
+
# # ┌────────┬─────┬──────┐
|
563
|
+
# # │ d ┆ a ┆ b │
|
564
|
+
# # │ --- ┆ --- ┆ --- │
|
565
|
+
# # │ str ┆ f64 ┆ f64 │
|
566
|
+
# # ╞════════╪═════╪══════╡
|
567
|
+
# # │ Apple ┆ 3.0 ┆ 10.0 │
|
568
|
+
# # │ Orange ┆ 2.0 ┆ 0.5 │
|
569
|
+
# # │ Banana ┆ 5.0 ┆ 14.0 │
|
570
|
+
# # └────────┴─────┴──────┘
|
571
|
+
def quantile(quantile, interpolation: "nearest")
  # Forward both arguments unchanged to the expression-level `quantile`,
  # then aggregate with the resulting expression.
  quantile_expr = Polars.all.quantile(quantile, interpolation: interpolation)
  agg(quantile_expr)
end
|
574
|
+
|
575
|
+
# Return the median per group.
|
576
|
+
#
|
577
|
+
# @return [DataFrame]
|
578
|
+
#
|
579
|
+
# @example
|
580
|
+
# df = Polars::DataFrame.new(
|
581
|
+
# {
|
582
|
+
# "a" => [1, 2, 2, 3, 4, 5],
|
583
|
+
# "b" => [0.5, 0.5, 4, 10, 13, 14],
|
584
|
+
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
585
|
+
# }
|
586
|
+
# )
|
587
|
+
# df.group_by("d", maintain_order: true).median
|
588
|
+
# # =>
|
589
|
+
# # shape: (2, 3)
|
590
|
+
# # ┌────────┬─────┬──────┐
|
591
|
+
# # │ d ┆ a ┆ b │
|
592
|
+
# # │ --- ┆ --- ┆ --- │
|
593
|
+
# # │ str ┆ f64 ┆ f64 │
|
594
|
+
# # ╞════════╪═════╪══════╡
|
595
|
+
# # │ Apple ┆ 2.0 ┆ 4.0 │
|
596
|
+
# # │ Banana ┆ 4.0 ┆ 13.0 │
|
597
|
+
# # └────────┴─────┴──────┘
|
598
|
+
def median
  # Apply `median` to `Polars.all` and hand the expression to #agg.
  medians = Polars.all.median
  agg(medians)
end
|
601
|
+
|
602
|
+
# Plot data.
|
603
|
+
#
|
604
|
+
# @return [Vega::LiteChart]
|
605
|
+
def plot(*args, **options)
  # Only a single grouping key can be forwarded as the plot's `group`.
  multiple_groups = @by.is_a?(::Array) && @by.size > 1
  if multiple_groups
    raise ArgumentError, "Multiple groups not supported"
  end

  # `group` is supplied from @by below, so reject it as a caller option
  # using the same message Ruby gives for an unknown keyword.
  if options.key?(:group)
    raise ArgumentError, "unknown keyword: :group"
  end

  @df.plot(*args, **options, group: @by)
end
|
612
|
+
end
|
613
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Polars
|
2
|
+
module IO
|
3
|
+
# Read into a DataFrame from Apache Avro format.
|
4
|
+
#
|
5
|
+
# @param source [Object]
|
6
|
+
# Path to a file or a file-like object.
|
7
|
+
# @param columns [Object]
|
8
|
+
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
9
|
+
# of column names.
|
10
|
+
# @param n_rows [Integer]
|
11
|
+
# Stop reading from Apache Avro file after reading `n_rows`.
|
12
|
+
#
|
13
|
+
# @return [DataFrame]
|
14
|
+
def read_avro(source, columns: nil, n_rows: nil)
  # Path-like sources are normalized first; any other source is passed
  # through unchanged.
  source = Utils.normalize_filepath(source) if Utils.pathlike?(source)

  # The helper returns (projection, column_names), but the native reader
  # takes column_names before projection.
  projection, column_names = Utils.handle_projection_columns(columns)

  Utils.wrap_df(RbDataFrame.read_avro(source, column_names, projection, n_rows))
end
|
23
|
+
end
|
24
|
+
end
|