polars-runtime-compat 1.34.0b2__cp39-abi3-manylinux_2_24_aarch64.whl → 1.34.0b4__cp39-abi3-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of polars-runtime-compat may be problematic — see the registry's advisory page for more details.
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
- polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -96
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/dataframe/group_by.py
DELETED
|
@@ -1,1067 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING, Any, Callable
|
|
4
|
-
|
|
5
|
-
from polars import functions as F
|
|
6
|
-
from polars._utils.convert import parse_as_duration_string
|
|
7
|
-
from polars._utils.deprecation import deprecated
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
import sys
|
|
11
|
-
from collections.abc import Iterable
|
|
12
|
-
from datetime import timedelta
|
|
13
|
-
|
|
14
|
-
from polars import DataFrame
|
|
15
|
-
from polars._typing import (
|
|
16
|
-
ClosedInterval,
|
|
17
|
-
IntoExpr,
|
|
18
|
-
Label,
|
|
19
|
-
QuantileMethod,
|
|
20
|
-
SchemaDict,
|
|
21
|
-
StartBy,
|
|
22
|
-
)
|
|
23
|
-
|
|
24
|
-
if sys.version_info >= (3, 11):
|
|
25
|
-
from typing import Self
|
|
26
|
-
else:
|
|
27
|
-
from typing_extensions import Self
|
|
28
|
-
|
|
29
|
-
if sys.version_info >= (3, 13):
|
|
30
|
-
from warnings import deprecated
|
|
31
|
-
else:
|
|
32
|
-
from typing_extensions import deprecated # noqa: TC004
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class GroupBy:
|
|
36
|
-
"""Starts a new GroupBy operation."""
|
|
37
|
-
|
|
38
|
-
def __init__(
|
|
39
|
-
self,
|
|
40
|
-
df: DataFrame,
|
|
41
|
-
*by: IntoExpr | Iterable[IntoExpr],
|
|
42
|
-
maintain_order: bool,
|
|
43
|
-
**named_by: IntoExpr,
|
|
44
|
-
) -> None:
|
|
45
|
-
"""
|
|
46
|
-
Utility class for performing a group by operation over the given DataFrame.
|
|
47
|
-
|
|
48
|
-
Generated by calling `df.group_by(...)`.
|
|
49
|
-
|
|
50
|
-
Parameters
|
|
51
|
-
----------
|
|
52
|
-
df
|
|
53
|
-
DataFrame to perform the group by operation over.
|
|
54
|
-
*by
|
|
55
|
-
Column or columns to group by. Accepts expression input. Strings are parsed
|
|
56
|
-
as column names.
|
|
57
|
-
maintain_order
|
|
58
|
-
Ensure that the order of the groups is consistent with the input data.
|
|
59
|
-
This is slower than a default group by.
|
|
60
|
-
**named_by
|
|
61
|
-
Additional column(s) to group by, specified as keyword arguments.
|
|
62
|
-
The columns will be named as the keyword used.
|
|
63
|
-
"""
|
|
64
|
-
self.df = df
|
|
65
|
-
self.by = by
|
|
66
|
-
self.named_by = named_by
|
|
67
|
-
self.maintain_order = maintain_order
|
|
68
|
-
|
|
69
|
-
def __iter__(self) -> Self:
    """
    Allows iteration over the groups of the group by operation.

    Each group is represented by a `(name, data)` tuple, where `name` is a
    tuple of the distinct group values that identify the group and `data`
    is the matching sub-DataFrame.
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    # Every group gather can trigger a rechunk, so do it once up front.
    self.df = self.df.rechunk()

    indices_col = "__POLARS_GB_GROUP_INDICES"
    groups_df = (
        self.df.lazy()
        .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
        .agg(F.first().agg_groups().alias(indices_col))
        .collect(optimizations=QueryOptFlags.none())
    )

    # Iterator state consumed by __next__.
    self._current_index = 0
    self._group_indices = groups_df.select(indices_col).to_series()
    self._group_names = groups_df.select(F.all().exclude(indices_col)).iter_rows()

    return self
def __next__(self) -> tuple[tuple[Any, ...], DataFrame]:
    """Return the next `(name, data)` group pair, or raise StopIteration."""
    idx = self._current_index
    if idx >= len(self._group_indices):
        raise StopIteration

    name = next(self._group_names)
    data = self.df[self._group_indices[idx], :]
    self._current_index = idx + 1

    return name, data
def agg(
    self,
    *aggs: IntoExpr | Iterable[IntoExpr],
    **named_aggs: IntoExpr,
) -> DataFrame:
    """
    Compute aggregations for each group of a group by operation.

    Parameters
    ----------
    *aggs
        Aggregations to compute for each group, specified as positional
        arguments. Accepts expression input; strings are parsed as column
        names.
    **named_aggs
        Additional aggregations, specified as keyword arguments. The
        resulting columns are renamed to the keyword used.

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["a", "b", "a"], "b": [1, 2, 3]})
    >>> df.group_by("a").agg(pl.col("b").sum())  # doctest: +IGNORE_RESULT
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    grouped = self.df.lazy().group_by(
        *self.by, **self.named_by, maintain_order=self.maintain_order
    )
    return grouped.agg(*aggs, **named_aggs).collect(
        optimizations=QueryOptFlags.none()
    )
def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame:
    """
    Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.

    .. warning::
        This method is much slower than the native expressions API:
        the UDF runs in Python, forces materialization in memory, and
        cannot be parallelised or logically optimised. Only use it if the
        logic cannot be implemented otherwise.

    Parameters
    ----------
    function
        Custom function that receives a DataFrame and returns a DataFrame.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If grouping by named expressions or by non-string expressions.
    """
    if self.named_by:
        msg = "cannot call `map_groups` when grouping by named expressions"
        raise TypeError(msg)
    if any(not isinstance(key, str) for key in self.by):
        msg = "cannot call `map_groups` when grouping by an expression"
        raise TypeError(msg)

    # Safe after the check above: every grouping key is a plain string.
    column_names: list[str] = self.by  # type: ignore[assignment]

    result = self.df._df.group_by_map_groups(
        column_names, function, self.maintain_order
    )
    return self.df.__class__._from_pydf(result)
def head(self, n: int = 5) -> DataFrame:
    """
    Get the first `n` rows of each group.

    Parameters
    ----------
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"letters": ["c", "c", "a"], "nrs": [1, 2, 3]})
    >>> df.group_by("letters").head(2)  # doctest: +IGNORE_RESULT
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    query = self.df.lazy().group_by(
        *self.by, **self.named_by, maintain_order=self.maintain_order
    )
    # NOTE(review): this collects with `_eager()` while agg()/tail() in this
    # file use `none()` — confirm the difference is intentional.
    return query.head(n).collect(optimizations=QueryOptFlags._eager())
def tail(self, n: int = 5) -> DataFrame:
    """
    Get the last `n` rows of each group.

    Parameters
    ----------
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame({"letters": ["c", "c", "a"], "nrs": [1, 2, 3]})
    >>> df.group_by("letters").tail(2)  # doctest: +IGNORE_RESULT
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    query = self.df.lazy().group_by(
        *self.by, **self.named_by, maintain_order=self.maintain_order
    )
    return query.tail(n).collect(optimizations=QueryOptFlags.none())
def all(self) -> DataFrame:
    """
    Aggregate the groups into Series.

    Each non-key column becomes a list column holding every value of the
    group, in order.
    """
    everything = F.all()
    return self.agg(everything)
def len(self, name: str | None = None) -> DataFrame:
    """
    Return the number of rows in each group.

    Parameters
    ----------
    name
        Assign a name to the resulting column; if unset, defaults to "len".
    """
    expr = F.len() if name is None else F.len().alias(name)
    return self.agg(expr)
@deprecated("`GroupBy.count` was renamed; use `GroupBy.len` instead")
def count(self) -> DataFrame:
    """
    Return the number of rows in each group.

    .. deprecated:: 0.20.5
        This method has been renamed to :func:`GroupBy.len`.

    Rows containing null values count towards the total.
    """
    return self.agg(F.len().alias("count"))
def first(self) -> DataFrame:
    """
    Aggregate the first values in the group.

    Each non-key column is reduced to the first value observed per group.
    """
    return self.agg(F.all().first())
def last(self) -> DataFrame:
    """
    Aggregate the last values in the group.

    Each non-key column is reduced to the last value observed per group.
    """
    return self.agg(F.all().last())
def max(self) -> DataFrame:
    """
    Reduce the groups to the maximal value.

    Each non-key column is reduced to its per-group maximum.
    """
    return self.agg(F.all().max())
def mean(self) -> DataFrame:
    """
    Reduce the groups to the mean values.

    Each non-key column is reduced to its per-group mean.
    """
    return self.agg(F.all().mean())
def median(self) -> DataFrame:
    """
    Return the median per group.

    Each non-key column is reduced to its per-group median.
    """
    return self.agg(F.all().median())
def min(self) -> DataFrame:
    """
    Reduce the groups to the minimal value.

    Each non-key column is reduced to its per-group minimum.
    """
    return self.agg(F.all().min())
def n_unique(self) -> DataFrame:
    """
    Count the number of unique values in each non-key column per group.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 1, 3, 4, 5],
    ...         "b": [0.5, 0.5, 0.5, 10, 13, 14],
    ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... )
    >>> df.group_by("d", maintain_order=True).n_unique()
    shape: (2, 3)
    ┌────────┬─────┬─────┐
    │ d      ┆ a   ┆ b   │
    │ ---    ┆ --- ┆ --- │
    │ str    ┆ u32 ┆ u32 │
    ╞════════╪═════╪═════╡
    │ Apple  ┆ 2   ┆ 2   │
    │ Banana ┆ 3   ┆ 3   │
    └────────┴─────┴─────┘
    """
    # Delegate to the generic aggregation path with n_unique over all columns.
    n_unique_of_all = F.all().n_unique()
    return self.agg(n_unique_of_all)
|
|
706
|
-
|
|
707
|
-
def quantile(
    self, quantile: float, interpolation: QuantileMethod = "nearest"
) -> DataFrame:
    """
    Compute the given quantile of each non-key column per group.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
        Interpolation method.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... )
    >>> df.group_by("d", maintain_order=True).quantile(1)
    shape: (3, 3)
    ┌────────┬─────┬──────┐
    │ d      ┆ a   ┆ b    │
    │ ---    ┆ --- ┆ ---  │
    │ str    ┆ f64 ┆ f64  │
    ╞════════╪═════╪══════╡
    │ Apple  ┆ 3.0 ┆ 10.0 │
    │ Orange ┆ 2.0 ┆ 0.5  │
    │ Banana ┆ 5.0 ┆ 14.0 │
    └────────┴─────┴──────┘
    """  # noqa: W505
    # Delegate to the generic aggregation path with a quantile over all columns.
    quantile_of_all = F.all().quantile(quantile, interpolation=interpolation)
    return self.agg(quantile_of_all)
|
|
742
|
-
|
|
743
|
-
def sum(self) -> DataFrame:
    """
    Reduce each group to the sum of every non-key column.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... )
    >>> df.group_by("d", maintain_order=True).sum()
    shape: (3, 4)
    ┌────────┬─────┬──────┬─────┐
    │ d      ┆ a   ┆ b    ┆ c   │
    │ ---    ┆ --- ┆ ---  ┆ --- │
    │ str    ┆ i64 ┆ f64  ┆ u32 │
    ╞════════╪═════╪══════╪═════╡
    │ Apple  ┆ 6   ┆ 14.5 ┆ 2   │
    │ Orange ┆ 2   ┆ 0.5  ┆ 1   │
    │ Banana ┆ 9   ┆ 27.0 ┆ 1   │
    └────────┴─────┴──────┴─────┘
    """
    # Delegate to the generic aggregation path with a sum over all columns.
    sum_of_all = F.all().sum()
    return self.agg(sum_of_all)
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
class RollingGroupBy:
    """
    A rolling grouper.

    This has an `.agg` method which will allow you to run all polars expressions in a
    group by context.
    """

    def __init__(
        self,
        df: DataFrame,
        index_column: IntoExpr,
        *,
        period: str | timedelta,
        offset: str | timedelta | None,
        closed: ClosedInterval,
        group_by: IntoExpr | Iterable[IntoExpr] | None,
    ) -> None:
        self.df = df
        self.time_column = index_column
        # Normalize timedelta/str inputs to polars duration strings up front.
        self.period = parse_as_duration_string(period)
        self.offset = parse_as_duration_string(offset)
        self.closed = closed
        self.group_by = group_by

    def _lazy_rolling(self):
        # Build the lazy rolling context shared by __iter__, agg and map_groups.
        return self.df.lazy().rolling(
            index_column=self.time_column,
            period=self.period,
            offset=self.offset,
            closed=self.closed,
            group_by=self.group_by,
        )

    def __iter__(self) -> Self:
        from polars.lazyframe.opt_flags import QueryOptFlags

        temp_col = "__POLARS_GB_GROUP_INDICES"
        # Materialize, per window, the row indices belonging to that window.
        groups_df = (
            self._lazy_rolling()
            .agg(F.first().agg_groups().alias(temp_col))
            .collect(optimizations=QueryOptFlags.none())
        )

        # Iterator state consumed by __next__.
        self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
        self._group_indices = groups_df.select(temp_col).to_series()
        self._current_index = 0
        return self

    def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
        if self._current_index >= len(self._group_indices):
            raise StopIteration

        name = next(self._group_names)
        # Gather the rows of the current window from the source frame.
        data = self.df[self._group_indices[self._current_index], :]
        self._current_index += 1
        return name, data

    def agg(
        self,
        *aggs: IntoExpr | Iterable[IntoExpr],
        **named_aggs: IntoExpr,
    ) -> DataFrame:
        """
        Compute aggregations for each group of a group by operation.

        Parameters
        ----------
        *aggs
            Aggregations to compute for each group of the group by operation,
            specified as positional arguments.
            Accepts expression input. Strings are parsed as column names.
        **named_aggs
            Additional aggregations, specified as keyword arguments.
            The resulting columns will be renamed to the keyword used.
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        lazy_result = self._lazy_rolling().agg(*aggs, **named_aggs)
        return lazy_result.collect(optimizations=QueryOptFlags.none())

    def map_groups(
        self,
        function: Callable[[DataFrame], DataFrame],
        schema: SchemaDict | None,
    ) -> DataFrame:
        """
        Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

        Using this is considered an anti-pattern as it will be very slow because:

        - it forces the engine to materialize the whole `DataFrames` for the groups.
        - it is not parallelized.
        - it blocks optimizations as the passed python function is opaque to the
          optimizer.

        The idiomatic way to apply custom functions over multiple columns is using:

        `pl.struct([my_columns]).map_elements(lambda struct_series: ..)`

        Parameters
        ----------
        function
            Function to apply over each group of the `LazyFrame`; it receives
            a DataFrame and should return a DataFrame.
        schema
            Schema of the output function. This has to be known statically. If the
            given schema is incorrect, this is a bug in the caller's query and may
            lead to errors. If set to None, polars assumes the schema is unchanged.
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        lazy_result = self._lazy_rolling().map_groups(function, schema)
        return lazy_result.collect(optimizations=QueryOptFlags.none())
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
class DynamicGroupBy:
|
|
912
|
-
"""
|
|
913
|
-
A dynamic grouper.
|
|
914
|
-
|
|
915
|
-
This has an `.agg` method which allows you to run all polars expressions in a
|
|
916
|
-
group by context.
|
|
917
|
-
"""
|
|
918
|
-
|
|
919
|
-
def __init__(
    self,
    df: DataFrame,
    index_column: IntoExpr,
    *,
    every: str | timedelta,
    period: str | timedelta | None,
    offset: str | timedelta | None,
    include_boundaries: bool,
    closed: ClosedInterval,
    label: Label,
    group_by: IntoExpr | Iterable[IntoExpr] | None,
    start_by: StartBy,
) -> None:
    self.df = df
    self.time_column = index_column
    # Normalize timedelta/str inputs to polars duration strings up front.
    self.every = parse_as_duration_string(every)
    self.period = parse_as_duration_string(period)
    self.offset = parse_as_duration_string(offset)
    self.label = label
    self.include_boundaries = include_boundaries
    self.closed = closed
    self.group_by = group_by
    self.start_by = start_by
|
|
947
|
-
|
|
948
|
-
def __iter__(self) -> Self:
    from polars.lazyframe.opt_flags import QueryOptFlags

    temp_col = "__POLARS_GB_GROUP_INDICES"
    # Materialize, per dynamic window, the row indices belonging to that window.
    lazy_groups = self.df.lazy().group_by_dynamic(
        index_column=self.time_column,
        every=self.every,
        period=self.period,
        offset=self.offset,
        label=self.label,
        include_boundaries=self.include_boundaries,
        closed=self.closed,
        group_by=self.group_by,
        start_by=self.start_by,
    )
    groups_df = lazy_groups.agg(F.first().agg_groups().alias(temp_col)).collect(
        optimizations=QueryOptFlags.none()
    )

    # Iterator state consumed by __next__.
    self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
    self._group_indices = groups_df.select(temp_col).to_series()
    self._current_index = 0
    return self
|
|
974
|
-
|
|
975
|
-
def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
    # Exhausted all windows computed by __iter__.
    if self._current_index >= len(self._group_indices):
        raise StopIteration

    name = next(self._group_names)
    # Gather the rows of the current window from the source frame.
    data = self.df[self._group_indices[self._current_index], :]
    self._current_index += 1
    return name, data
|
|
984
|
-
|
|
985
|
-
def agg(
    self,
    *aggs: IntoExpr | Iterable[IntoExpr],
    **named_aggs: IntoExpr,
) -> DataFrame:
    """
    Compute aggregations for each group of a group by operation.

    Parameters
    ----------
    *aggs
        Aggregations to compute for each group of the group by operation,
        specified as positional arguments.
        Accepts expression input. Strings are parsed as column names.
    **named_aggs
        Additional aggregations, specified as keyword arguments.
        The resulting columns will be renamed to the keyword used.
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    # Rebuild the dynamic group-by lazily, aggregate, then collect eagerly
    # with all optimizations disabled.
    lazy_groups = self.df.lazy().group_by_dynamic(
        index_column=self.time_column,
        every=self.every,
        period=self.period,
        offset=self.offset,
        label=self.label,
        include_boundaries=self.include_boundaries,
        closed=self.closed,
        group_by=self.group_by,
        start_by=self.start_by,
    )
    lazy_result = lazy_groups.agg(*aggs, **named_aggs)
    return lazy_result.collect(optimizations=QueryOptFlags.none())
|
|
1021
|
-
|
|
1022
|
-
def map_groups(
    self,
    function: Callable[[DataFrame], DataFrame],
    schema: SchemaDict | None,
) -> DataFrame:
    """
    Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

    Using this is considered an anti-pattern as it will be very slow because:

    - it forces the engine to materialize the whole `DataFrames` for the groups.
    - it is not parallelized.
    - it blocks optimizations as the passed python function is opaque to the
      optimizer.

    The idiomatic way to apply custom functions over multiple columns is using:

    `pl.struct([my_columns]).map_elements(lambda struct_series: ..)`

    Parameters
    ----------
    function
        Function to apply over each group of the `LazyFrame`; it receives
        a DataFrame and should return a DataFrame.
    schema
        Schema of the output function. This has to be known statically. If the
        given schema is incorrect, this is a bug in the caller's query and may
        lead to errors. If set to None, polars assumes the schema is unchanged.
    """
    from polars.lazyframe.opt_flags import QueryOptFlags

    return (
        self.df.lazy()
        .group_by_dynamic(
            index_column=self.time_column,
            every=self.every,
            period=self.period,
            offset=self.offset,
            # Fix: forward the configured label, matching `agg` and `__iter__`;
            # previously it was omitted here so a user-set `label` was ignored.
            label=self.label,
            include_boundaries=self.include_boundaries,
            closed=self.closed,
            group_by=self.group_by,
            start_by=self.start_by,
        )
        .map_groups(function, schema)
        .collect(optimizations=QueryOptFlags.none())
    )